Import LLVM, at r72732.

author: ed <ed@FreeBSD.org> 2009-06-02 17:52:33 +0000
committer: ed <ed@FreeBSD.org> 2009-06-02 17:52:33 +0000
commit: 3277b69d734b9c90b44ebde4ede005717e2c3b2e (patch)
tree: 64ba909838c23261cace781ece27d106134ea451 /lib/Transforms
download: FreeBSD-src-3277b69d734b9c90b44ebde4ede005717e2c3b2e.zip
FreeBSD-src-3277b69d734b9c90b44ebde4ede005717e2c3b2e.tar.gz
98 files changed, 60463 insertions, 0 deletions
diff --git a/lib/Transforms/Hello/CMakeLists.txt b/lib/Transforms/Hello/CMakeLists.txt
new file mode 100644
index 0000000..b80d15b
--- /dev/null
+++ b/lib/Transforms/Hello/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_llvm_library( LLVMHello
+  Hello.cpp
+  )
diff --git a/lib/Transforms/Hello/Hello.cpp b/lib/Transforms/Hello/Hello.cpp
new file mode 100644
index 0000000..d07f613
--- /dev/null
+++ b/lib/Transforms/Hello/Hello.cpp
@@ -0,0 +1,67 @@
+//===- Hello.cpp - Example code from "Writing an LLVM Pass" ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements two versions of the LLVM "Hello World" pass described
+// in docs/WritingAnLLVMPass.html
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hello"
+#include "llvm/Pass.h"
+#include "llvm/Function.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(HelloCounter, "Counts number of functions greeted");
+
+namespace {
+  // Hello - The first implementation, without getAnalysisUsage.
+  struct Hello : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    Hello() : FunctionPass(&ID) {}
+
+    virtual bool runOnFunction(Function &F) {
+      HelloCounter++;
+      std::string fname = F.getName();
+      EscapeString(fname);
+      cerr << "Hello: " << fname << "\n";
+      return false;
+    }
+  };
+}
+
+char Hello::ID = 0;
+static RegisterPass<Hello> X("hello", "Hello World Pass");
+
+namespace {
+  // Hello2 - The second implementation with getAnalysisUsage implemented.
+  struct Hello2 : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    Hello2() : FunctionPass(&ID) {}
+
+    virtual bool runOnFunction(Function &F) {
+      HelloCounter++;
+      std::string fname = F.getName();
+      EscapeString(fname);
+      cerr << "Hello: " << fname << "\n";
+      return false;
+    }
+
+    // We don't modify the program, so we preserve all analyses
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+    };
+  };
+}
+
+char Hello2::ID = 0;
+static RegisterPass<Hello2>
+Y("hello2", "Hello World Pass (with getAnalysisUsage implemented)");
diff --git a/lib/Transforms/Hello/Makefile b/lib/Transforms/Hello/Makefile
new file mode 100644
index 0000000..c5e75d4
--- /dev/null
+++ b/lib/Transforms/Hello/Makefile
@@ -0,0 +1,16 @@
+##===- lib/Transforms/Hello/Makefile -----------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMHello
+LOADABLE_MODULE = 1
+USEDLIBS =
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp
new file mode 100644
index 0000000..2bb6428
--- /dev/null
+++ b/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -0,0 +1,863 @@
+//===-- ArgumentPromotion.cpp - Promote by-reference arguments ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass promotes "by reference" arguments to be "by value" arguments.  In
+// practice, this means looking for internal functions that have pointer
+// arguments.  If it can prove, through the use of alias analysis, that an
+// argument is *only* loaded, then it can pass the value into the function
+// instead of the address of the value.  This can cause recursive simplification
+// of code and lead to the elimination of allocas (especially in C++ template
+// code like the STL).
+//
+// This pass also handles aggregate arguments that are passed into a function,
+// scalarizing them if the elements of the aggregate are only loaded.  Note that
+// by default it refuses to scalarize aggregates which would require passing in
+// more than three operands to the function, because passing thousands of
+// operands for a large array or structure is unprofitable! This limit can be
+// configured or disabled, however.
+//
+// Note that this transformation could also be done for arguments that are only
+// stored to (returning the value instead), but does not currently.  This case
+// would be best handled when and if LLVM begins supporting multiple return
+// values from functions.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "argpromotion"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/CallGraphSCCPass.h"
+#include "llvm/Instructions.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Compiler.h"
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumArgumentsPromoted , "Number of pointer arguments promoted");
+STATISTIC(NumAggregatesPromoted, "Number of aggregate arguments promoted");
+STATISTIC(NumByValArgsPromoted , "Number of byval arguments promoted");
+STATISTIC(NumArgumentsDead     , "Number of dead pointer args eliminated");
+
+namespace {
+  /// ArgPromotion - The 'by reference' to 'by value' argument promotion pass.
+  ///
+  struct VISIBILITY_HIDDEN ArgPromotion : public CallGraphSCCPass {
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<AliasAnalysis>();
+      AU.addRequired<TargetData>();
+      CallGraphSCCPass::getAnalysisUsage(AU);
+    }
+
+    virtual bool runOnSCC(const std::vector<CallGraphNode *> &SCC);
+    static char ID; // Pass identification, replacement for typeid
+    explicit ArgPromotion(unsigned maxElements = 3)
+      : CallGraphSCCPass(&ID), maxElements(maxElements) {}
+
+    /// A vector used to hold the indices of a single GEP instruction
+    typedef std::vector<uint64_t> IndicesVector;
+
+  private:
+    bool PromoteArguments(CallGraphNode *CGN);
+    bool isSafeToPromoteArgument(Argument *Arg, bool isByVal) const;
+    Function *DoPromotion(Function *F,
+                          SmallPtrSet<Argument*, 8> &ArgsToPromote,
+                          SmallPtrSet<Argument*, 8> &ByValArgsToTransform);
+    /// The maximum number of elements to expand, or 0 for unlimited.
+    unsigned maxElements;
+  };
+}
+
+char ArgPromotion::ID = 0;
+static RegisterPass<ArgPromotion>
+X("argpromotion", "Promote 'by reference' arguments to scalars");
+
+Pass *llvm::createArgumentPromotionPass(unsigned maxElements) {
+  return new ArgPromotion(maxElements);
+}
+
+bool ArgPromotion::runOnSCC(const std::vector<CallGraphNode *> &SCC) {
+  bool Changed = false, LocalChange;
+
+  do {  // Iterate until we stop promoting from this SCC.
+    LocalChange = false;
+    // Attempt to promote arguments from all functions in this SCC.
+    for (unsigned i = 0, e = SCC.size(); i != e; ++i)
+      LocalChange |= PromoteArguments(SCC[i]);
+    Changed |= LocalChange;               // Remember that we changed something.
+  } while (LocalChange);
+
+  return Changed;
+}
+
+/// PromoteArguments - This method checks the specified function to see if there
+/// are any promotable arguments and if it is safe to promote the function (for
+/// example, all callers are direct).  If safe to promote some arguments, it
+/// calls the DoPromotion method.
+///
+bool ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
+  Function *F = CGN->getFunction();
+
+  // Make sure that it is local to this module.
+  if (!F || !F->hasLocalLinkage()) return false;
+
+  // First check: see if there are any pointer arguments!  If not, quick exit.
+  SmallVector<std::pair<Argument*, unsigned>, 16> PointerArgs;
+  unsigned ArgNo = 0;
+  for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
+       I != E; ++I, ++ArgNo)
+    if (isa<PointerType>(I->getType()))
+      PointerArgs.push_back(std::pair<Argument*, unsigned>(I, ArgNo));
+  if (PointerArgs.empty()) return false;
+
+  // Second check: make sure that all callers are direct callers.  We can't
+  // transform functions that have indirect callers.
+  for (Value::use_iterator UI = F->use_begin(), E = F->use_end();
+       UI != E; ++UI) {
+    CallSite CS = CallSite::get(*UI);
+    if (!CS.getInstruction())       // "Taking the address" of the function
+      return false;
+
+    // Ensure that this call site is CALLING the function, not passing it as
+    // an argument.
+    if (!CS.isCallee(UI))
+      return false;
+  }
+
+  // Check to see which arguments are promotable.  If an argument is promotable,
+  // add it to ArgsToPromote.
+  SmallPtrSet<Argument*, 8> ArgsToPromote;
+  SmallPtrSet<Argument*, 8> ByValArgsToTransform;
+  for (unsigned i = 0; i != PointerArgs.size(); ++i) {
+    bool isByVal = F->paramHasAttr(PointerArgs[i].second+1, Attribute::ByVal);
+
+    // If this is a byval argument, and if the aggregate type is small, just
+    // pass the elements, which is always safe.
+    Argument *PtrArg = PointerArgs[i].first;
+    if (isByVal) {
+      const Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType();
+      if (const StructType *STy = dyn_cast<StructType>(AgTy)) {
+        if (maxElements > 0 && STy->getNumElements() > maxElements) {
+          DOUT << "argpromotion disable promoting argument '"
+               << PtrArg->getName() << "' because it would require adding more "
+               << "than " << maxElements << " arguments to the function.\n";
+        } else {
+          // If all the elements are single-value types, we can promote it.
+          bool AllSimple = true;
+          for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+            if (!STy->getElementType(i)->isSingleValueType()) {
+              AllSimple = false;
+              break;
+            }
+
+          // Safe to transform, don't even bother trying to "promote" it.
+          // Passing the elements as a scalar will allow scalarrepl to hack on
+          // the new alloca we introduce.
+          if (AllSimple) {
+            ByValArgsToTransform.insert(PtrArg);
+            continue;
+          }
+        }
+      }
+    }
+
+    // Otherwise, see if we can promote the pointer to its value.
+    if (isSafeToPromoteArgument(PtrArg, isByVal))
+      ArgsToPromote.insert(PtrArg);
+  }
+
+  // No promotable pointer arguments.
+  if (ArgsToPromote.empty() && ByValArgsToTransform.empty()) return false;
+
+  Function *NewF = DoPromotion(F, ArgsToPromote, ByValArgsToTransform);
+
+  // Update the call graph to know that the function has been transformed.
+  getAnalysis<CallGraph>().changeFunction(F, NewF);
+  return true;
+}
+
+/// IsAlwaysValidPointer - Return true if the specified pointer is always legal
+/// to load.
+static bool IsAlwaysValidPointer(Value *V) {
+  if (isa<AllocaInst>(V) || isa<GlobalVariable>(V)) return true;
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(V))
+    return IsAlwaysValidPointer(GEP->getOperand(0));
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+    if (CE->getOpcode() == Instruction::GetElementPtr)
+      return IsAlwaysValidPointer(CE->getOperand(0));
+
+  return false;
+}
+
+/// AllCalleesPassInValidPointerForArgument - Return true if we can prove that
+/// all callees pass in a valid pointer for the specified function argument.
+static bool AllCalleesPassInValidPointerForArgument(Argument *Arg) {
+  Function *Callee = Arg->getParent();
+
+  unsigned ArgNo = std::distance(Callee->arg_begin(),
+                                 Function::arg_iterator(Arg));
+
+  // Look at all call sites of the function.  At this pointer we know we only
+  // have direct callees.
+  for (Value::use_iterator UI = Callee->use_begin(), E = Callee->use_end();
+       UI != E; ++UI) {
+    CallSite CS = CallSite::get(*UI);
+    assert(CS.getInstruction() && "Should only have direct calls!");
+
+    if (!IsAlwaysValidPointer(CS.getArgument(ArgNo)))
+      return false;
+  }
+  return true;
+}
+
+/// Returns true if Prefix is a prefix of longer. That means, Longer has a size
+/// that is greater than or equal to the size of prefix, and each of the
+/// elements in Prefix is the same as the corresponding elements in Longer.
+///
+/// This means it also returns true when Prefix and Longer are equal!
+static bool IsPrefix(const ArgPromotion::IndicesVector &Prefix,
+                     const ArgPromotion::IndicesVector &Longer) {
+  if (Prefix.size() > Longer.size())
+    return false;
+  for (unsigned i = 0, e = Prefix.size(); i != e; ++i)
+    if (Prefix[i] != Longer[i])
+      return false;
+  return true;
+}
+
+
+/// Checks if Indices, or a prefix of Indices, is in Set.
+static bool PrefixIn(const ArgPromotion::IndicesVector &Indices,
+                     std::set<ArgPromotion::IndicesVector> &Set) {
+    std::set<ArgPromotion::IndicesVector>::iterator Low;
+    Low = Set.upper_bound(Indices);
+    if (Low != Set.begin())
+      Low--;
+    // Low is now the last element smaller than or equal to Indices. This means
+    // it points to a prefix of Indices (possibly Indices itself), if such
+    // prefix exists.
+    //
+    // This load is safe if any prefix of its operands is safe to load.
+    return Low != Set.end() && IsPrefix(*Low, Indices);
+}
+
+/// Mark the given indices (ToMark) as safe in the the given set of indices
+/// (Safe). Marking safe usually means adding ToMark to Safe. However, if there
+/// is already a prefix of Indices in Safe, Indices are implicitely marked safe
+/// already. Furthermore, any indices that Indices is itself a prefix of, are
+/// removed from Safe (since they are implicitely safe because of Indices now).
+static void MarkIndicesSafe(const ArgPromotion::IndicesVector &ToMark,
+                            std::set<ArgPromotion::IndicesVector> &Safe) {
+  std::set<ArgPromotion::IndicesVector>::iterator Low;
+  Low = Safe.upper_bound(ToMark);
+  // Guard against the case where Safe is empty
+  if (Low != Safe.begin())
+    Low--;
+  // Low is now the last element smaller than or equal to Indices. This
+  // means it points to a prefix of Indices (possibly Indices itself), if
+  // such prefix exists.
+  if (Low != Safe.end()) {
+    if (IsPrefix(*Low, ToMark))
+      // If there is already a prefix of these indices (or exactly these
+      // indices) marked a safe, don't bother adding these indices
+      return;
+
+    // Increment Low, so we can use it as a "insert before" hint
+    ++Low;
+  }
+  // Insert
+  Low = Safe.insert(Low, ToMark);
+  ++Low;
+  // If there we're a prefix of longer index list(s), remove those
+  std::set<ArgPromotion::IndicesVector>::iterator End = Safe.end();
+  while (Low != End && IsPrefix(ToMark, *Low)) {
+    std::set<ArgPromotion::IndicesVector>::iterator Remove = Low;
+    ++Low;
+    Safe.erase(Remove);
+  }
+}
+
+/// isSafeToPromoteArgument - As you might guess from the name of this method,
+/// it checks to see if it is both safe and useful to promote the argument.
+/// This method limits promotion of aggregates to only promote up to three
+/// elements of the aggregate in order to avoid exploding the number of
+/// arguments passed in.
+bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg, bool isByVal) const {
+  typedef std::set<IndicesVector> GEPIndicesSet;
+
+  // Quick exit for unused arguments
+  if (Arg->use_empty())
+    return true;
+
+  // We can only promote this argument if all of the uses are loads, or are GEP
+  // instructions (with constant indices) that are subsequently loaded.
+  //
+  // Promoting the argument causes it to be loaded in the caller
+  // unconditionally. This is only safe if we can prove that either the load
+  // would have happened in the callee anyway (ie, there is a load in the entry
+  // block) or the pointer passed in at every call site is guaranteed to be
+  // valid.
+  // In the former case, invalid loads can happen, but would have happened
+  // anyway, in the latter case, invalid loads won't happen. This prevents us
+  // from introducing an invalid load that wouldn't have happened in the
+  // original code.
+  //
+  // This set will contain all sets of indices that are loaded in the entry
+  // block, and thus are safe to unconditionally load in the caller.
+  GEPIndicesSet SafeToUnconditionallyLoad;
+
+  // This set contains all the sets of indices that we are planning to promote.
+  // This makes it possible to limit the number of arguments added.
+  GEPIndicesSet ToPromote;
+
+  // If the pointer is always valid, any load with first index 0 is valid.
+  if(isByVal || AllCalleesPassInValidPointerForArgument(Arg))
+    SafeToUnconditionallyLoad.insert(IndicesVector(1, 0));
+
+  // First, iterate the entry block and mark loads of (geps of) arguments as
+  // safe.
+  BasicBlock *EntryBlock = Arg->getParent()->begin();
+  // Declare this here so we can reuse it
+  IndicesVector Indices;
+  for (BasicBlock::iterator I = EntryBlock->begin(), E = EntryBlock->end();
+       I != E; ++I)
+    if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+      Value *V = LI->getPointerOperand();
+      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(V)) {
+        V = GEP->getPointerOperand();
+        if (V == Arg) {
+          // This load actually loads (part of) Arg? Check the indices then.
+          Indices.reserve(GEP->getNumIndices());
+          for (User::op_iterator II = GEP->idx_begin(), IE = GEP->idx_end();
+               II != IE; ++II)
+            if (ConstantInt *CI = dyn_cast<ConstantInt>(*II))
+              Indices.push_back(CI->getSExtValue());
+            else
+              // We found a non-constant GEP index for this argument? Bail out
+              // right away, can't promote this argument at all.
+              return false;
+
+          // Indices checked out, mark them as safe
+          MarkIndicesSafe(Indices, SafeToUnconditionallyLoad);
+          Indices.clear();
+        }
+      } else if (V == Arg) {
+        // Direct loads are equivalent to a GEP with a single 0 index.
+        MarkIndicesSafe(IndicesVector(1, 0), SafeToUnconditionallyLoad);
+      }
+    }
+
+  // Now, iterate all uses of the argument to see if there are any uses that are
+  // not (GEP+)loads, or any (GEP+)loads that are not safe to promote.
+  SmallVector<LoadInst*, 16> Loads;
+  IndicesVector Operands;
+  for (Value::use_iterator UI = Arg->use_begin(), E = Arg->use_end();
+       UI != E; ++UI) {
+    Operands.clear();
+    if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) {
+      if (LI->isVolatile()) return false;  // Don't hack volatile loads
+      Loads.push_back(LI);
+      // Direct loads are equivalent to a GEP with a zero index and then a load.
+      Operands.push_back(0);
+    } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(*UI)) {
+      if (GEP->use_empty()) {
+        // Dead GEP's cause trouble later.  Just remove them if we run into
+        // them.
+        getAnalysis<AliasAnalysis>().deleteValue(GEP);
+        GEP->eraseFromParent();
+        // TODO: This runs the above loop over and over again for dead GEPS
+        // Couldn't we just do increment the UI iterator earlier and erase the
+        // use?
+        return isSafeToPromoteArgument(Arg, isByVal);
+      }
+
+      // Ensure that all of the indices are constants.
+      for (User::op_iterator i = GEP->idx_begin(), e = GEP->idx_end();
+        i != e; ++i)
+        if (ConstantInt *C = dyn_cast<ConstantInt>(*i))
+          Operands.push_back(C->getSExtValue());
+        else
+          return false;  // Not a constant operand GEP!
+
+      // Ensure that the only users of the GEP are load instructions.
+      for (Value::use_iterator UI = GEP->use_begin(), E = GEP->use_end();
+           UI != E; ++UI)
+        if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) {
+          if (LI->isVolatile()) return false;  // Don't hack volatile loads
+          Loads.push_back(LI);
+        } else {
+          // Other uses than load?
+          return false;
+        }
+    } else {
+      return false;  // Not a load or a GEP.
+    }
+
+    // Now, see if it is safe to promote this load / loads of this GEP. Loading
+    // is safe if Operands, or a prefix of Operands, is marked as safe.
+    if (!PrefixIn(Operands, SafeToUnconditionallyLoad))
+      return false;
+
+    // See if we are already promoting a load with these indices. If not, check
+    // to make sure that we aren't promoting too many elements.  If so, nothing
+    // to do.
+    if (ToPromote.find(Operands) == ToPromote.end()) {
+      if (maxElements > 0 && ToPromote.size() == maxElements) {
+        DOUT << "argpromotion not promoting argument '"
+             << Arg->getName() << "' because it would require adding more "
+             << "than " << maxElements << " arguments to the function.\n";
+        // We limit aggregate promotion to only promoting up to a fixed number
+        // of elements of the aggregate.
+        return false;
+      }
+      ToPromote.insert(Operands);
+    }
+  }
+
+  if (Loads.empty()) return true;  // No users, this is a dead argument.
+
+  // Okay, now we know that the argument is only used by load instructions and
+  // it is safe to unconditionally perform all of them. Use alias analysis to
+  // check to see if the pointer is guaranteed to not be modified from entry of
+  // the function to each of the load instructions.
+
+  // Because there could be several/many load instructions, remember which
+  // blocks we know to be transparent to the load.
+  SmallPtrSet<BasicBlock*, 16> TranspBlocks;
+
+  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+  TargetData &TD = getAnalysis<TargetData>();
+
+  for (unsigned i = 0, e = Loads.size(); i != e; ++i) {
+    // Check to see if the load is invalidated from the start of the block to
+    // the load itself.
+    LoadInst *Load = Loads[i];
+    BasicBlock *BB = Load->getParent();
+
+    const PointerType *LoadTy =
+      cast<PointerType>(Load->getPointerOperand()->getType());
+    unsigned LoadSize = (unsigned)TD.getTypeStoreSize(LoadTy->getElementType());
+
+    if (AA.canInstructionRangeModify(BB->front(), *Load, Arg, LoadSize))
+      return false;  // Pointer is invalidated!
+
+    // Now check every path from the entry block to the load for transparency.
+    // To do this, we perform a depth first search on the inverse CFG from the
+    // loading block.
+    for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+      for (idf_ext_iterator<BasicBlock*, SmallPtrSet<BasicBlock*, 16> >
+             I = idf_ext_begin(*PI, TranspBlocks),
+             E = idf_ext_end(*PI, TranspBlocks); I != E; ++I)
+        if (AA.canBasicBlockModify(**I, Arg, LoadSize))
+          return false;
+  }
+
+  // If the path from the entry of the function to each load is free of
+  // instructions that potentially invalidate the load, we can make the
+  // transformation!
+  return true;
+}
+
+/// DoPromotion - This method actually performs the promotion of the specified
+/// arguments, and returns the new function.  At this point, we know that it's
+/// safe to do so.
+Function *ArgPromotion::DoPromotion(Function *F,
+                                    SmallPtrSet<Argument*, 8> &ArgsToPromote,
+                              SmallPtrSet<Argument*, 8> &ByValArgsToTransform) {
+
+  // Start by computing a new prototype for the function, which is the same as
+  // the old function, but has modified arguments.
+  const FunctionType *FTy = F->getFunctionType();
+  std::vector<const Type*> Params;
+
+  typedef std::set<IndicesVector> ScalarizeTable;
+
+  // ScalarizedElements - If we are promoting a pointer that has elements
+  // accessed out of it, keep track of which elements are accessed so that we
+  // can add one argument for each.
+  //
+  // Arguments that are directly loaded will have a zero element value here, to
+  // handle cases where there are both a direct load and GEP accesses.
+  //
+  std::map<Argument*, ScalarizeTable> ScalarizedElements;
+
+  // OriginalLoads - Keep track of a representative load instruction from the
+  // original function so that we can tell the alias analysis implementation
+  // what the new GEP/Load instructions we are inserting look like.
+  std::map<IndicesVector, LoadInst*> OriginalLoads;
+
+  // Attributes - Keep track of the parameter attributes for the arguments
+  // that we are *not* promoting. For the ones that we do promote, the parameter
+  // attributes are lost
+  SmallVector<AttributeWithIndex, 8> AttributesVec;
+  const AttrListPtr &PAL = F->getAttributes();
+
+  // Add any return attributes.
+  if (Attributes attrs = PAL.getRetAttributes())
+    AttributesVec.push_back(AttributeWithIndex::get(0, attrs));
+
+  // First, determine the new argument list
+  unsigned ArgIndex = 1;
+  for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
+       ++I, ++ArgIndex) {
+    if (ByValArgsToTransform.count(I)) {
+      // Simple byval argument? Just add all the struct element types.
+      const Type *AgTy = cast<PointerType>(I->getType())->getElementType();
+      const StructType *STy = cast<StructType>(AgTy);
+      for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+        Params.push_back(STy->getElementType(i));
+      ++NumByValArgsPromoted;
+    } else if (!ArgsToPromote.count(I)) {
+      // Unchanged argument
+      Params.push_back(I->getType());
+      if (Attributes attrs = PAL.getParamAttributes(ArgIndex))
+        AttributesVec.push_back(AttributeWithIndex::get(Params.size(), attrs));
+    } else if (I->use_empty()) {
+      // Dead argument (which are always marked as promotable)
+      ++NumArgumentsDead;
+    } else {
+      // Okay, this is being promoted. This means that the only uses are loads
+      // or GEPs which are only used by loads
+
+      // In this table, we will track which indices are loaded from the argument
+      // (where direct loads are tracked as no indices).
+      ScalarizeTable &ArgIndices = ScalarizedElements[I];
+      for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI != E;
+           ++UI) {
+        Instruction *User = cast<Instruction>(*UI);
+        assert(isa<LoadInst>(User) || isa<GetElementPtrInst>(User));
+        IndicesVector Indices;
+        Indices.reserve(User->getNumOperands() - 1);
+        // Since loads will only have a single operand, and GEPs only a single
+        // non-index operand, this will record direct loads without any indices,
+        // and gep+loads with the GEP indices.
+        for (User::op_iterator II = User->op_begin() + 1, IE = User->op_end();
+             II != IE; ++II)
+          Indices.push_back(cast<ConstantInt>(*II)->getSExtValue());
+        // GEPs with a single 0 index can be merged with direct loads
+        if (Indices.size() == 1 && Indices.front() == 0)
+          Indices.clear();
+        ArgIndices.insert(Indices);
+        LoadInst *OrigLoad;
+        if (LoadInst *L = dyn_cast<LoadInst>(User))
+          OrigLoad = L;
+        else
+          // Take any load, we will use it only to update Alias Analysis
+          OrigLoad = cast<LoadInst>(User->use_back());
+        OriginalLoads[Indices] = OrigLoad;
+      }
+
+      // Add a parameter to the function for each element passed in.
+      for (ScalarizeTable::iterator SI = ArgIndices.begin(),
+             E = ArgIndices.end(); SI != E; ++SI) {
+        // not allowed to dereference ->begin() if size() is 0
+        Params.push_back(GetElementPtrInst::getIndexedType(I->getType(),
+                                                           SI->begin(),
+                                                           SI->end()));
+        assert(Params.back());
+      }
+
+      if (ArgIndices.size() == 1 && ArgIndices.begin()->empty())
+        ++NumArgumentsPromoted;
+      else
+        ++NumAggregatesPromoted;
+    }
+  }
+
+  // Add any function attributes.
+  if (Attributes attrs = PAL.getFnAttributes())
+    AttributesVec.push_back(AttributeWithIndex::get(~0, attrs));
+
+  const Type *RetTy = FTy->getReturnType();
+
+  // Work around LLVM bug PR56: the CWriter cannot emit varargs functions which
+  // have zero fixed arguments.
+  bool ExtraArgHack = false;
+  if (Params.empty() && FTy->isVarArg()) {
+    ExtraArgHack = true;
+    Params.push_back(Type::Int32Ty);
+  }
+
+  // Construct the new function type using the new arguments.
+  FunctionType *NFTy = FunctionType::get(RetTy, Params, FTy->isVarArg());
+
+  // Create the new function body and insert it into the module...
+  Function *NF = Function::Create(NFTy, F->getLinkage(), F->getName());
+  NF->copyAttributesFrom(F);
+
+  // Recompute the parameter attributes list based on the new arguments for
+  // the function.
+  NF->setAttributes(AttrListPtr::get(AttributesVec.begin(), AttributesVec.end()));
+  AttributesVec.clear();
+
+  F->getParent()->getFunctionList().insert(F, NF);
+  NF->takeName(F);
+
+  // Get the alias analysis information that we need to update to reflect our
+  // changes.
+  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+
+  // Get the callgraph information that we need to update to reflect our
+  // changes.
+  CallGraph &CG = getAnalysis<CallGraph>();
+
+  // Loop over all of the callers of the function, transforming the call sites
+  // to pass in the loaded pointers.
+  //
+  SmallVector<Value*, 16> Args;
+  while (!F->use_empty()) {
+    CallSite CS = CallSite::get(F->use_back());
+    Instruction *Call = CS.getInstruction();
+    const AttrListPtr &CallPAL = CS.getAttributes();
+
+    // Add any return attributes.
+    if (Attributes attrs = CallPAL.getRetAttributes())
+      AttributesVec.push_back(AttributeWithIndex::get(0, attrs));
+
+    // Loop over the operands, inserting GEP and loads in the caller as
+    // appropriate.
+    CallSite::arg_iterator AI = CS.arg_begin();
+    ArgIndex = 1;
+    for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
+         I != E; ++I, ++AI, ++ArgIndex)
+      if (!ArgsToPromote.count(I) && !ByValArgsToTransform.count(I)) {
+        Args.push_back(*AI);          // Unmodified argument
+
+        if (Attributes Attrs = CallPAL.getParamAttributes(ArgIndex))
+          AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs));
+
+      } else if (ByValArgsToTransform.count(I)) {
+        // Emit a GEP and load for each element of the struct.
+        const Type *AgTy = cast<PointerType>(I->getType())->getElementType();
+        const StructType *STy = cast<StructType>(AgTy);
+        Value *Idxs[2] = { ConstantInt::get(Type::Int32Ty, 0), 0 };
+        for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+          Idxs[1] = ConstantInt::get(Type::Int32Ty, i);
+          Value *Idx = GetElementPtrInst::Create(*AI, Idxs, Idxs+2,
+                                                 (*AI)->getName()+"."+utostr(i),
+                                                 Call);
+          // TODO: Tell AA about the new values?
+          Args.push_back(new LoadInst(Idx, Idx->getName()+".val", Call));
+        }
+      } else if (!I->use_empty()) {
+        // Non-dead argument: insert GEPs and loads as appropriate.
+        ScalarizeTable &ArgIndices = ScalarizedElements[I];
+        // Store the Value* version of the indices in here, but declare it now
+        // for reuse
+        std::vector<Value*> Ops;
+        for (ScalarizeTable::iterator SI = ArgIndices.begin(),
+               E = ArgIndices.end(); SI != E; ++SI) {
+          Value *V = *AI;
+          LoadInst *OrigLoad = OriginalLoads[*SI];
+          if (!SI->empty()) {
+            Ops.reserve(SI->size());
+            const Type *ElTy = V->getType();
+            for (IndicesVector::const_iterator II = SI->begin(),
+                 IE = SI->end(); II != IE; ++II) {
+              // Use i32 to index structs, and i64 for others (pointers/arrays).
+              // This satisfies GEP constraints.
+              const Type *IdxTy = (isa<StructType>(ElTy) ? Type::Int32Ty : Type::Int64Ty);
+              Ops.push_back(ConstantInt::get(IdxTy, *II));
+              // Keep track of the type we're currently indexing
+              ElTy = cast<CompositeType>(ElTy)->getTypeAtIndex(*II);
+            }
+            // And create a GEP to extract those indices
+            V = GetElementPtrInst::Create(V, Ops.begin(), Ops.end(),
+                                          V->getName()+".idx", Call);
+            Ops.clear();
+            AA.copyValue(OrigLoad->getOperand(0), V);
+          }
+          Args.push_back(new LoadInst(V, V->getName()+".val", Call));
+          AA.copyValue(OrigLoad, Args.back());
+        }
+      }
+
+    if (ExtraArgHack)
+      Args.push_back(Constant::getNullValue(Type::Int32Ty));
+
+    // Push any varargs arguments on the list
+    for (; AI != CS.arg_end(); ++AI, ++ArgIndex) {
+      Args.push_back(*AI);
+      if (Attributes Attrs = CallPAL.getParamAttributes(ArgIndex))
+        AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs));
+    }
+
+    // Add any function attributes.
+    if (Attributes attrs = CallPAL.getFnAttributes())
+      AttributesVec.push_back(AttributeWithIndex::get(~0, attrs));
+
+    Instruction *New;
+    if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+      New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+                               Args.begin(), Args.end(), "", Call);
+      cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv());
+      cast<InvokeInst>(New)->setAttributes(AttrListPtr::get(AttributesVec.begin(),
+                                                          AttributesVec.end()));
+    } else {
+      New = CallInst::Create(NF, Args.begin(), Args.end(), "", Call);
+      cast<CallInst>(New)->setCallingConv(CS.getCallingConv());
+      cast<CallInst>(New)->setAttributes(AttrListPtr::get(AttributesVec.begin(),
+                                                        AttributesVec.end()));
+      if (cast<CallInst>(Call)->isTailCall())
+        cast<CallInst>(New)->setTailCall();
+    }
+    Args.clear();
+    AttributesVec.clear();
+
+    // Update the alias analysis implementation to know that we are replacing
+    // the old call with a new one.
+    AA.replaceWithNewValue(Call, New);
+
+    // Update the callgraph to know that the callsite has been transformed.
+    CG[Call->getParent()->getParent()]->replaceCallSite(Call, New);
+
+    if (!Call->use_empty()) {
+      Call->replaceAllUsesWith(New);
+      New->takeName(Call);
+    }
+
+    // Finally, remove the old call from the program, reducing the use-count of
+    // F.
+    Call->eraseFromParent();
+  }
+
+  // Since we have now created the new function, splice the body of the old
+  // function right into the new function, leaving the old rotting hulk of the
+  // function empty.
+  NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());
+
+  // Loop over the argument list, transfering uses of the old arguments over to
+  // the new arguments, also transfering over the names as well.
+  //
+  for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(),
+       I2 = NF->arg_begin(); I != E; ++I) {
+    if (!ArgsToPromote.count(I) && !ByValArgsToTransform.count(I)) {
+      // If this is an unmodified argument, move the name and users over to the
+      // new version.
+      I->replaceAllUsesWith(I2);
+      I2->takeName(I);
+      AA.replaceWithNewValue(I, I2);
+      ++I2;
+      continue;
+    }
+
+    if (ByValArgsToTransform.count(I)) {
+      // In the callee, we create an alloca, and store each of the new incoming
+      // arguments into the alloca.
+      Instruction *InsertPt = NF->begin()->begin();
+
+      // Just add all the struct element types.
+      const Type *AgTy = cast<PointerType>(I->getType())->getElementType();
+      Value *TheAlloca = new AllocaInst(AgTy, 0, "", InsertPt);
+      const StructType *STy = cast<StructType>(AgTy);
+      Value *Idxs[2] = { ConstantInt::get(Type::Int32Ty, 0), 0 };
+
+      for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+        Idxs[1] = ConstantInt::get(Type::Int32Ty, i);
+        std::string Name = TheAlloca->getName()+"."+utostr(i);
+        Value *Idx = GetElementPtrInst::Create(TheAlloca, Idxs, Idxs+2,
+                                               Name, InsertPt);
+        I2->setName(I->getName()+"."+utostr(i));
+        new StoreInst(I2++, Idx, InsertPt);
+      }
+
+      // Anything that used the arg should now use the alloca.
+      I->replaceAllUsesWith(TheAlloca);
+      TheAlloca->takeName(I);
+      AA.replaceWithNewValue(I, TheAlloca);
+      continue;
+    }
+
+    if (I->use_empty()) {
+      AA.deleteValue(I);
+      continue;
+    }
+
+    // Otherwise, if we promoted this argument, then all users are load
+    // instructions (or GEPs with only load users), and all loads should be
+    // using the new argument that we added.
+    ScalarizeTable &ArgIndices = ScalarizedElements[I];
+
+    while (!I->use_empty()) {
+      if (LoadInst *LI = dyn_cast<LoadInst>(I->use_back())) {
+        assert(ArgIndices.begin()->empty() &&
+               "Load element should sort to front!");
+        I2->setName(I->getName()+".val");
+        LI->replaceAllUsesWith(I2);
+        AA.replaceWithNewValue(LI, I2);
+        LI->eraseFromParent();
+        DOUT << "*** Promoted load of argument '" << I->getName()
+             << "' in function '" << F->getName() << "'\n";
+      } else {
+        GetElementPtrInst *GEP = cast<GetElementPtrInst>(I->use_back());
+        IndicesVector Operands;
+        Operands.reserve(GEP->getNumIndices());
+        for (User::op_iterator II = GEP->idx_begin(), IE = GEP->idx_end();
+             II != IE; ++II)
+          Operands.push_back(cast<ConstantInt>(*II)->getSExtValue());
+
+        // GEPs with a single 0 index can be merged with direct loads
+        if (Operands.size() == 1 && Operands.front() == 0)
+          Operands.clear();
+
+        Function::arg_iterator TheArg = I2;
+        for (ScalarizeTable::iterator It = ArgIndices.begin();
+             *It != Operands; ++It, ++TheArg) {
+          assert(It != ArgIndices.end() && "GEP not handled??");
+        }
+
+        std::string NewName = I->getName();
+        for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
+            NewName += "." + utostr(Operands[i]);
+        }
+        NewName += ".val";
+        TheArg->setName(NewName);
+
+        DOUT << "*** Promoted agg argument '" << TheArg->getName()
+             << "' of function '" << NF->getName() << "'\n";
+
+        // All of the uses must be load instructions.  Replace them all with
+        // the argument specified by ArgNo.
+        while (!GEP->use_empty()) {
+          LoadInst *L = cast<LoadInst>(GEP->use_back());
+          L->replaceAllUsesWith(TheArg);
+          AA.replaceWithNewValue(L, TheArg);
+          L->eraseFromParent();
+        }
+        AA.deleteValue(GEP);
+        GEP->eraseFromParent();
+      }
+    }
+
+    // Increment I2 past all of the arguments added for this promoted pointer.
+    for (unsigned i = 0, e = ArgIndices.size(); i != e; ++i)
+      ++I2;
+  }
+
+  // Notify the alias analysis implementation that we inserted a new argument.
+  if (ExtraArgHack)
+    AA.copyValue(Constant::getNullValue(Type::Int32Ty), NF->arg_begin());
+
+
+  // Tell the alias analysis that the old function is about to disappear.
+  AA.replaceWithNewValue(F, NF);
+
+  // Now that the old function is dead, delete it.
+  F->eraseFromParent();
+  return NF;
+}
diff --git a/lib/Transforms/IPO/CMakeLists.txt b/lib/Transforms/IPO/CMakeLists.txt
new file mode 100644
index 0000000..4b85e13
--- /dev/null
+++ b/lib/Transforms/IPO/CMakeLists.txt
@@ -0,0 +1,25 @@
+add_llvm_library(LLVMipo
+  FunctionAttrs.cpp
+  ArgumentPromotion.cpp
+  ConstantMerge.cpp
+  DeadArgumentElimination.cpp
+  DeadTypeElimination.cpp
+  ExtractGV.cpp
+  GlobalDCE.cpp
+  GlobalOpt.cpp
+  IndMemRemoval.cpp
+  InlineAlways.cpp
+  Inliner.cpp
+  InlineSimple.cpp
+  Internalize.cpp
+  IPConstantPropagation.cpp
+  LoopExtractor.cpp
+  LowerSetJmp.cpp
+  MergeFunctions.cpp
+  PartialSpecialization.cpp
+  PruneEH.cpp
+  RaiseAllocations.cpp
+  StripDeadPrototypes.cpp
+  StripSymbols.cpp
+  StructRetPromotion.cpp
+  )
diff --git a/lib/Transforms/IPO/ConstantMerge.cpp b/lib/Transforms/IPO/ConstantMerge.cpp
new file mode 100644
index 0000000..237e6db
--- /dev/null
+++ b/lib/Transforms/IPO/ConstantMerge.cpp
@@ -0,0 +1,114 @@
+//===- ConstantMerge.cpp - Merge duplicate global constants ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interface to a pass that merges duplicate global
+// constants together into a single constant that is shared.  This is useful
+// because some passes (ie TraceValues) insert a lot of string constants into
+// the program, regardless of whether or not an existing string is available.
+//
+// Algorithm: ConstantMerge is designed to build up a map of available constants
+// and eliminate duplicates when it is initialized.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "constmerge"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+#include <map>
+using namespace llvm;
+
+STATISTIC(NumMerged, "Number of global constants merged");
+
+namespace {
+  struct VISIBILITY_HIDDEN ConstantMerge : public ModulePass {
+    static char ID; // Pass identification, replacement for typeid
+    ConstantMerge() : ModulePass(&ID) {}
+
+    // run - For this pass, process all of the globals in the module,
+    // eliminating duplicate constants.
+    //
+    bool runOnModule(Module &M);
+  };
+}
+
+char ConstantMerge::ID = 0;
+static RegisterPass<ConstantMerge>
+X("constmerge", "Merge Duplicate Global Constants");
+
+ModulePass *llvm::createConstantMergePass() { return new ConstantMerge(); }
+
+bool ConstantMerge::runOnModule(Module &M) {
+  // Map unique constant/section pairs to globals.  We don't want to merge
+  // globals in different sections.
+  std::map<std::pair<Constant*, std::string>, GlobalVariable*> CMap;
+
+  // Replacements - This vector contains a list of replacements to perform.
+  std::vector<std::pair<GlobalVariable*, GlobalVariable*> > Replacements;
+
+  bool MadeChange = false;
+
+  // Iterate constant merging while we are still making progress.  Merging two
+  // constants together may allow us to merge other constants together if the
+  // second level constants have initializers which point to the globals that
+  // were just merged.
+  while (1) {
+    // First pass: identify all globals that can be merged together, filling in
+    // the Replacements vector.  We cannot do the replacement in this pass
+    // because doing so may cause initializers of other globals to be rewritten,
+    // invalidating the Constant* pointers in CMap.
+    //
+    for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
+         GVI != E; ) {
+      GlobalVariable *GV = GVI++;
+      
+      // If this GV is dead, remove it.
+      GV->removeDeadConstantUsers();
+      if (GV->use_empty() && GV->hasLocalLinkage()) {
+        GV->eraseFromParent();
+        continue;
+      }
+      
+      // Only process constants with initializers.
+      if (GV->isConstant() && GV->hasInitializer()) {
+        Constant *Init = GV->getInitializer();
+
+        // Check to see if the initializer is already known.
+        GlobalVariable *&Slot = CMap[std::make_pair(Init, GV->getSection())];
+
+        if (Slot == 0) {    // Nope, add it to the map.
+          Slot = GV;
+        } else if (GV->hasLocalLinkage()) {    // Yup, this is a duplicate!
+          // Make all uses of the duplicate constant use the canonical version.
+          Replacements.push_back(std::make_pair(GV, Slot));
+        }
+      }
+    }
+
+    if (Replacements.empty())
+      return MadeChange;
+    CMap.clear();
+
+    // Now that we have figured out which replacements must be made, do them all
+    // now.  This avoid invalidating the pointers in CMap, which are unneeded
+    // now.
+    for (unsigned i = 0, e = Replacements.size(); i != e; ++i) {
+      // Eliminate any uses of the dead global...
+      Replacements[i].first->replaceAllUsesWith(Replacements[i].second);
+
+      // Delete the global value from the module...
+      M.getGlobalList().erase(Replacements[i].first);
+    }
+
+    NumMerged += Replacements.size();
+    Replacements.clear();
+  }
+}
diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp
new file mode 100644
index 0000000..666db7e
--- /dev/null
+++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -0,0 +1,944 @@
+//===-- DeadArgumentElimination.cpp - Eliminate dead arguments ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass deletes dead arguments from internal functions.  Dead argument
+// elimination removes arguments which are directly dead, as well as arguments
+// only passed into function calls as dead arguments of other functions.  This
+// pass also deletes dead return values in a similar way.
+//
+// This pass is often useful as a cleanup pass to run after aggressive
+// interprocedural passes, which add possibly-dead arguments or return values.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "deadargelim"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constant.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Compiler.h"
+#include <map>
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumArgumentsEliminated, "Number of unread args removed");
+STATISTIC(NumRetValsEliminated  , "Number of unused return values removed");
+
+namespace {
+  /// DAE - The dead argument elimination pass.
+  ///
+  class VISIBILITY_HIDDEN DAE : public ModulePass {
+  public:
+
+    /// Struct that represents (part of) either a return value or a function
+    /// argument.  Used so that arguments and return values can be used
+    /// interchangably.
+    struct RetOrArg {
+      RetOrArg(const Function* F, unsigned Idx, bool IsArg) : F(F), Idx(Idx),
+               IsArg(IsArg) {}
+      const Function *F;
+      unsigned Idx;
+      bool IsArg;
+
+      /// Make RetOrArg comparable, so we can put it into a map.
+      bool operator<(const RetOrArg &O) const {
+        if (F != O.F)
+          return F < O.F;
+        else if (Idx != O.Idx)
+          return Idx < O.Idx;
+        else
+          return IsArg < O.IsArg;
+      }
+
+      /// Make RetOrArg comparable, so we can easily iterate the multimap.
+      bool operator==(const RetOrArg &O) const {
+        return F == O.F && Idx == O.Idx && IsArg == O.IsArg;
+      }
+
+      std::string getDescription() const {
+        return std::string((IsArg ? "Argument #" : "Return value #")) 
+               + utostr(Idx) + " of function " + F->getName();
+      }
+    };
+
+    /// Liveness enum - During our initial pass over the program, we determine
+    /// that things are either alive or maybe alive. We don't mark anything
+    /// explicitly dead (even if we know they are), since anything not alive
+    /// with no registered uses (in Uses) will never be marked alive and will
+    /// thus become dead in the end.
+    enum Liveness { Live, MaybeLive };
+
+    /// Convenience wrapper
+    RetOrArg CreateRet(const Function *F, unsigned Idx) {
+      return RetOrArg(F, Idx, false);
+    }
+    /// Convenience wrapper
+    RetOrArg CreateArg(const Function *F, unsigned Idx) {
+      return RetOrArg(F, Idx, true);
+    }
+
+    typedef std::multimap<RetOrArg, RetOrArg> UseMap;
+    /// This maps a return value or argument to any MaybeLive return values or
+    /// arguments it uses. This allows the MaybeLive values to be marked live
+    /// when any of its users is marked live.
+    /// For example (indices are left out for clarity):
+    ///  - Uses[ret F] = ret G
+    ///    This means that F calls G, and F returns the value returned by G.
+    ///  - Uses[arg F] = ret G
+    ///    This means that some function calls G and passes its result as an
+    ///    argument to F.
+    ///  - Uses[ret F] = arg F
+    ///    This means that F returns one of its own arguments.
+    ///  - Uses[arg F] = arg G
+    ///    This means that G calls F and passes one of its own (G's) arguments
+    ///    directly to F.
+    UseMap Uses;
+
+    typedef std::set<RetOrArg> LiveSet;
+    typedef std::set<const Function*> LiveFuncSet;
+
+    /// This set contains all values that have been determined to be live.
+    LiveSet LiveValues;
+    /// This set contains all values that are cannot be changed in any way.
+    LiveFuncSet LiveFunctions;
+
+    typedef SmallVector<RetOrArg, 5> UseVector;
+
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    DAE() : ModulePass(&ID) {}
+    bool runOnModule(Module &M);
+
+    virtual bool ShouldHackArguments() const { return false; }
+
+  private:
+    Liveness MarkIfNotLive(RetOrArg Use, UseVector &MaybeLiveUses);
+    Liveness SurveyUse(Value::use_iterator U, UseVector &MaybeLiveUses,
+                       unsigned RetValNum = 0);
+    Liveness SurveyUses(Value *V, UseVector &MaybeLiveUses);
+
+    void SurveyFunction(Function &F);
+    void MarkValue(const RetOrArg &RA, Liveness L,
+                   const UseVector &MaybeLiveUses);
+    void MarkLive(const RetOrArg &RA);
+    void MarkLive(const Function &F);
+    void PropagateLiveness(const RetOrArg &RA);
+    bool RemoveDeadStuffFromFunction(Function *F);
+    bool DeleteDeadVarargs(Function &Fn);
+  };
+}
+
+
+char DAE::ID = 0;
+static RegisterPass<DAE>
+X("deadargelim", "Dead Argument Elimination");
+
+namespace {
+  /// DAH - DeadArgumentHacking pass - Same as dead argument elimination, but
+  /// deletes arguments to functions which are external.  This is only for use
+  /// by bugpoint.
+  struct DAH : public DAE {
+    static char ID;
+    virtual bool ShouldHackArguments() const { return true; }
+  };
+}
+
+char DAH::ID = 0;
+static RegisterPass<DAH>
+Y("deadarghaX0r", "Dead Argument Hacking (BUGPOINT USE ONLY; DO NOT USE)");
+
+/// createDeadArgEliminationPass - This pass removes arguments from functions
+/// which are not used by the body of the function.
+///
+ModulePass *llvm::createDeadArgEliminationPass() { return new DAE(); }
+ModulePass *llvm::createDeadArgHackingPass() { return new DAH(); }
+
+/// DeleteDeadVarargs - If this is an function that takes a ... list, and if
+/// llvm.vastart is never called, the varargs list is dead for the function.
+bool DAE::DeleteDeadVarargs(Function &Fn) {
+  assert(Fn.getFunctionType()->isVarArg() && "Function isn't varargs!");
+  if (Fn.isDeclaration() || !Fn.hasLocalLinkage()) return false;
+
+  // Ensure that the function is only directly called.
+  for (Value::use_iterator I = Fn.use_begin(), E = Fn.use_end(); I != E; ++I) {
+    // If this use is anything other than a call site, give up.
+    CallSite CS = CallSite::get(*I);
+    Instruction *TheCall = CS.getInstruction();
+    if (!TheCall) return false;   // Not a direct call site?
+
+    // The addr of this function is passed to the call.
+    if (!CS.isCallee(I)) return false;
+  }
+
+  // Okay, we know we can transform this function if safe.  Scan its body
+  // looking for calls to llvm.vastart.
+  for (Function::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) {
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+        if (II->getIntrinsicID() == Intrinsic::vastart)
+          return false;
+      }
+    }
+  }
+
+  // If we get here, there are no calls to llvm.vastart in the function body,
+  // remove the "..." and adjust all the calls.
+
+  // Start by computing a new prototype for the function, which is the same as
+  // the old function, but doesn't have isVarArg set.
+  const FunctionType *FTy = Fn.getFunctionType();
+  std::vector<const Type*> Params(FTy->param_begin(), FTy->param_end());
+  FunctionType *NFTy = FunctionType::get(FTy->getReturnType(), Params, false);
+  unsigned NumArgs = Params.size();
+
+  // Create the new function body and insert it into the module...
+  Function *NF = Function::Create(NFTy, Fn.getLinkage());
+  NF->copyAttributesFrom(&Fn);
+  Fn.getParent()->getFunctionList().insert(&Fn, NF);
+  NF->takeName(&Fn);
+
+  // Loop over all of the callers of the function, transforming the call sites
+  // to pass in a smaller number of arguments into the new function.
+  //
+  std::vector<Value*> Args;
+  while (!Fn.use_empty()) {
+    CallSite CS = CallSite::get(Fn.use_back());
+    Instruction *Call = CS.getInstruction();
+
+    // Pass all the same arguments.
+    Args.assign(CS.arg_begin(), CS.arg_begin()+NumArgs);
+
+    // Drop any attributes that were on the vararg arguments.
+    AttrListPtr PAL = CS.getAttributes();
+    if (!PAL.isEmpty() && PAL.getSlot(PAL.getNumSlots() - 1).Index > NumArgs) {
+      SmallVector<AttributeWithIndex, 8> AttributesVec;
+      for (unsigned i = 0; PAL.getSlot(i).Index <= NumArgs; ++i)
+        AttributesVec.push_back(PAL.getSlot(i));
+      if (Attributes FnAttrs = PAL.getFnAttributes()) 
+        AttributesVec.push_back(AttributeWithIndex::get(~0, FnAttrs));
+      PAL = AttrListPtr::get(AttributesVec.begin(), AttributesVec.end());
+    }
+
+    Instruction *New;
+    if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+      New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+                               Args.begin(), Args.end(), "", Call);
+      cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv());
+      cast<InvokeInst>(New)->setAttributes(PAL);
+    } else {
+      New = CallInst::Create(NF, Args.begin(), Args.end(), "", Call);
+      cast<CallInst>(New)->setCallingConv(CS.getCallingConv());
+      cast<CallInst>(New)->setAttributes(PAL);
+      if (cast<CallInst>(Call)->isTailCall())
+        cast<CallInst>(New)->setTailCall();
+    }
+    Args.clear();
+
+    if (!Call->use_empty())
+      Call->replaceAllUsesWith(New);
+
+    New->takeName(Call);
+
+    // Finally, remove the old call from the program, reducing the use-count of
+    // F.
+    Call->eraseFromParent();
+  }
+
+  // Since we have now created the new function, splice the body of the old
+  // function right into the new function, leaving the old rotting hulk of the
+  // function empty.
+  NF->getBasicBlockList().splice(NF->begin(), Fn.getBasicBlockList());
+
+  // Loop over the argument list, transfering uses of the old arguments over to
+  // the new arguments, also transfering over the names as well.  While we're at
+  // it, remove the dead arguments from the DeadArguments list.
+  //
+  for (Function::arg_iterator I = Fn.arg_begin(), E = Fn.arg_end(),
+       I2 = NF->arg_begin(); I != E; ++I, ++I2) {
+    // Move the name and users over to the new version.
+    I->replaceAllUsesWith(I2);
+    I2->takeName(I);
+  }
+
+  // Finally, nuke the old function.
+  Fn.eraseFromParent();
+  return true;
+}
+
+/// Convenience function that returns the number of return values. It returns 0
+/// for void functions and 1 for functions not returning a struct. It returns
+/// the number of struct elements for functions returning a struct.
+static unsigned NumRetVals(const Function *F) {
+  if (F->getReturnType() == Type::VoidTy)
+    return 0;
+  else if (const StructType *STy = dyn_cast<StructType>(F->getReturnType()))
+    return STy->getNumElements();
+  else
+    return 1;
+}
+
+/// MarkIfNotLive - This checks Use for liveness in LiveValues. If Use is not
+/// live, it adds Use to the MaybeLiveUses argument. Returns the determined
+/// liveness of Use.
+DAE::Liveness DAE::MarkIfNotLive(RetOrArg Use, UseVector &MaybeLiveUses) {
+  // We're live if our use or its Function is already marked as live.
+  if (LiveFunctions.count(Use.F) || LiveValues.count(Use))
+    return Live;
+
+  // We're maybe live otherwise, but remember that we must become live if
+  // Use becomes live.
+  MaybeLiveUses.push_back(Use);
+  return MaybeLive;
+}
+
+
+/// SurveyUse - This looks at a single use of an argument or return value
+/// and determines if it should be alive or not. Adds this use to MaybeLiveUses
+/// if it causes the used value to become MaybeAlive.
+///
+/// RetValNum is the return value number to use when this use is used in a
+/// return instruction. This is used in the recursion, you should always leave
+/// it at 0.
+DAE::Liveness DAE::SurveyUse(Value::use_iterator U, UseVector &MaybeLiveUses,
+                             unsigned RetValNum) {
+    Value *V = *U;
+    if (ReturnInst *RI = dyn_cast<ReturnInst>(V)) {
+      // The value is returned from a function. It's only live when the
+      // function's return value is live. We use RetValNum here, for the case
+      // that U is really a use of an insertvalue instruction that uses the
+      // orginal Use.
+      RetOrArg Use = CreateRet(RI->getParent()->getParent(), RetValNum);
+      // We might be live, depending on the liveness of Use.
+      return MarkIfNotLive(Use, MaybeLiveUses);
+    }
+    if (InsertValueInst *IV = dyn_cast<InsertValueInst>(V)) {
+      if (U.getOperandNo() != InsertValueInst::getAggregateOperandIndex()
+          && IV->hasIndices())
+        // The use we are examining is inserted into an aggregate. Our liveness
+        // depends on all uses of that aggregate, but if it is used as a return
+        // value, only index at which we were inserted counts.
+        RetValNum = *IV->idx_begin();
+
+      // Note that if we are used as the aggregate operand to the insertvalue,
+      // we don't change RetValNum, but do survey all our uses.
+
+      Liveness Result = MaybeLive;
+      for (Value::use_iterator I = IV->use_begin(),
+           E = V->use_end(); I != E; ++I) {
+        Result = SurveyUse(I, MaybeLiveUses, RetValNum);
+        if (Result == Live)
+          break;
+      }
+      return Result;
+    }
+    CallSite CS = CallSite::get(V);
+    if (CS.getInstruction()) {
+      Function *F = CS.getCalledFunction();
+      if (F) {
+        // Used in a direct call.
+  
+        // Find the argument number. We know for sure that this use is an
+        // argument, since if it was the function argument this would be an
+        // indirect call and the we know can't be looking at a value of the
+        // label type (for the invoke instruction).
+        unsigned ArgNo = CS.getArgumentNo(U.getOperandNo());
+
+        if (ArgNo >= F->getFunctionType()->getNumParams())
+          // The value is passed in through a vararg! Must be live.
+          return Live;
+
+        assert(CS.getArgument(ArgNo) 
+               == CS.getInstruction()->getOperand(U.getOperandNo()) 
+               && "Argument is not where we expected it");
+
+        // Value passed to a normal call. It's only live when the corresponding
+        // argument to the called function turns out live.
+        RetOrArg Use = CreateArg(F, ArgNo);
+        return MarkIfNotLive(Use, MaybeLiveUses);
+      }
+    }
+    // Used in any other way? Value must be live.
+    return Live;
+}
+
+/// SurveyUses - This looks at all the uses of the given value
+/// Returns the Liveness deduced from the uses of this value.
+///
+/// Adds all uses that cause the result to be MaybeLive to MaybeLiveRetUses. If
+/// the result is Live, MaybeLiveUses might be modified but its content should
+/// be ignored (since it might not be complete).
+DAE::Liveness DAE::SurveyUses(Value *V, UseVector &MaybeLiveUses) {
+  // Assume it's dead (which will only hold if there are no uses at all..).
+  Liveness Result = MaybeLive;
+  // Check each use.
+  for (Value::use_iterator I = V->use_begin(),
+       E = V->use_end(); I != E; ++I) {
+    Result = SurveyUse(I, MaybeLiveUses);
+    if (Result == Live)
+      break;
+  }
+  return Result;
+}
+
+// SurveyFunction - This performs the initial survey of the specified function,
+// checking out whether or not it uses any of its incoming arguments or whether
+// any callers use the return value.  This fills in the LiveValues set and Uses
+// map.
+//
+// We consider arguments of non-internal functions to be intrinsically alive as
+// well as arguments to functions which have their "address taken".
+//
+void DAE::SurveyFunction(Function &F) {
+  unsigned RetCount = NumRetVals(&F);
+  // Assume all return values are dead
+  typedef SmallVector<Liveness, 5> RetVals;
+  RetVals RetValLiveness(RetCount, MaybeLive);
+
+  typedef SmallVector<UseVector, 5> RetUses;
+  // These vectors map each return value to the uses that make it MaybeLive, so
+  // we can add those to the Uses map if the return value really turns out to be
+  // MaybeLive. Initialized to a list of RetCount empty lists.
+  RetUses MaybeLiveRetUses(RetCount);
+
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+    if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator()))
+      if (RI->getNumOperands() != 0 && RI->getOperand(0)->getType()
+          != F.getFunctionType()->getReturnType()) {
+        // We don't support old style multiple return values.
+        MarkLive(F);
+        return;
+      }
+
+  if (!F.hasLocalLinkage() && (!ShouldHackArguments() || F.isIntrinsic())) {
+    MarkLive(F);
+    return;
+  }
+
+  DOUT << "DAE - Inspecting callers for fn: " << F.getName() << "\n";
+  // Keep track of the number of live retvals, so we can skip checks once all
+  // of them turn out to be live.
+  unsigned NumLiveRetVals = 0;
+  const Type *STy = dyn_cast<StructType>(F.getReturnType());
+  // Loop all uses of the function.
+  for (Value::use_iterator I = F.use_begin(), E = F.use_end(); I != E; ++I) {
+    // If the function is PASSED IN as an argument, its address has been
+    // taken.
+    CallSite CS = CallSite::get(*I);
+    if (!CS.getInstruction() || !CS.isCallee(I)) {
+      MarkLive(F);
+      return;
+    }
+
+    // If this use is anything other than a call site, the function is alive.
+    Instruction *TheCall = CS.getInstruction();
+    if (!TheCall) {   // Not a direct call site?
+      MarkLive(F);
+      return;
+    }
+
+    // If we end up here, we are looking at a direct call to our function.
+
+    // Now, check how our return value(s) is/are used in this caller. Don't
+    // bother checking return values if all of them are live already.
+    if (NumLiveRetVals != RetCount) {
+      if (STy) {
+        // Check all uses of the return value.
+        for (Value::use_iterator I = TheCall->use_begin(),
+             E = TheCall->use_end(); I != E; ++I) {
+          ExtractValueInst *Ext = dyn_cast<ExtractValueInst>(*I);
+          if (Ext && Ext->hasIndices()) {
+            // This use uses a part of our return value, survey the uses of
+            // that part and store the results for this index only.
+            unsigned Idx = *Ext->idx_begin();
+            if (RetValLiveness[Idx] != Live) {
+              RetValLiveness[Idx] = SurveyUses(Ext, MaybeLiveRetUses[Idx]);
+              if (RetValLiveness[Idx] == Live)
+                NumLiveRetVals++;
+            }
+          } else {
+            // Used by something else than extractvalue. Mark all return
+            // values as live.
+            for (unsigned i = 0; i != RetCount; ++i )
+              RetValLiveness[i] = Live;
+            NumLiveRetVals = RetCount;
+            break;
+          }
+        }
+      } else {
+        // Single return value
+        RetValLiveness[0] = SurveyUses(TheCall, MaybeLiveRetUses[0]);
+        if (RetValLiveness[0] == Live)
+          NumLiveRetVals = RetCount;
+      }
+    }
+  }
+
+  // Now we've inspected all callers, record the liveness of our return values.
+  for (unsigned i = 0; i != RetCount; ++i)
+    MarkValue(CreateRet(&F, i), RetValLiveness[i], MaybeLiveRetUses[i]);
+
+  DOUT << "DAE - Inspecting args for fn: " << F.getName() << "\n";
+
+  // Now, check all of our arguments.
+  unsigned i = 0;
+  UseVector MaybeLiveArgUses;
+  for (Function::arg_iterator AI = F.arg_begin(),
+       E = F.arg_end(); AI != E; ++AI, ++i) {
+    // See what the effect of this use is (recording any uses that cause
+    // MaybeLive in MaybeLiveArgUses).
+    Liveness Result = SurveyUses(AI, MaybeLiveArgUses);
+    // Mark the result.
+    MarkValue(CreateArg(&F, i), Result, MaybeLiveArgUses);
+    // Clear the vector again for the next iteration.
+    MaybeLiveArgUses.clear();
+  }
+}
+
+/// MarkValue - This function marks the liveness of RA depending on L. If L is
+/// MaybeLive, it also takes all uses in MaybeLiveUses and records them in Uses,
+/// such that RA will be marked live if any use in MaybeLiveUses gets marked
+/// live later on.
+void DAE::MarkValue(const RetOrArg &RA, Liveness L,
+                    const UseVector &MaybeLiveUses) {
+  switch (L) {
+    case Live: MarkLive(RA); break;
+    case MaybeLive:
+    {
+      // Note any uses of this value, so this return value can be
+      // marked live whenever one of the uses becomes live.
+      for (UseVector::const_iterator UI = MaybeLiveUses.begin(),
+           UE = MaybeLiveUses.end(); UI != UE; ++UI)
+        Uses.insert(std::make_pair(*UI, RA));
+      break;
+    }
+  }
+}
+
+/// MarkLive - Mark the given Function as alive, meaning that it cannot be
+/// changed in any way. Additionally,
+/// mark any values that are used as this function's parameters or by its return
+/// values (according to Uses) live as well.
+void DAE::MarkLive(const Function &F) {
+    DOUT << "DAE - Intrinsically live fn: " << F.getName() << "\n";
+    // Mark the function as live.
+    LiveFunctions.insert(&F);
+    // Mark all arguments as live.
+    for (unsigned i = 0, e = F.arg_size(); i != e; ++i)
+      PropagateLiveness(CreateArg(&F, i));
+    // Mark all return values as live.
+    for (unsigned i = 0, e = NumRetVals(&F); i != e; ++i)
+      PropagateLiveness(CreateRet(&F, i));
+}
+
+/// MarkLive - Mark the given return value or argument as live. Additionally,
+/// mark any values that are used by this value (according to Uses) live as
+/// well.
+void DAE::MarkLive(const RetOrArg &RA) {
+  if (LiveFunctions.count(RA.F))
+    return; // Function was already marked Live.
+
+  if (!LiveValues.insert(RA).second)
+    return; // We were already marked Live.
+
+  DOUT << "DAE - Marking " << RA.getDescription() << " live\n";
+  PropagateLiveness(RA);
+}
+
+/// PropagateLiveness - Given that RA is a live value, propagate it's liveness
+/// to any other values it uses (according to Uses).
+void DAE::PropagateLiveness(const RetOrArg &RA) {
+  // We don't use upper_bound (or equal_range) here, because our recursive call
+  // to ourselves is likely to cause the upper_bound (which is the first value
+  // not belonging to RA) to become erased and the iterator invalidated.
+  UseMap::iterator Begin = Uses.lower_bound(RA);
+  UseMap::iterator E = Uses.end();
+  UseMap::iterator I;
+  for (I = Begin; I != E && I->first == RA; ++I)
+    MarkLive(I->second);
+
+  // Erase RA from the Uses map (from the lower bound to wherever we ended up
+  // after the loop).
+  Uses.erase(Begin, I);
+}
+
+// RemoveDeadStuffFromFunction - Remove any arguments and return values from F
+// that are not in LiveValues. Transform the function and all of the callees of
+// the function to not have these arguments and return values.
+//
+bool DAE::RemoveDeadStuffFromFunction(Function *F) {
+  // Don't modify fully live functions
+  if (LiveFunctions.count(F))
+    return false;
+
+  // Start by computing a new prototype for the function, which is the same as
+  // the old function, but has fewer arguments and a different return type.
+  const FunctionType *FTy = F->getFunctionType();
+  std::vector<const Type*> Params;
+
+  // Set up to build a new list of parameter attributes.
+  SmallVector<AttributeWithIndex, 8> AttributesVec;
+  const AttrListPtr &PAL = F->getAttributes();
+
+  // The existing function return attributes.
+  Attributes RAttrs = PAL.getRetAttributes();
+  Attributes FnAttrs = PAL.getFnAttributes();
+
+  // Find out the new return value.
+
+  const Type *RetTy = FTy->getReturnType();
+  const Type *NRetTy = NULL;
+  unsigned RetCount = NumRetVals(F);
+  // -1 means unused, other numbers are the new index
+  SmallVector<int, 5> NewRetIdxs(RetCount, -1);
+  std::vector<const Type*> RetTypes;
+  if (RetTy == Type::VoidTy) {
+    NRetTy = Type::VoidTy;
+  } else {
+    const StructType *STy = dyn_cast<StructType>(RetTy);
+    if (STy)
+      // Look at each of the original return values individually.
+      for (unsigned i = 0; i != RetCount; ++i) {
+        RetOrArg Ret = CreateRet(F, i);
+        if (LiveValues.erase(Ret)) {
+          RetTypes.push_back(STy->getElementType(i));
+          NewRetIdxs[i] = RetTypes.size() - 1;
+        } else {
+          ++NumRetValsEliminated;
+          DOUT << "DAE - Removing return value " << i << " from "
+               << F->getNameStart() << "\n";
+        }
+      }
+    else
+      // We used to return a single value.
+      if (LiveValues.erase(CreateRet(F, 0))) {
+        RetTypes.push_back(RetTy);
+        NewRetIdxs[0] = 0;
+      } else {
+        DOUT << "DAE - Removing return value from " << F->getNameStart()
+             << "\n";
+        ++NumRetValsEliminated;
+      }
+    if (RetTypes.size() > 1)
+      // More than one return type? Return a struct with them. Also, if we used
+      // to return a struct and didn't change the number of return values,
+      // return a struct again. This prevents changing {something} into
+      // something and {} into void.
+      // Make the new struct packed if we used to return a packed struct
+      // already.
+      NRetTy = StructType::get(RetTypes, STy->isPacked());
+    else if (RetTypes.size() == 1)
+      // One return type? Just a simple value then, but only if we didn't use to
+      // return a struct with that simple value before.
+      NRetTy = RetTypes.front();
+    else if (RetTypes.size() == 0)
+      // No return types? Make it void, but only if we didn't use to return {}.
+      NRetTy = Type::VoidTy;
+  }
+
+  assert(NRetTy && "No new return type found?");
+
+  // Remove any incompatible attributes, but only if we removed all return
+  // values. Otherwise, ensure that we don't have any conflicting attributes
+  // here. Currently, this should not be possible, but special handling might be
+  // required when new return value attributes are added.
+  if (NRetTy == Type::VoidTy)
+    RAttrs &= ~Attribute::typeIncompatible(NRetTy);
+  else
+    assert((RAttrs & Attribute::typeIncompatible(NRetTy)) == 0 
+           && "Return attributes no longer compatible?");
+
+  if (RAttrs)
+    AttributesVec.push_back(AttributeWithIndex::get(0, RAttrs));
+
+  // Remember which arguments are still alive.
+  SmallVector<bool, 10> ArgAlive(FTy->getNumParams(), false);
+  // Construct the new parameter list from non-dead arguments. Also construct
+  // a new set of parameter attributes to correspond. Skip the first parameter
+  // attribute, since that belongs to the return value.
+  unsigned i = 0;
+  for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
+       I != E; ++I, ++i) {
+    RetOrArg Arg = CreateArg(F, i);
+    if (LiveValues.erase(Arg)) {
+      Params.push_back(I->getType());
+      ArgAlive[i] = true;
+
+      // Get the original parameter attributes (skipping the first one, that is
+      // for the return value.
+      if (Attributes Attrs = PAL.getParamAttributes(i + 1))
+        AttributesVec.push_back(AttributeWithIndex::get(Params.size(), Attrs));
+    } else {
+      ++NumArgumentsEliminated;
+      DOUT << "DAE - Removing argument " << i << " (" << I->getNameStart()
+           << ") from " << F->getNameStart() << "\n";
+    }
+  }
+
+  if (FnAttrs != Attribute::None) 
+    AttributesVec.push_back(AttributeWithIndex::get(~0, FnAttrs));
+
+  // Reconstruct the AttributesList based on the vector we constructed.
+  AttrListPtr NewPAL = AttrListPtr::get(AttributesVec.begin(), AttributesVec.end());
+
+  // Work around LLVM bug PR56: the CWriter cannot emit varargs functions which
+  // have zero fixed arguments.
+  //
+  // Note that we apply this hack for a vararg fuction that does not have any
+  // arguments anymore, but did have them before (so don't bother fixing
+  // functions that were already broken wrt CWriter).
+  bool ExtraArgHack = false;
+  if (Params.empty() && FTy->isVarArg() && FTy->getNumParams() != 0) {
+    ExtraArgHack = true;
+    Params.push_back(Type::Int32Ty);
+  }
+
+  // Create the new function type based on the recomputed parameters.
+  FunctionType *NFTy = FunctionType::get(NRetTy, Params, FTy->isVarArg());
+
+  // No change?
+  if (NFTy == FTy)
+    return false;
+
+  // Create the new function body and insert it into the module...
+  Function *NF = Function::Create(NFTy, F->getLinkage());
+  NF->copyAttributesFrom(F);
+  NF->setAttributes(NewPAL);
+  // Insert the new function before the old function, so we won't be processing
+  // it again.
+  F->getParent()->getFunctionList().insert(F, NF);
+  NF->takeName(F);
+
+  // Loop over all of the callers of the function, transforming the call sites
+  // to pass in a smaller number of arguments into the new function.
+  //
+  std::vector<Value*> Args;
+  while (!F->use_empty()) {
+    CallSite CS = CallSite::get(F->use_back());
+    Instruction *Call = CS.getInstruction();
+
+    AttributesVec.clear();
+    const AttrListPtr &CallPAL = CS.getAttributes();
+
+    // The call return attributes.
+    Attributes RAttrs = CallPAL.getRetAttributes();
+    Attributes FnAttrs = CallPAL.getFnAttributes();
+    // Adjust in case the function was changed to return void.
+    RAttrs &= ~Attribute::typeIncompatible(NF->getReturnType());
+    if (RAttrs)
+      AttributesVec.push_back(AttributeWithIndex::get(0, RAttrs));
+
+    // Declare these outside of the loops, so we can reuse them for the second
+    // loop, which loops the varargs.
+    CallSite::arg_iterator I = CS.arg_begin();
+    unsigned i = 0;
+    // Loop over those operands, corresponding to the normal arguments to the
+    // original function, and add those that are still alive.
+    for (unsigned e = FTy->getNumParams(); i != e; ++I, ++i)
+      if (ArgAlive[i]) {
+        Args.push_back(*I);
+        // Get original parameter attributes, but skip return attributes.
+        if (Attributes Attrs = CallPAL.getParamAttributes(i + 1))
+          AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs));
+      }
+
+    if (ExtraArgHack)
+      Args.push_back(UndefValue::get(Type::Int32Ty));
+
+    // Push any varargs arguments on the list. Don't forget their attributes.
+    for (CallSite::arg_iterator E = CS.arg_end(); I != E; ++I, ++i) {
+      Args.push_back(*I);
+      if (Attributes Attrs = CallPAL.getParamAttributes(i + 1))
+        AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs));
+    }
+
+    if (FnAttrs != Attribute::None)
+      AttributesVec.push_back(AttributeWithIndex::get(~0, FnAttrs));
+
+    // Reconstruct the AttributesList based on the vector we constructed.
+    AttrListPtr NewCallPAL = AttrListPtr::get(AttributesVec.begin(),
+                                              AttributesVec.end());
+
+    Instruction *New;
+    if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+      New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+                               Args.begin(), Args.end(), "", Call);
+      cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv());
+      cast<InvokeInst>(New)->setAttributes(NewCallPAL);
+    } else {
+      New = CallInst::Create(NF, Args.begin(), Args.end(), "", Call);
+      cast<CallInst>(New)->setCallingConv(CS.getCallingConv());
+      cast<CallInst>(New)->setAttributes(NewCallPAL);
+      if (cast<CallInst>(Call)->isTailCall())
+        cast<CallInst>(New)->setTailCall();
+    }
+    Args.clear();
+
+    if (!Call->use_empty()) {
+      if (New->getType() == Call->getType()) {
+        // Return type not changed? Just replace users then.
+        Call->replaceAllUsesWith(New);
+        New->takeName(Call);
+      } else if (New->getType() == Type::VoidTy) {
+        // Our return value has uses, but they will get removed later on.
+        // Replace by null for now.
+        Call->replaceAllUsesWith(Constant::getNullValue(Call->getType()));
+      } else {
+        assert(isa<StructType>(RetTy) &&
+               "Return type changed, but not into a void. The old return type"
+               " must have been a struct!");
+        Instruction *InsertPt = Call;
+        if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+          BasicBlock::iterator IP = II->getNormalDest()->begin();
+          while (isa<PHINode>(IP)) ++IP;
+          InsertPt = IP;
+        }
+          
+        // We used to return a struct. Instead of doing smart stuff with all the
+        // uses of this struct, we will just rebuild it using
+        // extract/insertvalue chaining and let instcombine clean that up.
+        //
+        // Start out building up our return value from undef
+        Value *RetVal = llvm::UndefValue::get(RetTy);
+        for (unsigned i = 0; i != RetCount; ++i)
+          if (NewRetIdxs[i] != -1) {
+            Value *V;
+            if (RetTypes.size() > 1)
+              // We are still returning a struct, so extract the value from our
+              // return value
+              V = ExtractValueInst::Create(New, NewRetIdxs[i], "newret",
+                                           InsertPt);
+            else
+              // We are now returning a single element, so just insert that
+              V = New;
+            // Insert the value at the old position
+            RetVal = InsertValueInst::Create(RetVal, V, i, "oldret", InsertPt);
+          }
+        // Now, replace all uses of the old call instruction with the return
+        // struct we built
+        Call->replaceAllUsesWith(RetVal);
+        New->takeName(Call);
+      }
+    }
+
+    // Finally, remove the old call from the program, reducing the use-count of
+    // F.
+    Call->eraseFromParent();
+  }
+
+  // Since we have now created the new function, splice the body of the old
+  // function right into the new function, leaving the old rotting hulk of the
+  // function empty.
+  NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());
+
+  // Loop over the argument list, transfering uses of the old arguments over to
+  // the new arguments, also transfering over the names as well.
+  i = 0;
+  for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(),
+       I2 = NF->arg_begin(); I != E; ++I, ++i)
+    if (ArgAlive[i]) {
+      // If this is a live argument, move the name and users over to the new
+      // version.
+      I->replaceAllUsesWith(I2);
+      I2->takeName(I);
+      ++I2;
+    } else {
+      // If this argument is dead, replace any uses of it with null constants
+      // (these are guaranteed to become unused later on).
+      I->replaceAllUsesWith(Constant::getNullValue(I->getType()));
+    }
+
+  // If we change the return value of the function we must rewrite any return
+  // instructions.  Check this now.
+  if (F->getReturnType() != NF->getReturnType())
+    for (Function::iterator BB = NF->begin(), E = NF->end(); BB != E; ++BB)
+      if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
+        Value *RetVal;
+
+        if (NFTy->getReturnType() == Type::VoidTy) {
+          RetVal = 0;
+        } else {
+          assert (isa<StructType>(RetTy));
+          // The original return value was a struct, insert
+          // extractvalue/insertvalue chains to extract only the values we need
+          // to return and insert them into our new result.
+          // This does generate messy code, but we'll let it to instcombine to
+          // clean that up.
+          Value *OldRet = RI->getOperand(0);
+          // Start out building up our return value from undef
+          RetVal = llvm::UndefValue::get(NRetTy);
+          for (unsigned i = 0; i != RetCount; ++i)
+            if (NewRetIdxs[i] != -1) {
+              ExtractValueInst *EV = ExtractValueInst::Create(OldRet, i,
+                                                              "oldret", RI);
+              if (RetTypes.size() > 1) {
+                // We're still returning a struct, so reinsert the value into
+                // our new return value at the new index
+
+                RetVal = InsertValueInst::Create(RetVal, EV, NewRetIdxs[i],
+                                                 "newret", RI);
+              } else {
+                // We are now only returning a simple value, so just return the
+                // extracted value.
+                RetVal = EV;
+              }
+            }
+        }
+        // Replace the return instruction with one returning the new return
+        // value (possibly 0 if we became void).
+        ReturnInst::Create(RetVal, RI);
+        BB->getInstList().erase(RI);
+      }
+
+  // Now that the old function is dead, delete it.
+  F->eraseFromParent();
+
+  return true;
+}
+
+bool DAE::runOnModule(Module &M) {
+  bool Changed = false;
+
+  // First pass: Do a simple check to see if any functions can have their "..."
+  // removed.  We can do this if they never call va_start.  This loop cannot be
+  // fused with the next loop, because deleting a function invalidates
+  // information computed while surveying other functions.
+  DOUT << "DAE - Deleting dead varargs\n";
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
+    Function &F = *I++;
+    if (F.getFunctionType()->isVarArg())
+      Changed |= DeleteDeadVarargs(F);
+  }
+
+  // Second phase:loop through the module, determining which arguments are live.
+  // We assume all arguments are dead unless proven otherwise (allowing us to
+  // determine that dead arguments passed into recursive functions are dead).
+  //
+  DOUT << "DAE - Determining liveness\n";
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+    SurveyFunction(*I);
+  
+  // Now, remove all dead arguments and return values from each function in
+  // turn
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
+    // Increment now, because the function will probably get removed (ie
+    // replaced by a new one).
+    Function *F = I++;
+    Changed |= RemoveDeadStuffFromFunction(F);
+  }
+  return Changed;
+}
diff --git a/lib/Transforms/IPO/DeadTypeElimination.cpp b/lib/Transforms/IPO/DeadTypeElimination.cpp
new file mode 100644
index 0000000..85aed2b
--- /dev/null
+++ b/lib/Transforms/IPO/DeadTypeElimination.cpp
@@ -0,0 +1,107 @@
+//===- DeadTypeElimination.cpp - Eliminate unused types for symbol table --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is used to cleanup the output of GCC.  It eliminate names for types
+// that are unused in the entire translation unit, using the FindUsedTypes pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "deadtypeelim"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Analysis/FindUsedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/TypeSymbolTable.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+using namespace llvm;
+
+STATISTIC(NumKilled, "Number of unused typenames removed from symtab");
+
+namespace {
+  struct VISIBILITY_HIDDEN DTE : public ModulePass {
+    static char ID; // Pass identification, replacement for typeid
+    DTE() : ModulePass(&ID) {}
+
+    // doPassInitialization - For this pass, it removes global symbol table
+    // entries for primitive types.  These are never used for linking in GCC and
+    // they make the output uglier to look at, so we nuke them.
+    //
+    // Also, initialize instance variables.
+    //
+    bool runOnModule(Module &M);
+
+    // getAnalysisUsage - This function needs FindUsedTypes to do its job...
+    //
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<FindUsedTypes>();
+    }
+  };
+}
+
+char DTE::ID = 0;
+static RegisterPass<DTE> X("deadtypeelim", "Dead Type Elimination");
+
+ModulePass *llvm::createDeadTypeEliminationPass() {
+  return new DTE();
+}
+
+
+// ShouldNukeSymtabEntry - Return true if this module level symbol table entry
+// should be eliminated.
+//
+static inline bool ShouldNukeSymtabEntry(const Type *Ty){
+  // Nuke all names for primitive types!
+  if (Ty->isPrimitiveType() || Ty->isInteger()) 
+    return true;
+
+  // Nuke all pointers to primitive types as well...
+  if (const PointerType *PT = dyn_cast<PointerType>(Ty))
+    if (PT->getElementType()->isPrimitiveType() ||
+        PT->getElementType()->isInteger()) 
+      return true;
+
+  return false;
+}
+
+// run - For this pass, it removes global symbol table entries for primitive
+// types.  These are never used for linking in GCC and they make the output
+// uglier to look at, so we nuke them.  Also eliminate types that are never used
+// in the entire program as indicated by FindUsedTypes.
+//
+bool DTE::runOnModule(Module &M) {
+  bool Changed = false;
+
+  TypeSymbolTable &ST = M.getTypeSymbolTable();
+  std::set<const Type *> UsedTypes = getAnalysis<FindUsedTypes>().getTypes();
+
+  // Check the symbol table for superfluous type entries...
+  //
+  // Grab the 'type' plane of the module symbol...
+  TypeSymbolTable::iterator TI = ST.begin();
+  TypeSymbolTable::iterator TE = ST.end();
+  while ( TI != TE ) {
+    // If this entry should be unconditionally removed, or if we detect that
+    // the type is not used, remove it.
+    const Type *RHS = TI->second;
+    if (ShouldNukeSymtabEntry(RHS) || !UsedTypes.count(RHS)) {
+      ST.remove(TI++);
+      ++NumKilled;
+      Changed = true;
+    } else {
+      ++TI;
+      // We only need to leave one name for each type.
+      UsedTypes.erase(RHS);
+    }
+  }
+
+  return Changed;
+}
+
+// vim: sw=2
diff --git a/lib/Transforms/IPO/ExtractGV.cpp b/lib/Transforms/IPO/ExtractGV.cpp
new file mode 100644
index 0000000..0c529d2
--- /dev/null
+++ b/lib/Transforms/IPO/ExtractGV.cpp
@@ -0,0 +1,173 @@
+//===-- ExtractGV.cpp - Global Value extraction pass ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass extracts global values
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Constants.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Support/Compiler.h"
+#include <algorithm>
+using namespace llvm;
+
+namespace {
+  /// @brief A pass to extract specific functions and their dependencies.
+  class VISIBILITY_HIDDEN GVExtractorPass : public ModulePass {
+    std::vector<GlobalValue*> Named;
+    bool deleteStuff;
+    bool reLink;
+  public:
+    static char ID; // Pass identification, replacement for typeid
+
+    /// FunctionExtractorPass - If deleteFn is true, this pass deletes as the
+    /// specified function. Otherwise, it deletes as much of the module as
+    /// possible, except for the function specified.
+    ///
+    explicit GVExtractorPass(std::vector<GlobalValue*>& GVs, bool deleteS = true,
+                             bool relinkCallees = false)
+      : ModulePass(&ID), Named(GVs), deleteStuff(deleteS),
+        reLink(relinkCallees) {}
+
+    bool runOnModule(Module &M) {
+      if (Named.size() == 0) {
+        return false;  // Nothing to extract
+      }
+      
+      if (deleteStuff)
+        return deleteGV();
+      M.setModuleInlineAsm("");
+      return isolateGV(M);
+    }
+
+    bool deleteGV() {
+      for (std::vector<GlobalValue*>::iterator GI = Named.begin(), 
+             GE = Named.end(); GI != GE; ++GI) {
+        if (Function* NamedFunc = dyn_cast<Function>(*GI)) {
+         // If we're in relinking mode, set linkage of all internal callees to
+         // external. This will allow us extract function, and then - link
+         // everything together
+         if (reLink) {
+           for (Function::iterator B = NamedFunc->begin(), BE = NamedFunc->end();
+                B != BE; ++B) {
+             for (BasicBlock::iterator I = B->begin(), E = B->end();
+                  I != E; ++I) {
+               if (CallInst* callInst = dyn_cast<CallInst>(&*I)) {
+                 Function* Callee = callInst->getCalledFunction();
+                 if (Callee && Callee->hasLocalLinkage())
+                   Callee->setLinkage(GlobalValue::ExternalLinkage);
+               }
+             }
+           }
+         }
+         
+         NamedFunc->setLinkage(GlobalValue::ExternalLinkage);
+         NamedFunc->deleteBody();
+         assert(NamedFunc->isDeclaration() && "This didn't make the function external!");
+       } else {
+          if (!(*GI)->isDeclaration()) {
+            cast<GlobalVariable>(*GI)->setInitializer(0);  //clear the initializer
+            (*GI)->setLinkage(GlobalValue::ExternalLinkage);
+          }
+        }
+      }
+      return true;
+    }
+
+    bool isolateGV(Module &M) {
+      // Mark all globals internal
+      // FIXME: what should we do with private linkage?
+      for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E; ++I)
+        if (!I->isDeclaration()) {
+          I->setLinkage(GlobalValue::InternalLinkage);
+        }
+      for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+        if (!I->isDeclaration()) {
+          I->setLinkage(GlobalValue::InternalLinkage);
+        }
+
+      // Make sure our result is globally accessible...
+      // by putting them in the used array
+      {
+        std::vector<Constant *> AUGs;
+        const Type *SBP= PointerType::getUnqual(Type::Int8Ty);
+        for (std::vector<GlobalValue*>::iterator GI = Named.begin(), 
+               GE = Named.end(); GI != GE; ++GI) {
+          (*GI)->setLinkage(GlobalValue::ExternalLinkage);
+          AUGs.push_back(ConstantExpr::getBitCast(*GI, SBP));
+        }
+        ArrayType *AT = ArrayType::get(SBP, AUGs.size());
+        Constant *Init = ConstantArray::get(AT, AUGs);
+        GlobalValue *gv = new GlobalVariable(AT, false, 
+                                             GlobalValue::AppendingLinkage, 
+                                             Init, "llvm.used", &M);
+        gv->setSection("llvm.metadata");
+      }
+
+      // All of the functions may be used by global variables or the named
+      // globals.  Loop through them and create a new, external functions that
+      // can be "used", instead of ones with bodies.
+      std::vector<Function*> NewFunctions;
+
+      Function *Last = --M.end();  // Figure out where the last real fn is.
+
+      for (Module::iterator I = M.begin(); ; ++I) {
+        if (std::find(Named.begin(), Named.end(), &*I) == Named.end()) {
+          Function *New = Function::Create(I->getFunctionType(),
+                                           GlobalValue::ExternalLinkage);
+          New->copyAttributesFrom(I);
+
+          // If it's not the named function, delete the body of the function
+          I->dropAllReferences();
+
+          M.getFunctionList().push_back(New);
+          NewFunctions.push_back(New);
+          New->takeName(I);
+        }
+
+        if (&*I == Last) break;  // Stop after processing the last function
+      }
+
+      // Now that we have replacements all set up, loop through the module,
+      // deleting the old functions, replacing them with the newly created
+      // functions.
+      if (!NewFunctions.empty()) {
+        unsigned FuncNum = 0;
+        Module::iterator I = M.begin();
+        do {
+          if (std::find(Named.begin(), Named.end(), &*I) == Named.end()) {
+            // Make everything that uses the old function use the new dummy fn
+            I->replaceAllUsesWith(NewFunctions[FuncNum++]);
+
+            Function *Old = I;
+            ++I;  // Move the iterator to the new function
+
+            // Delete the old function!
+            M.getFunctionList().erase(Old);
+
+          } else {
+            ++I;  // Skip the function we are extracting
+          }
+        } while (&*I != NewFunctions[0]);
+      }
+
+      return true;
+    }
+  };
+
+  char GVExtractorPass::ID = 0;
+}
+
+ModulePass *llvm::createGVExtractionPass(std::vector<GlobalValue*>& GVs, 
+                                         bool deleteFn, bool relinkCallees) {
+  return new GVExtractorPass(GVs, deleteFn, relinkCallees);
+}
diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp
new file mode 100644
index 0000000..e831524
--- /dev/null
+++ b/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -0,0 +1,347 @@
+//===- FunctionAttrs.cpp - Pass which marks functions readnone or readonly ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simple interprocedural pass which walks the
+// call-graph, looking for functions which do not access or only read
+// non-local memory, and marking them readnone/readonly.  In addition,
+// it marks function arguments (of pointer type) 'nocapture' if a call
+// to the function does not create any copies of the pointer value that
+// outlive the call.  This more or less means that the pointer is only
+// dereferenced, and not returned from the function or stored in a global.
+// This pass is implemented as a bottom-up traversal of the call-graph.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "functionattrs"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/CallGraphSCCPass.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/UniqueVector.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/InstIterator.h"
+using namespace llvm;
+
+STATISTIC(NumReadNone, "Number of functions marked readnone");
+STATISTIC(NumReadOnly, "Number of functions marked readonly");
+STATISTIC(NumNoCapture, "Number of arguments marked nocapture");
+STATISTIC(NumNoAlias, "Number of function returns marked noalias");
+
+namespace {
+  struct VISIBILITY_HIDDEN FunctionAttrs : public CallGraphSCCPass {
+    static char ID; // Pass identification, replacement for typeid
+    FunctionAttrs() : CallGraphSCCPass(&ID) {}
+
+    // runOnSCC - Analyze the SCC, performing the transformation if possible.
+    bool runOnSCC(const std::vector<CallGraphNode *> &SCC);
+
+    // AddReadAttrs - Deduce readonly/readnone attributes for the SCC.
+    bool AddReadAttrs(const std::vector<CallGraphNode *> &SCC);
+
+    // AddNoCaptureAttrs - Deduce nocapture attributes for the SCC.
+    bool AddNoCaptureAttrs(const std::vector<CallGraphNode *> &SCC);
+
+    // IsFunctionMallocLike - Does this function allocate new memory?
+    bool IsFunctionMallocLike(Function *F,
+                              SmallPtrSet<CallGraphNode*, 8> &) const;
+
+    // AddNoAliasAttrs - Deduce noalias attributes for the SCC.
+    bool AddNoAliasAttrs(const std::vector<CallGraphNode *> &SCC);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      CallGraphSCCPass::getAnalysisUsage(AU);
+    }
+
+    bool PointsToLocalMemory(Value *V);
+  };
+}
+
+char FunctionAttrs::ID = 0;
+static RegisterPass<FunctionAttrs>
+X("functionattrs", "Deduce function attributes");
+
+Pass *llvm::createFunctionAttrsPass() { return new FunctionAttrs(); }
+
+
+/// PointsToLocalMemory - Returns whether the given pointer value points to
+/// memory that is local to the function.  Global constants are considered
+/// local to all functions.
+bool FunctionAttrs::PointsToLocalMemory(Value *V) {
+  V = V->getUnderlyingObject();
+  // An alloca instruction defines local memory.
+  if (isa<AllocaInst>(V))
+    return true;
+  // A global constant counts as local memory for our purposes.
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+    return GV->isConstant();
+  // Could look through phi nodes and selects here, but it doesn't seem
+  // to be useful in practice.
+  return false;
+}
+
+/// AddReadAttrs - Deduce readonly/readnone attributes for the SCC.
+bool FunctionAttrs::AddReadAttrs(const std::vector<CallGraphNode *> &SCC) {
+  SmallPtrSet<CallGraphNode*, 8> SCCNodes;
+  CallGraph &CG = getAnalysis<CallGraph>();
+
+  // Fill SCCNodes with the elements of the SCC.  Used for quickly
+  // looking up whether a given CallGraphNode is in this SCC.
+  for (unsigned i = 0, e = SCC.size(); i != e; ++i)
+    SCCNodes.insert(SCC[i]);
+
+  // Check if any of the functions in the SCC read or write memory.  If they
+  // write memory then they can't be marked readnone or readonly.
+  bool ReadsMemory = false;
+  for (unsigned i = 0, e = SCC.size(); i != e; ++i) {
+    Function *F = SCC[i]->getFunction();
+
+    if (F == 0)
+      // External node - may write memory.  Just give up.
+      return false;
+
+    if (F->doesNotAccessMemory())
+      // Already perfect!
+      continue;
+
+    // Definitions with weak linkage may be overridden at linktime with
+    // something that writes memory, so treat them like declarations.
+    if (F->isDeclaration() || F->mayBeOverridden()) {
+      if (!F->onlyReadsMemory())
+        // May write memory.  Just give up.
+        return false;
+
+      ReadsMemory = true;
+      continue;
+    }
+
+    // Scan the function body for instructions that may read or write memory.
+    for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) {
+      Instruction *I = &*II;
+
+      // Some instructions can be ignored even if they read or write memory.
+      // Detect these now, skipping to the next instruction if one is found.
+      CallSite CS = CallSite::get(I);
+      if (CS.getInstruction()) {
+        // Ignore calls to functions in the same SCC.
+        if (SCCNodes.count(CG[CS.getCalledFunction()]))
+          continue;
+      } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+        // Ignore loads from local memory.
+        if (PointsToLocalMemory(LI->getPointerOperand()))
+          continue;
+      } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+        // Ignore stores to local memory.
+        if (PointsToLocalMemory(SI->getPointerOperand()))
+          continue;
+      }
+
+      // Any remaining instructions need to be taken seriously!  Check if they
+      // read or write memory.
+      if (I->mayWriteToMemory())
+        // Writes memory.  Just give up.
+        return false;
+
+      if (isa<MallocInst>(I))
+        // MallocInst claims not to write memory!  PR3754.
+        return false;
+
+      // If this instruction may read memory, remember that.
+      ReadsMemory |= I->mayReadFromMemory();
+    }
+  }
+
+  // Success!  Functions in this SCC do not access memory, or only read memory.
+  // Give them the appropriate attribute.
+  bool MadeChange = false;
+  for (unsigned i = 0, e = SCC.size(); i != e; ++i) {
+    Function *F = SCC[i]->getFunction();
+
+    if (F->doesNotAccessMemory())
+      // Already perfect!
+      continue;
+
+    if (F->onlyReadsMemory() && ReadsMemory)
+      // No change.
+      continue;
+
+    MadeChange = true;
+
+    // Clear out any existing attributes.
+    F->removeAttribute(~0, Attribute::ReadOnly | Attribute::ReadNone);
+
+    // Add in the new attribute.
+    F->addAttribute(~0, ReadsMemory? Attribute::ReadOnly : Attribute::ReadNone);
+
+    if (ReadsMemory)
+      ++NumReadOnly;
+    else
+      ++NumReadNone;
+  }
+
+  return MadeChange;
+}
+
+/// AddNoCaptureAttrs - Deduce nocapture attributes for the SCC.
+bool FunctionAttrs::AddNoCaptureAttrs(const std::vector<CallGraphNode *> &SCC) {
+  bool Changed = false;
+
+  // Check each function in turn, determining which pointer arguments are not
+  // captured.
+  for (unsigned i = 0, e = SCC.size(); i != e; ++i) {
+    Function *F = SCC[i]->getFunction();
+
+    if (F == 0)
+      // External node - skip it;
+      continue;
+
+    // Definitions with weak linkage may be overridden at linktime with
+    // something that writes memory, so treat them like declarations.
+    if (F->isDeclaration() || F->mayBeOverridden())
+      continue;
+
+    for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A!=E; ++A)
+      if (isa<PointerType>(A->getType()) && !A->hasNoCaptureAttr() &&
+          !PointerMayBeCaptured(A, true)) {
+        A->addAttr(Attribute::NoCapture);
+        ++NumNoCapture;
+        Changed = true;
+      }
+  }
+
+  return Changed;
+}
+
+/// IsFunctionMallocLike - A function is malloc-like if it returns either null
+/// or a pointer that doesn't alias any other pointer visible to the caller.
+bool FunctionAttrs::IsFunctionMallocLike(Function *F,
+                              SmallPtrSet<CallGraphNode*, 8> &SCCNodes) const {
+  CallGraph &CG = getAnalysis<CallGraph>();
+
+  UniqueVector<Value *> FlowsToReturn;
+  for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I)
+    if (ReturnInst *Ret = dyn_cast<ReturnInst>(I->getTerminator()))
+      FlowsToReturn.insert(Ret->getReturnValue());
+
+  for (unsigned i = 0; i != FlowsToReturn.size(); ++i) {
+    Value *RetVal = FlowsToReturn[i+1];   // UniqueVector[0] is reserved.
+
+    if (Constant *C = dyn_cast<Constant>(RetVal)) {
+      if (!C->isNullValue() && !isa<UndefValue>(C))
+        return false;
+
+      continue;
+    }
+
+    if (isa<Argument>(RetVal))
+      return false;
+
+    if (Instruction *RVI = dyn_cast<Instruction>(RetVal))
+      switch (RVI->getOpcode()) {
+        // Extend the analysis by looking upwards.
+        case Instruction::GetElementPtr:
+        case Instruction::BitCast:
+          FlowsToReturn.insert(RVI->getOperand(0));
+          continue;
+        case Instruction::Select: {
+          SelectInst *SI = cast<SelectInst>(RVI);
+          FlowsToReturn.insert(SI->getTrueValue());
+          FlowsToReturn.insert(SI->getFalseValue());
+        } continue;
+        case Instruction::PHI: {
+          PHINode *PN = cast<PHINode>(RVI);
+          for (int i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+            FlowsToReturn.insert(PN->getIncomingValue(i));
+        } continue;
+
+        // Check whether the pointer came from an allocation.
+        case Instruction::Alloca:
+        case Instruction::Malloc:
+          break;
+        case Instruction::Call:
+        case Instruction::Invoke: {
+          CallSite CS(RVI);
+          if (CS.paramHasAttr(0, Attribute::NoAlias))
+            break;
+          if (CS.getCalledFunction() &&
+              SCCNodes.count(CG[CS.getCalledFunction()]))
+            break;
+        } // fall-through
+        default:
+          return false;  // Did not come from an allocation.
+      }
+
+    if (PointerMayBeCaptured(RetVal, false))
+      return false;
+  }
+
+  return true;
+}
+
+/// AddNoAliasAttrs - Deduce noalias attributes for the SCC.
+bool FunctionAttrs::AddNoAliasAttrs(const std::vector<CallGraphNode *> &SCC) {
+  SmallPtrSet<CallGraphNode*, 8> SCCNodes;
+
+  // Fill SCCNodes with the elements of the SCC.  Used for quickly
+  // looking up whether a given CallGraphNode is in this SCC.
+  for (unsigned i = 0, e = SCC.size(); i != e; ++i)
+    SCCNodes.insert(SCC[i]);
+
+  // Check each function in turn, determining which functions return noalias
+  // pointers.
+  for (unsigned i = 0, e = SCC.size(); i != e; ++i) {
+    Function *F = SCC[i]->getFunction();
+
+    if (F == 0)
+      // External node - skip it;
+      return false;
+
+    // Already noalias.
+    if (F->doesNotAlias(0))
+      continue;
+
+    // Definitions with weak linkage may be overridden at linktime, so
+    // treat them like declarations.
+    if (F->isDeclaration() || F->mayBeOverridden())
+      return false;
+
+    // We annotate noalias return values, which are only applicable to 
+    // pointer types.
+    if (!isa<PointerType>(F->getReturnType()))
+      continue;
+
+    if (!IsFunctionMallocLike(F, SCCNodes))
+      return false;
+  }
+
+  bool MadeChange = false;
+  for (unsigned i = 0, e = SCC.size(); i != e; ++i) {
+    Function *F = SCC[i]->getFunction();
+    if (F->doesNotAlias(0) || !isa<PointerType>(F->getReturnType()))
+      continue;
+
+    F->setDoesNotAlias(0);
+    ++NumNoAlias;
+    MadeChange = true;
+  }
+
+  return MadeChange;
+}
+
+bool FunctionAttrs::runOnSCC(const std::vector<CallGraphNode *> &SCC) {
+  bool Changed = AddReadAttrs(SCC);
+  Changed |= AddNoCaptureAttrs(SCC);
+  Changed |= AddNoAliasAttrs(SCC);
+  return Changed;
+}
diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp
new file mode 100644
index 0000000..db378b0
--- /dev/null
+++ b/lib/Transforms/IPO/GlobalDCE.cpp
@@ -0,0 +1,227 @@
+//===-- GlobalDCE.cpp - DCE unreachable internal functions ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transform is designed to eliminate unreachable internal globals from the
+// program.  It uses an aggressive algorithm, searching out globals that are
+// known to be alive.  After it finds all of the globals which are needed, it
+// deletes whatever is left over.  This allows it to delete recursive chunks of
+// the program which are unreachable.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "globaldce"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Constants.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumAliases  , "Number of global aliases removed");
+STATISTIC(NumFunctions, "Number of functions removed");
+STATISTIC(NumVariables, "Number of global variables removed");
+
+namespace {
+  struct VISIBILITY_HIDDEN GlobalDCE : public ModulePass {
+    static char ID; // Pass identification, replacement for typeid
+    GlobalDCE() : ModulePass(&ID) {}
+
+    // run - Do the GlobalDCE pass on the specified module, optionally updating
+    // the specified callgraph to reflect the changes.
+    //
+    bool runOnModule(Module &M);
+
+  private:
+    std::set<GlobalValue*> AliveGlobals;
+
+    /// GlobalIsNeeded - mark the specific global value as needed, and
+    /// recursively mark anything that it uses as also needed.
+    void GlobalIsNeeded(GlobalValue *GV);
+    void MarkUsedGlobalsAsNeeded(Constant *C);
+
+    bool SafeToDestroyConstant(Constant* C);
+    bool RemoveUnusedGlobalValue(GlobalValue &GV);
+  };
+}
+
+char GlobalDCE::ID = 0;
+static RegisterPass<GlobalDCE> X("globaldce", "Dead Global Elimination");
+
+ModulePass *llvm::createGlobalDCEPass() { return new GlobalDCE(); }
+
+bool GlobalDCE::runOnModule(Module &M) {
+  bool Changed = false;
+  // Loop over the module, adding globals which are obviously necessary.
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+    Changed |= RemoveUnusedGlobalValue(*I);
+    // Functions with external linkage are needed if they have a body
+    if (!I->hasLocalLinkage() && !I->hasLinkOnceLinkage() &&
+        !I->isDeclaration() && !I->hasAvailableExternallyLinkage())
+      GlobalIsNeeded(I);
+  }
+
+  for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ++I) {
+    Changed |= RemoveUnusedGlobalValue(*I);
+    // Externally visible & appending globals are needed, if they have an
+    // initializer.
+    if (!I->hasLocalLinkage() && !I->hasLinkOnceLinkage() &&
+        !I->isDeclaration() && !I->hasAvailableExternallyLinkage())
+      GlobalIsNeeded(I);
+  }
+
+  for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
+       I != E; ++I) {
+    Changed |= RemoveUnusedGlobalValue(*I);
+    // Externally visible aliases are needed.
+    if (!I->hasLocalLinkage() && !I->hasLinkOnceLinkage())
+      GlobalIsNeeded(I);
+  }
+
+  // Now that all globals which are needed are in the AliveGlobals set, we loop
+  // through the program, deleting those which are not alive.
+  //
+
+  // The first pass is to drop initializers of global variables which are dead.
+  std::vector<GlobalVariable*> DeadGlobalVars;   // Keep track of dead globals
+  for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E; ++I)
+    if (!AliveGlobals.count(I)) {
+      DeadGlobalVars.push_back(I);         // Keep track of dead globals
+      I->setInitializer(0);
+    }
+
+  // The second pass drops the bodies of functions which are dead...
+  std::vector<Function*> DeadFunctions;
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+    if (!AliveGlobals.count(I)) {
+      DeadFunctions.push_back(I);         // Keep track of dead globals
+      if (!I->isDeclaration())
+        I->deleteBody();
+    }
+
+  // The third pass drops targets of aliases which are dead...
+  std::vector<GlobalAlias*> DeadAliases;
+  for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); I != E;
+       ++I)
+    if (!AliveGlobals.count(I)) {
+      DeadAliases.push_back(I);
+      I->setAliasee(0);
+    }
+
+  if (!DeadFunctions.empty()) {
+    // Now that all interferences have been dropped, delete the actual objects
+    // themselves.
+    for (unsigned i = 0, e = DeadFunctions.size(); i != e; ++i) {
+      RemoveUnusedGlobalValue(*DeadFunctions[i]);
+      M.getFunctionList().erase(DeadFunctions[i]);
+    }
+    NumFunctions += DeadFunctions.size();
+    Changed = true;
+  }
+
+  if (!DeadGlobalVars.empty()) {
+    for (unsigned i = 0, e = DeadGlobalVars.size(); i != e; ++i) {
+      RemoveUnusedGlobalValue(*DeadGlobalVars[i]);
+      M.getGlobalList().erase(DeadGlobalVars[i]);
+    }
+    NumVariables += DeadGlobalVars.size();
+    Changed = true;
+  }
+
+  // Now delete any dead aliases.
+  if (!DeadAliases.empty()) {
+    for (unsigned i = 0, e = DeadAliases.size(); i != e; ++i) {
+      RemoveUnusedGlobalValue(*DeadAliases[i]);
+      M.getAliasList().erase(DeadAliases[i]);
+    }
+    NumAliases += DeadAliases.size();
+    Changed = true;
+  }
+
+  // Make sure that all memory is released
+  AliveGlobals.clear();
+  return Changed;
+}
+
+/// GlobalIsNeeded - the specific global value as needed, and
+/// recursively mark anything that it uses as also needed.
+void GlobalDCE::GlobalIsNeeded(GlobalValue *G) {
+  std::set<GlobalValue*>::iterator I = AliveGlobals.find(G);
+
+  // If the global is already in the set, no need to reprocess it.
+  if (I != AliveGlobals.end()) return;
+
+  // Otherwise insert it now, so we do not infinitely recurse
+  AliveGlobals.insert(I, G);
+
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(G)) {
+    // If this is a global variable, we must make sure to add any global values
+    // referenced by the initializer to the alive set.
+    if (GV->hasInitializer())
+      MarkUsedGlobalsAsNeeded(GV->getInitializer());
+  } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(G)) {
+    // The target of a global alias is needed.
+    MarkUsedGlobalsAsNeeded(GA->getAliasee());
+  } else {
+    // Otherwise this must be a function object.  We have to scan the body of
+    // the function looking for constants and global values which are used as
+    // operands.  Any operands of these types must be processed to ensure that
+    // any globals used will be marked as needed.
+    Function *F = cast<Function>(G);
+    // For all basic blocks...
+    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+      // For all instructions...
+      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+        // For all operands...
+        for (User::op_iterator U = I->op_begin(), E = I->op_end(); U != E; ++U)
+          if (GlobalValue *GV = dyn_cast<GlobalValue>(*U))
+            GlobalIsNeeded(GV);
+          else if (Constant *C = dyn_cast<Constant>(*U))
+            MarkUsedGlobalsAsNeeded(C);
+  }
+}
+
+void GlobalDCE::MarkUsedGlobalsAsNeeded(Constant *C) {
+  if (GlobalValue *GV = dyn_cast<GlobalValue>(C))
+    GlobalIsNeeded(GV);
+  else {
+    // Loop over all of the operands of the constant, adding any globals they
+    // use to the list of needed globals.
+    for (User::op_iterator I = C->op_begin(), E = C->op_end(); I != E; ++I)
+      MarkUsedGlobalsAsNeeded(cast<Constant>(*I));
+  }
+}
+
+// RemoveUnusedGlobalValue - Loop over all of the uses of the specified
+// GlobalValue, looking for the constant pointer ref that may be pointing to it.
+// If found, check to see if the constant pointer ref is safe to destroy, and if
+// so, nuke it.  This will reduce the reference count on the global value, which
+// might make it deader.
+//
+bool GlobalDCE::RemoveUnusedGlobalValue(GlobalValue &GV) {
+  if (GV.use_empty()) return false;
+  GV.removeDeadConstantUsers();
+  return GV.use_empty();
+}
+
+// SafeToDestroyConstant - It is safe to destroy a constant iff it is only used
+// by constants itself.  Note that constants cannot be cyclic, so this test is
+// pretty easy to implement recursively.
+//
+bool GlobalDCE::SafeToDestroyConstant(Constant *C) {
+  for (Value::use_iterator I = C->use_begin(), E = C->use_end(); I != E; ++I)
+    if (Constant *User = dyn_cast<Constant>(*I)) {
+      if (!SafeToDestroyConstant(User)) return false;
+    } else {
+      return false;
+    }
+  return true;
+}
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
new file mode 100644
index 0000000..2c01cc3
--- /dev/null
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -0,0 +1,2485 @@
+//===- GlobalOpt.cpp - Optimize Global Variables --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass transforms simple global variables that never have their address
+// taken.  If obviously true, it marks read/write globals as constant, deletes
+// variables only stored to, etc.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "globalopt"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/STLExtras.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumMarked    , "Number of globals marked constant");
+STATISTIC(NumSRA       , "Number of aggregate globals broken into scalars");
+STATISTIC(NumHeapSRA   , "Number of heap objects SRA'd");
+STATISTIC(NumSubstitute,"Number of globals with initializers stored into them");
+STATISTIC(NumDeleted   , "Number of globals deleted");
+STATISTIC(NumFnDeleted , "Number of functions deleted");
+STATISTIC(NumGlobUses  , "Number of global uses devirtualized");
+STATISTIC(NumLocalized , "Number of globals localized");
+STATISTIC(NumShrunkToBool  , "Number of global vars shrunk to booleans");
+STATISTIC(NumFastCallFns   , "Number of functions converted to fastcc");
+STATISTIC(NumCtorsEvaluated, "Number of static ctors evaluated");
+STATISTIC(NumNestRemoved   , "Number of nest attributes removed");
+STATISTIC(NumAliasesResolved, "Number of global aliases resolved");
+STATISTIC(NumAliasesRemoved, "Number of global aliases eliminated");
+
+namespace {
+  struct VISIBILITY_HIDDEN GlobalOpt : public ModulePass {
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<TargetData>();
+    }
+    static char ID; // Pass identification, replacement for typeid
+    GlobalOpt() : ModulePass(&ID) {}
+
+    bool runOnModule(Module &M);
+
+  private:
+    GlobalVariable *FindGlobalCtors(Module &M);
+    bool OptimizeFunctions(Module &M);
+    bool OptimizeGlobalVars(Module &M);
+    bool OptimizeGlobalAliases(Module &M);
+    bool OptimizeGlobalCtorsList(GlobalVariable *&GCL);
+    bool ProcessInternalGlobal(GlobalVariable *GV,Module::global_iterator &GVI);
+  };
+}
+
+char GlobalOpt::ID = 0;
+static RegisterPass<GlobalOpt> X("globalopt", "Global Variable Optimizer");
+
+ModulePass *llvm::createGlobalOptimizerPass() { return new GlobalOpt(); }
+
+namespace {
+
+/// GlobalStatus - As we analyze each global, keep track of some information
+/// about it.  If we find out that the address of the global is taken, none of
+/// this info will be accurate.
+struct VISIBILITY_HIDDEN GlobalStatus {
+  /// isLoaded - True if the global is ever loaded.  If the global isn't ever
+  /// loaded it can be deleted.
+  bool isLoaded;
+
+  /// StoredType - Keep track of what stores to the global look like.
+  ///
+  enum StoredType {
+    /// NotStored - There is no store to this global.  It can thus be marked
+    /// constant.
+    NotStored,
+
+    /// isInitializerStored - This global is stored to, but the only thing
+    /// stored is the constant it was initialized with.  This is only tracked
+    /// for scalar globals.
+    isInitializerStored,
+
+    /// isStoredOnce - This global is stored to, but only its initializer and
+    /// one other value is ever stored to it.  If this global isStoredOnce, we
+    /// track the value stored to it in StoredOnceValue below.  This is only
+    /// tracked for scalar globals.
+    isStoredOnce,
+
+    /// isStored - This global is stored to by multiple values or something else
+    /// that we cannot track.
+    isStored
+  } StoredType;
+
+  /// StoredOnceValue - If only one value (besides the initializer constant) is
+  /// ever stored to this global, keep track of what value it is.
+  Value *StoredOnceValue;
+
+  /// AccessingFunction/HasMultipleAccessingFunctions - These start out
+  /// null/false.  When the first accessing function is noticed, it is recorded.
+  /// When a second different accessing function is noticed,
+  /// HasMultipleAccessingFunctions is set to true.
+  Function *AccessingFunction;
+  bool HasMultipleAccessingFunctions;
+
+  /// HasNonInstructionUser - Set to true if this global has a user that is not
+  /// an instruction (e.g. a constant expr or GV initializer).
+  bool HasNonInstructionUser;
+
+  /// HasPHIUser - Set to true if this global has a user that is a PHI node.
+  bool HasPHIUser;
+  
+  GlobalStatus() : isLoaded(false), StoredType(NotStored), StoredOnceValue(0),
+                   AccessingFunction(0), HasMultipleAccessingFunctions(false),
+                   HasNonInstructionUser(false), HasPHIUser(false) {}
+};
+
+}
+
+/// ConstantIsDead - Return true if the specified constant is (transitively)
+/// dead.  The constant may be used by other constants (e.g. constant arrays and
+/// constant exprs) as long as they are dead, but it cannot be used by anything
+/// else.
+static bool ConstantIsDead(Constant *C) {
+  if (isa<GlobalValue>(C)) return false;
+
+  for (Value::use_iterator UI = C->use_begin(), E = C->use_end(); UI != E; ++UI)
+    if (Constant *CU = dyn_cast<Constant>(*UI)) {
+      if (!ConstantIsDead(CU)) return false;
+    } else
+      return false;
+  return true;
+}
+
+
+/// AnalyzeGlobal - Look at all uses of the global and fill in the GlobalStatus
+/// structure.  If the global has its address taken, return true to indicate we
+/// can't do anything with it.
+///
+static bool AnalyzeGlobal(Value *V, GlobalStatus &GS,
+                          SmallPtrSet<PHINode*, 16> &PHIUsers) {
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; ++UI)
+    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(*UI)) {
+      GS.HasNonInstructionUser = true;
+
+      if (AnalyzeGlobal(CE, GS, PHIUsers)) return true;
+
+    } else if (Instruction *I = dyn_cast<Instruction>(*UI)) {
+      if (!GS.HasMultipleAccessingFunctions) {
+        Function *F = I->getParent()->getParent();
+        if (GS.AccessingFunction == 0)
+          GS.AccessingFunction = F;
+        else if (GS.AccessingFunction != F)
+          GS.HasMultipleAccessingFunctions = true;
+      }
+      if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+        GS.isLoaded = true;
+        if (LI->isVolatile()) return true;  // Don't hack on volatile loads.
+      } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+        // Don't allow a store OF the address, only stores TO the address.
+        if (SI->getOperand(0) == V) return true;
+
+        if (SI->isVolatile()) return true;  // Don't hack on volatile stores.
+
+        // If this is a direct store to the global (i.e., the global is a scalar
+        // value, not an aggregate), keep more specific information about
+        // stores.
+        if (GS.StoredType != GlobalStatus::isStored) {
+          if (GlobalVariable *GV = dyn_cast<GlobalVariable>(SI->getOperand(1))){
+            Value *StoredVal = SI->getOperand(0);
+            if (StoredVal == GV->getInitializer()) {
+              if (GS.StoredType < GlobalStatus::isInitializerStored)
+                GS.StoredType = GlobalStatus::isInitializerStored;
+            } else if (isa<LoadInst>(StoredVal) &&
+                       cast<LoadInst>(StoredVal)->getOperand(0) == GV) {
+              // G = G
+              if (GS.StoredType < GlobalStatus::isInitializerStored)
+                GS.StoredType = GlobalStatus::isInitializerStored;
+            } else if (GS.StoredType < GlobalStatus::isStoredOnce) {
+              GS.StoredType = GlobalStatus::isStoredOnce;
+              GS.StoredOnceValue = StoredVal;
+            } else if (GS.StoredType == GlobalStatus::isStoredOnce &&
+                       GS.StoredOnceValue == StoredVal) {
+              // noop.
+            } else {
+              GS.StoredType = GlobalStatus::isStored;
+            }
+          } else {
+            GS.StoredType = GlobalStatus::isStored;
+          }
+        }
+      } else if (isa<GetElementPtrInst>(I)) {
+        if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
+      } else if (isa<SelectInst>(I)) {
+        if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
+      } else if (PHINode *PN = dyn_cast<PHINode>(I)) {
+        // PHI nodes we can check just like select or GEP instructions, but we
+        // have to be careful about infinite recursion.
+        if (PHIUsers.insert(PN))  // Not already visited.
+          if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
+        GS.HasPHIUser = true;
+      } else if (isa<CmpInst>(I)) {
+      } else if (isa<MemTransferInst>(I)) {
+        if (I->getOperand(1) == V)
+          GS.StoredType = GlobalStatus::isStored;
+        if (I->getOperand(2) == V)
+          GS.isLoaded = true;
+      } else if (isa<MemSetInst>(I)) {
+        assert(I->getOperand(1) == V && "Memset only takes one pointer!");
+        GS.StoredType = GlobalStatus::isStored;
+      } else {
+        return true;  // Any other non-load instruction might take address!
+      }
+    } else if (Constant *C = dyn_cast<Constant>(*UI)) {
+      GS.HasNonInstructionUser = true;
+      // We might have a dead and dangling constant hanging off of here.
+      if (!ConstantIsDead(C))
+        return true;
+    } else {
+      GS.HasNonInstructionUser = true;
+      // Otherwise must be some other user.
+      return true;
+    }
+
+  return false;
+}
+
+static Constant *getAggregateConstantElement(Constant *Agg, Constant *Idx) {
+  ConstantInt *CI = dyn_cast<ConstantInt>(Idx);
+  if (!CI) return 0;
+  unsigned IdxV = CI->getZExtValue();
+
+  if (ConstantStruct *CS = dyn_cast<ConstantStruct>(Agg)) {
+    if (IdxV < CS->getNumOperands()) return CS->getOperand(IdxV);
+  } else if (ConstantArray *CA = dyn_cast<ConstantArray>(Agg)) {
+    if (IdxV < CA->getNumOperands()) return CA->getOperand(IdxV);
+  } else if (ConstantVector *CP = dyn_cast<ConstantVector>(Agg)) {
+    if (IdxV < CP->getNumOperands()) return CP->getOperand(IdxV);
+  } else if (isa<ConstantAggregateZero>(Agg)) {
+    if (const StructType *STy = dyn_cast<StructType>(Agg->getType())) {
+      if (IdxV < STy->getNumElements())
+        return Constant::getNullValue(STy->getElementType(IdxV));
+    } else if (const SequentialType *STy =
+               dyn_cast<SequentialType>(Agg->getType())) {
+      return Constant::getNullValue(STy->getElementType());
+    }
+  } else if (isa<UndefValue>(Agg)) {
+    if (const StructType *STy = dyn_cast<StructType>(Agg->getType())) {
+      if (IdxV < STy->getNumElements())
+        return UndefValue::get(STy->getElementType(IdxV));
+    } else if (const SequentialType *STy =
+               dyn_cast<SequentialType>(Agg->getType())) {
+      return UndefValue::get(STy->getElementType());
+    }
+  }
+  return 0;
+}
+
+
+/// CleanupConstantGlobalUsers - We just marked GV constant.  Loop over all
+/// users of the global, cleaning up the obvious ones.  This is largely just a
+/// quick scan over the use list to clean up the easy and obvious cruft.  This
+/// returns true if it made a change.
+static bool CleanupConstantGlobalUsers(Value *V, Constant *Init) {
+  bool Changed = false;
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;) {
+    User *U = *UI++;
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+      if (Init) {
+        // Replace the load with the initializer.
+        LI->replaceAllUsesWith(Init);
+        LI->eraseFromParent();
+        Changed = true;
+      }
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+      // Store must be unreachable or storing Init into the global.
+      SI->eraseFromParent();
+      Changed = true;
+    } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
+      if (CE->getOpcode() == Instruction::GetElementPtr) {
+        Constant *SubInit = 0;
+        if (Init)
+          SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE);
+        Changed |= CleanupConstantGlobalUsers(CE, SubInit);
+      } else if (CE->getOpcode() == Instruction::BitCast && 
+                 isa<PointerType>(CE->getType())) {
+        // Pointer cast, delete any stores and memsets to the global.
+        Changed |= CleanupConstantGlobalUsers(CE, 0);
+      }
+
+      if (CE->use_empty()) {
+        CE->destroyConstant();
+        Changed = true;
+      }
+    } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
+      // Do not transform "gepinst (gep constexpr (GV))" here, because forming
+      // "gepconstexpr (gep constexpr (GV))" will cause the two gep's to fold
+      // and will invalidate our notion of what Init is.
+      Constant *SubInit = 0;
+      if (!isa<ConstantExpr>(GEP->getOperand(0))) {
+        ConstantExpr *CE = 
+          dyn_cast_or_null<ConstantExpr>(ConstantFoldInstruction(GEP));
+        if (Init && CE && CE->getOpcode() == Instruction::GetElementPtr)
+          SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE);
+      }
+      Changed |= CleanupConstantGlobalUsers(GEP, SubInit);
+
+      if (GEP->use_empty()) {
+        GEP->eraseFromParent();
+        Changed = true;
+      }
+    } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U)) { // memset/cpy/mv
+      if (MI->getRawDest() == V) {
+        MI->eraseFromParent();
+        Changed = true;
+      }
+
+    } else if (Constant *C = dyn_cast<Constant>(U)) {
+      // If we have a chain of dead constantexprs or other things dangling from
+      // us, and if they are all dead, nuke them without remorse.
+      if (ConstantIsDead(C)) {
+        C->destroyConstant();
+        // This could have invalidated UI, start over from scratch.
+        CleanupConstantGlobalUsers(V, Init);
+        return true;
+      }
+    }
+  }
+  return Changed;
+}
+
+/// isSafeSROAElementUse - Return true if the specified instruction is a safe
+/// user of a derived expression from a global that we want to SROA.
+static bool isSafeSROAElementUse(Value *V) {
+  // We might have a dead and dangling constant hanging off of here.
+  if (Constant *C = dyn_cast<Constant>(V))
+    return ConstantIsDead(C);
+  
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) return false;
+
+  // Loads are ok.
+  if (isa<LoadInst>(I)) return true;
+
+  // Stores *to* the pointer are ok.
+  if (StoreInst *SI = dyn_cast<StoreInst>(I))
+    return SI->getOperand(0) != V;
+    
+  // Otherwise, it must be a GEP.
+  GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I);
+  if (GEPI == 0) return false;
+  
+  if (GEPI->getNumOperands() < 3 || !isa<Constant>(GEPI->getOperand(1)) ||
+      !cast<Constant>(GEPI->getOperand(1))->isNullValue())
+    return false;
+  
+  for (Value::use_iterator I = GEPI->use_begin(), E = GEPI->use_end();
+       I != E; ++I)
+    if (!isSafeSROAElementUse(*I))
+      return false;
+  return true;
+}
+
+
+/// IsUserOfGlobalSafeForSRA - U is a direct user of the specified global value.
+/// Look at it and its uses and decide whether it is safe to SROA this global.
+///
+static bool IsUserOfGlobalSafeForSRA(User *U, GlobalValue *GV) {
+  // The user of the global must be a GEP Inst or a ConstantExpr GEP.
+  if (!isa<GetElementPtrInst>(U) && 
+      (!isa<ConstantExpr>(U) || 
+       cast<ConstantExpr>(U)->getOpcode() != Instruction::GetElementPtr))
+    return false;
+  
+  // Check to see if this ConstantExpr GEP is SRA'able.  In particular, we
+  // don't like < 3 operand CE's, and we don't like non-constant integer
+  // indices.  This enforces that all uses are 'gep GV, 0, C, ...' for some
+  // value of C.
+  if (U->getNumOperands() < 3 || !isa<Constant>(U->getOperand(1)) ||
+      !cast<Constant>(U->getOperand(1))->isNullValue() ||
+      !isa<ConstantInt>(U->getOperand(2)))
+    return false;
+
+  gep_type_iterator GEPI = gep_type_begin(U), E = gep_type_end(U);
+  ++GEPI;  // Skip over the pointer index.
+  
+  // If this is a use of an array allocation, do a bit more checking for sanity.
+  if (const ArrayType *AT = dyn_cast<ArrayType>(*GEPI)) {
+    uint64_t NumElements = AT->getNumElements();
+    ConstantInt *Idx = cast<ConstantInt>(U->getOperand(2));
+    
+    // Check to make sure that index falls within the array.  If not,
+    // something funny is going on, so we won't do the optimization.
+    //
+    if (Idx->getZExtValue() >= NumElements)
+      return false;
+      
+    // We cannot scalar repl this level of the array unless any array
+    // sub-indices are in-range constants.  In particular, consider:
+    // A[0][i].  We cannot know that the user isn't doing invalid things like
+    // allowing i to index an out-of-range subscript that accesses A[1].
+    //
+    // Scalar replacing *just* the outer index of the array is probably not
+    // going to be a win anyway, so just give up.
+    for (++GEPI; // Skip array index.
+         GEPI != E && (isa<ArrayType>(*GEPI) || isa<VectorType>(*GEPI));
+         ++GEPI) {
+      uint64_t NumElements;
+      if (const ArrayType *SubArrayTy = dyn_cast<ArrayType>(*GEPI))
+        NumElements = SubArrayTy->getNumElements();
+      else
+        NumElements = cast<VectorType>(*GEPI)->getNumElements();
+      
+      ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPI.getOperand());
+      if (!IdxVal || IdxVal->getZExtValue() >= NumElements)
+        return false;
+    }
+  }
+
+  for (Value::use_iterator I = U->use_begin(), E = U->use_end(); I != E; ++I)
+    if (!isSafeSROAElementUse(*I))
+      return false;
+  return true;
+}
+
+/// GlobalUsersSafeToSRA - Look at all uses of the global and decide whether it
+/// is safe for us to perform this transformation.
+///
+static bool GlobalUsersSafeToSRA(GlobalValue *GV) {
+  for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end();
+       UI != E; ++UI) {
+    if (!IsUserOfGlobalSafeForSRA(*UI, GV))
+      return false;
+  }
+  return true;
+}
+ 
+
+/// SRAGlobal - Perform scalar replacement of aggregates on the specified global
+/// variable.  This opens the door for other optimizations by exposing the
+/// behavior of the program in a more fine-grained way.  We have determined that
+/// this transformation is safe already.  We return the first global variable we
+/// insert so that the caller can reprocess it.
+static GlobalVariable *SRAGlobal(GlobalVariable *GV, const TargetData &TD) {
+  // Make sure this global only has simple uses that we can SRA.
+  if (!GlobalUsersSafeToSRA(GV))
+    return 0;
+  
+  assert(GV->hasLocalLinkage() && !GV->isConstant());
+  Constant *Init = GV->getInitializer();
+  const Type *Ty = Init->getType();
+
+  std::vector<GlobalVariable*> NewGlobals;
+  Module::GlobalListType &Globals = GV->getParent()->getGlobalList();
+
+  // Get the alignment of the global, either explicit or target-specific.
+  unsigned StartAlignment = GV->getAlignment();
+  if (StartAlignment == 0)
+    StartAlignment = TD.getABITypeAlignment(GV->getType());
+   
+  if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+    NewGlobals.reserve(STy->getNumElements());
+    const StructLayout &Layout = *TD.getStructLayout(STy);
+    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+      Constant *In = getAggregateConstantElement(Init,
+                                            ConstantInt::get(Type::Int32Ty, i));
+      assert(In && "Couldn't get element of initializer?");
+      GlobalVariable *NGV = new GlobalVariable(STy->getElementType(i), false,
+                                               GlobalVariable::InternalLinkage,
+                                               In, GV->getName()+"."+utostr(i),
+                                               (Module *)NULL,
+                                               GV->isThreadLocal(),
+                                               GV->getType()->getAddressSpace());
+      Globals.insert(GV, NGV);
+      NewGlobals.push_back(NGV);
+      
+      // Calculate the known alignment of the field.  If the original aggregate
+      // had 256 byte alignment for example, something might depend on that:
+      // propagate info to each field.
+      uint64_t FieldOffset = Layout.getElementOffset(i);
+      unsigned NewAlign = (unsigned)MinAlign(StartAlignment, FieldOffset);
+      if (NewAlign > TD.getABITypeAlignment(STy->getElementType(i)))
+        NGV->setAlignment(NewAlign);
+    }
+  } else if (const SequentialType *STy = dyn_cast<SequentialType>(Ty)) {
+    unsigned NumElements = 0;
+    if (const ArrayType *ATy = dyn_cast<ArrayType>(STy))
+      NumElements = ATy->getNumElements();
+    else
+      NumElements = cast<VectorType>(STy)->getNumElements();
+
+    if (NumElements > 16 && GV->hasNUsesOrMore(16))
+      return 0; // It's not worth it.
+    NewGlobals.reserve(NumElements);
+    
+    uint64_t EltSize = TD.getTypeAllocSize(STy->getElementType());
+    unsigned EltAlign = TD.getABITypeAlignment(STy->getElementType());
+    for (unsigned i = 0, e = NumElements; i != e; ++i) {
+      Constant *In = getAggregateConstantElement(Init,
+                                            ConstantInt::get(Type::Int32Ty, i));
+      assert(In && "Couldn't get element of initializer?");
+
+      GlobalVariable *NGV = new GlobalVariable(STy->getElementType(), false,
+                                               GlobalVariable::InternalLinkage,
+                                               In, GV->getName()+"."+utostr(i),
+                                               (Module *)NULL,
+                                               GV->isThreadLocal(),
+                                               GV->getType()->getAddressSpace());
+      Globals.insert(GV, NGV);
+      NewGlobals.push_back(NGV);
+      
+      // Calculate the known alignment of the field.  If the original aggregate
+      // had 256 byte alignment for example, something might depend on that:
+      // propagate info to each field.
+      unsigned NewAlign = (unsigned)MinAlign(StartAlignment, EltSize*i);
+      if (NewAlign > EltAlign)
+        NGV->setAlignment(NewAlign);
+    }
+  }
+
+  if (NewGlobals.empty())
+    return 0;
+
+  DOUT << "PERFORMING GLOBAL SRA ON: " << *GV;
+
+  Constant *NullInt = Constant::getNullValue(Type::Int32Ty);
+
+  // Loop over all of the uses of the global, replacing the constantexpr geps,
+  // with smaller constantexpr geps or direct references.
+  while (!GV->use_empty()) {
+    User *GEP = GV->use_back();
+    assert(((isa<ConstantExpr>(GEP) &&
+             cast<ConstantExpr>(GEP)->getOpcode()==Instruction::GetElementPtr)||
+            isa<GetElementPtrInst>(GEP)) && "NonGEP CE's are not SRAable!");
+
+    // Ignore the 1th operand, which has to be zero or else the program is quite
+    // broken (undefined).  Get the 2nd operand, which is the structure or array
+    // index.
+    unsigned Val = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue();
+    if (Val >= NewGlobals.size()) Val = 0; // Out of bound array access.
+
+    Value *NewPtr = NewGlobals[Val];
+
+    // Form a shorter GEP if needed.
+    if (GEP->getNumOperands() > 3) {
+      if (ConstantExpr *CE = dyn_cast<ConstantExpr>(GEP)) {
+        SmallVector<Constant*, 8> Idxs;
+        Idxs.push_back(NullInt);
+        for (unsigned i = 3, e = CE->getNumOperands(); i != e; ++i)
+          Idxs.push_back(CE->getOperand(i));
+        NewPtr = ConstantExpr::getGetElementPtr(cast<Constant>(NewPtr),
+                                                &Idxs[0], Idxs.size());
+      } else {
+        GetElementPtrInst *GEPI = cast<GetElementPtrInst>(GEP);
+        SmallVector<Value*, 8> Idxs;
+        Idxs.push_back(NullInt);
+        for (unsigned i = 3, e = GEPI->getNumOperands(); i != e; ++i)
+          Idxs.push_back(GEPI->getOperand(i));
+        NewPtr = GetElementPtrInst::Create(NewPtr, Idxs.begin(), Idxs.end(),
+                                           GEPI->getName()+"."+utostr(Val), GEPI);
+      }
+    }
+    GEP->replaceAllUsesWith(NewPtr);
+
+    if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(GEP))
+      GEPI->eraseFromParent();
+    else
+      cast<ConstantExpr>(GEP)->destroyConstant();
+  }
+
+  // Delete the old global, now that it is dead.
+  Globals.erase(GV);
+  ++NumSRA;
+
+  // Loop over the new globals array deleting any globals that are obviously
+  // dead.  This can arise due to scalarization of a structure or an array that
+  // has elements that are dead.
+  unsigned FirstGlobal = 0;
+  for (unsigned i = 0, e = NewGlobals.size(); i != e; ++i)
+    if (NewGlobals[i]->use_empty()) {
+      Globals.erase(NewGlobals[i]);
+      if (FirstGlobal == i) ++FirstGlobal;
+    }
+
+  return FirstGlobal != NewGlobals.size() ? NewGlobals[FirstGlobal] : 0;
+}
+
+/// AllUsesOfValueWillTrapIfNull - Return true if all users of the specified
+/// value will trap if the value is dynamically null.  PHIs keeps track of any 
+/// phi nodes we've seen to avoid reprocessing them.
+static bool AllUsesOfValueWillTrapIfNull(Value *V,
+                                         SmallPtrSet<PHINode*, 8> &PHIs) {
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; ++UI)
+    if (isa<LoadInst>(*UI)) {
+      // Will trap.
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(*UI)) {
+      if (SI->getOperand(0) == V) {
+        //cerr << "NONTRAPPING USE: " << **UI;
+        return false;  // Storing the value.
+      }
+    } else if (CallInst *CI = dyn_cast<CallInst>(*UI)) {
+      if (CI->getOperand(0) != V) {
+        //cerr << "NONTRAPPING USE: " << **UI;
+        return false;  // Not calling the ptr
+      }
+    } else if (InvokeInst *II = dyn_cast<InvokeInst>(*UI)) {
+      if (II->getOperand(0) != V) {
+        //cerr << "NONTRAPPING USE: " << **UI;
+        return false;  // Not calling the ptr
+      }
+    } else if (BitCastInst *CI = dyn_cast<BitCastInst>(*UI)) {
+      if (!AllUsesOfValueWillTrapIfNull(CI, PHIs)) return false;
+    } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(*UI)) {
+      if (!AllUsesOfValueWillTrapIfNull(GEPI, PHIs)) return false;
+    } else if (PHINode *PN = dyn_cast<PHINode>(*UI)) {
+      // If we've already seen this phi node, ignore it, it has already been
+      // checked.
+      if (PHIs.insert(PN))
+        return AllUsesOfValueWillTrapIfNull(PN, PHIs);
+    } else if (isa<ICmpInst>(*UI) &&
+               isa<ConstantPointerNull>(UI->getOperand(1))) {
+      // Ignore setcc X, null
+    } else {
+      //cerr << "NONTRAPPING USE: " << **UI;
+      return false;
+    }
+  return true;
+}
+
+/// AllUsesOfLoadedValueWillTrapIfNull - Return true if all uses of any loads
+/// from GV will trap if the loaded value is null.  Note that this also permits
+/// comparisons of the loaded value against null, as a special case.
+static bool AllUsesOfLoadedValueWillTrapIfNull(GlobalVariable *GV) {
+  for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end(); UI!=E; ++UI)
+    if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) {
+      SmallPtrSet<PHINode*, 8> PHIs;
+      if (!AllUsesOfValueWillTrapIfNull(LI, PHIs))
+        return false;
+    } else if (isa<StoreInst>(*UI)) {
+      // Ignore stores to the global.
+    } else {
+      // We don't know or understand this user, bail out.
+      //cerr << "UNKNOWN USER OF GLOBAL!: " << **UI;
+      return false;
+    }
+
+  return true;
+}
+
+static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) {
+  bool Changed = false;
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; ) {
+    Instruction *I = cast<Instruction>(*UI++);
+    if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+      LI->setOperand(0, NewV);
+      Changed = true;
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+      if (SI->getOperand(1) == V) {
+        SI->setOperand(1, NewV);
+        Changed = true;
+      }
+    } else if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
+      if (I->getOperand(0) == V) {
+        // Calling through the pointer!  Turn into a direct call, but be careful
+        // that the pointer is not also being passed as an argument.
+        I->setOperand(0, NewV);
+        Changed = true;
+        bool PassedAsArg = false;
+        for (unsigned i = 1, e = I->getNumOperands(); i != e; ++i)
+          if (I->getOperand(i) == V) {
+            PassedAsArg = true;
+            I->setOperand(i, NewV);
+          }
+
+        if (PassedAsArg) {
+          // Being passed as an argument also.  Be careful to not invalidate UI!
+          UI = V->use_begin();
+        }
+      }
+    } else if (CastInst *CI = dyn_cast<CastInst>(I)) {
+      Changed |= OptimizeAwayTrappingUsesOfValue(CI,
+                                ConstantExpr::getCast(CI->getOpcode(),
+                                                      NewV, CI->getType()));
+      if (CI->use_empty()) {
+        Changed = true;
+        CI->eraseFromParent();
+      }
+    } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
+      // Should handle GEP here.
+      SmallVector<Constant*, 8> Idxs;
+      Idxs.reserve(GEPI->getNumOperands()-1);
+      for (User::op_iterator i = GEPI->op_begin() + 1, e = GEPI->op_end();
+           i != e; ++i)
+        if (Constant *C = dyn_cast<Constant>(*i))
+          Idxs.push_back(C);
+        else
+          break;
+      if (Idxs.size() == GEPI->getNumOperands()-1)
+        Changed |= OptimizeAwayTrappingUsesOfValue(GEPI,
+                                ConstantExpr::getGetElementPtr(NewV, &Idxs[0],
+                                                               Idxs.size()));
+      if (GEPI->use_empty()) {
+        Changed = true;
+        GEPI->eraseFromParent();
+      }
+    }
+  }
+
+  return Changed;
+}
+
+
+/// OptimizeAwayTrappingUsesOfLoads - The specified global has only one non-null
+/// value stored into it.  If there are uses of the loaded value that would trap
+/// if the loaded value is dynamically null, then we know that they cannot be
+/// reachable with a null optimize away the load.
+static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV) {
+  bool Changed = false;
+
+  // Keep track of whether we are able to remove all the uses of the global
+  // other than the store that defines it.
+  bool AllNonStoreUsesGone = true;
+  
+  // Replace all uses of loads with uses of uses of the stored value.
+  for (Value::use_iterator GUI = GV->use_begin(), E = GV->use_end(); GUI != E;){
+    User *GlobalUser = *GUI++;
+    if (LoadInst *LI = dyn_cast<LoadInst>(GlobalUser)) {
+      Changed |= OptimizeAwayTrappingUsesOfValue(LI, LV);
+      // If we were able to delete all uses of the loads
+      if (LI->use_empty()) {
+        LI->eraseFromParent();
+        Changed = true;
+      } else {
+        AllNonStoreUsesGone = false;
+      }
+    } else if (isa<StoreInst>(GlobalUser)) {
+      // Ignore the store that stores "LV" to the global.
+      assert(GlobalUser->getOperand(1) == GV &&
+             "Must be storing *to* the global");
+    } else {
+      AllNonStoreUsesGone = false;
+
+      // If we get here we could have other crazy uses that are transitively
+      // loaded.
+      assert((isa<PHINode>(GlobalUser) || isa<SelectInst>(GlobalUser) ||
+              isa<ConstantExpr>(GlobalUser)) && "Only expect load and stores!");
+    }
+  }
+
+  if (Changed) {
+    DOUT << "OPTIMIZED LOADS FROM STORED ONCE POINTER: " << *GV;
+    ++NumGlobUses;
+  }
+
+  // If we nuked all of the loads, then none of the stores are needed either,
+  // nor is the global.
+  if (AllNonStoreUsesGone) {
+    DOUT << "  *** GLOBAL NOW DEAD!\n";
+    CleanupConstantGlobalUsers(GV, 0);
+    if (GV->use_empty()) {
+      GV->eraseFromParent();
+      ++NumDeleted;
+    }
+    Changed = true;
+  }
+  return Changed;
+}
+
+/// ConstantPropUsersOf - Walk the use list of V, constant folding all of the
+/// instructions that are foldable.
+static void ConstantPropUsersOf(Value *V) {
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; )
+    if (Instruction *I = dyn_cast<Instruction>(*UI++))
+      if (Constant *NewC = ConstantFoldInstruction(I)) {
+        I->replaceAllUsesWith(NewC);
+
+        // Advance UI to the next non-I use to avoid invalidating it!
+        // Instructions could multiply use V.
+        while (UI != E && *UI == I)
+          ++UI;
+        I->eraseFromParent();
+      }
+}
+
+/// OptimizeGlobalAddressOfMalloc - This function takes the specified global
+/// variable, and transforms the program as if it always contained the result of
+/// the specified malloc.  Because it is always the result of the specified
+/// malloc, there is no reason to actually DO the malloc.  Instead, turn the
+/// malloc into a global, and any loads of GV as uses of the new global.
+static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV,
+                                                     MallocInst *MI) {
+  DOUT << "PROMOTING MALLOC GLOBAL: " << *GV << "  MALLOC = " << *MI;
+  ConstantInt *NElements = cast<ConstantInt>(MI->getArraySize());
+
+  if (NElements->getZExtValue() != 1) {
+    // If we have an array allocation, transform it to a single element
+    // allocation to make the code below simpler.
+    Type *NewTy = ArrayType::get(MI->getAllocatedType(),
+                                 NElements->getZExtValue());
+    MallocInst *NewMI =
+      new MallocInst(NewTy, Constant::getNullValue(Type::Int32Ty),
+                     MI->getAlignment(), MI->getName(), MI);
+    Value* Indices[2];
+    Indices[0] = Indices[1] = Constant::getNullValue(Type::Int32Ty);
+    Value *NewGEP = GetElementPtrInst::Create(NewMI, Indices, Indices + 2,
+                                              NewMI->getName()+".el0", MI);
+    MI->replaceAllUsesWith(NewGEP);
+    MI->eraseFromParent();
+    MI = NewMI;
+  }
+
+  // Create the new global variable.  The contents of the malloc'd memory is
+  // undefined, so initialize with an undef value.
+  Constant *Init = UndefValue::get(MI->getAllocatedType());
+  GlobalVariable *NewGV = new GlobalVariable(MI->getAllocatedType(), false,
+                                             GlobalValue::InternalLinkage, Init,
+                                             GV->getName()+".body",
+                                             (Module *)NULL,
+                                             GV->isThreadLocal());
+  // FIXME: This new global should have the alignment returned by malloc.  Code
+  // could depend on malloc returning large alignment (on the mac, 16 bytes) but
+  // this would only guarantee some lower alignment.
+  GV->getParent()->getGlobalList().insert(GV, NewGV);
+
+  // Anything that used the malloc now uses the global directly.
+  MI->replaceAllUsesWith(NewGV);
+
+  Constant *RepValue = NewGV;
+  if (NewGV->getType() != GV->getType()->getElementType())
+    RepValue = ConstantExpr::getBitCast(RepValue, 
+                                        GV->getType()->getElementType());
+
+  // If there is a comparison against null, we will insert a global bool to
+  // keep track of whether the global was initialized yet or not.
+  GlobalVariable *InitBool =
+    new GlobalVariable(Type::Int1Ty, false, GlobalValue::InternalLinkage,
+                       ConstantInt::getFalse(), GV->getName()+".init",
+                       (Module *)NULL, GV->isThreadLocal());
+  bool InitBoolUsed = false;
+
+  // Loop over all uses of GV, processing them in turn.
+  std::vector<StoreInst*> Stores;
+  while (!GV->use_empty())
+    if (LoadInst *LI = dyn_cast<LoadInst>(GV->use_back())) {
+      while (!LI->use_empty()) {
+        Use &LoadUse = LI->use_begin().getUse();
+        if (!isa<ICmpInst>(LoadUse.getUser()))
+          LoadUse = RepValue;
+        else {
+          ICmpInst *CI = cast<ICmpInst>(LoadUse.getUser());
+          // Replace the cmp X, 0 with a use of the bool value.
+          Value *LV = new LoadInst(InitBool, InitBool->getName()+".val", CI);
+          InitBoolUsed = true;
+          switch (CI->getPredicate()) {
+          default: assert(0 && "Unknown ICmp Predicate!");
+          case ICmpInst::ICMP_ULT:
+          case ICmpInst::ICMP_SLT:
+            LV = ConstantInt::getFalse();   // X < null -> always false
+            break;
+          case ICmpInst::ICMP_ULE:
+          case ICmpInst::ICMP_SLE:
+          case ICmpInst::ICMP_EQ:
+            LV = BinaryOperator::CreateNot(LV, "notinit", CI);
+            break;
+          case ICmpInst::ICMP_NE:
+          case ICmpInst::ICMP_UGE:
+          case ICmpInst::ICMP_SGE:
+          case ICmpInst::ICMP_UGT:
+          case ICmpInst::ICMP_SGT:
+            break;  // no change.
+          }
+          CI->replaceAllUsesWith(LV);
+          CI->eraseFromParent();
+        }
+      }
+      LI->eraseFromParent();
+    } else {
+      StoreInst *SI = cast<StoreInst>(GV->use_back());
+      // The global is initialized when the store to it occurs.
+      new StoreInst(ConstantInt::getTrue(), InitBool, SI);
+      SI->eraseFromParent();
+    }
+
+  // If the initialization boolean was used, insert it, otherwise delete it.
+  if (!InitBoolUsed) {
+    while (!InitBool->use_empty())  // Delete initializations
+      cast<Instruction>(InitBool->use_back())->eraseFromParent();
+    delete InitBool;
+  } else
+    GV->getParent()->getGlobalList().insert(GV, InitBool);
+
+
+  // Now the GV is dead, nuke it and the malloc.
+  GV->eraseFromParent();
+  MI->eraseFromParent();
+
+  // To further other optimizations, loop over all users of NewGV and try to
+  // constant prop them.  This will promote GEP instructions with constant
+  // indices into GEP constant-exprs, which will allow global-opt to hack on it.
+  ConstantPropUsersOf(NewGV);
+  if (RepValue != NewGV)
+    ConstantPropUsersOf(RepValue);
+
+  return NewGV;
+}
+
+/// ValueIsOnlyUsedLocallyOrStoredToOneGlobal - Scan the use-list of V checking
+/// to make sure that there are no complex uses of V.  We permit simple things
+/// like dereferencing the pointer, but not storing through the address, unless
+/// it is to the specified global.
+static bool ValueIsOnlyUsedLocallyOrStoredToOneGlobal(Instruction *V,
+                                                      GlobalVariable *GV,
+                                              SmallPtrSet<PHINode*, 8> &PHIs) {
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;++UI){
+    Instruction *Inst = dyn_cast<Instruction>(*UI);
+    if (Inst == 0) return false;
+    
+    if (isa<LoadInst>(Inst) || isa<CmpInst>(Inst)) {
+      continue; // Fine, ignore.
+    }
+    
+    if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+      if (SI->getOperand(0) == V && SI->getOperand(1) != GV)
+        return false;  // Storing the pointer itself... bad.
+      continue; // Otherwise, storing through it, or storing into GV... fine.
+    }
+    
+    if (isa<GetElementPtrInst>(Inst)) {
+      if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(Inst, GV, PHIs))
+        return false;
+      continue;
+    }
+    
+    if (PHINode *PN = dyn_cast<PHINode>(Inst)) {
+      // PHIs are ok if all uses are ok.  Don't infinitely recurse through PHI
+      // cycles.
+      if (PHIs.insert(PN))
+        if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(PN, GV, PHIs))
+          return false;
+      continue;
+    }
+    
+    if (BitCastInst *BCI = dyn_cast<BitCastInst>(Inst)) {
+      if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(BCI, GV, PHIs))
+        return false;
+      continue;
+    }
+    
+    return false;
+  }
+  return true;
+}
+
+/// ReplaceUsesOfMallocWithGlobal - The Alloc pointer is stored into GV
+/// somewhere.  Transform all uses of the allocation into loads from the
+/// global and uses of the resultant pointer.  Further, delete the store into
+/// GV.  This assumes that these value pass the 
+/// 'ValueIsOnlyUsedLocallyOrStoredToOneGlobal' predicate.
+static void ReplaceUsesOfMallocWithGlobal(Instruction *Alloc, 
+                                          GlobalVariable *GV) {
+  while (!Alloc->use_empty()) {
+    Instruction *U = cast<Instruction>(*Alloc->use_begin());
+    Instruction *InsertPt = U;
+    if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+      // If this is the store of the allocation into the global, remove it.
+      if (SI->getOperand(1) == GV) {
+        SI->eraseFromParent();
+        continue;
+      }
+    } else if (PHINode *PN = dyn_cast<PHINode>(U)) {
+      // Insert the load in the corresponding predecessor, not right before the
+      // PHI.
+      InsertPt = PN->getIncomingBlock(Alloc->use_begin())->getTerminator();
+    } else if (isa<BitCastInst>(U)) {
+      // Must be bitcast between the malloc and store to initialize the global.
+      ReplaceUsesOfMallocWithGlobal(U, GV);
+      U->eraseFromParent();
+      continue;
+    } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
+      // If this is a "GEP bitcast" and the user is a store to the global, then
+      // just process it as a bitcast.
+      if (GEPI->hasAllZeroIndices() && GEPI->hasOneUse())
+        if (StoreInst *SI = dyn_cast<StoreInst>(GEPI->use_back()))
+          if (SI->getOperand(1) == GV) {
+            // Must be bitcast GEP between the malloc and store to initialize
+            // the global.
+            ReplaceUsesOfMallocWithGlobal(GEPI, GV);
+            GEPI->eraseFromParent();
+            continue;
+          }
+    }
+      
+    // Insert a load from the global, and use it instead of the malloc.
+    Value *NL = new LoadInst(GV, GV->getName()+".val", InsertPt);
+    U->replaceUsesOfWith(Alloc, NL);
+  }
+}
+
+/// LoadUsesSimpleEnoughForHeapSRA - Verify that all uses of V (a load, or a phi
+/// of a load) are simple enough to perform heap SRA on.  This permits GEP's
+/// that index through the array and struct field, icmps of null, and PHIs.
+static bool LoadUsesSimpleEnoughForHeapSRA(Value *V,
+                              SmallPtrSet<PHINode*, 32> &LoadUsingPHIs,
+                              SmallPtrSet<PHINode*, 32> &LoadUsingPHIsPerLoad) {
+  // We permit two users of the load: setcc comparing against the null
+  // pointer, and a getelementptr of a specific form.
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;++UI){
+    Instruction *User = cast<Instruction>(*UI);
+    
+    // Comparison against null is ok.
+    if (ICmpInst *ICI = dyn_cast<ICmpInst>(User)) {
+      if (!isa<ConstantPointerNull>(ICI->getOperand(1)))
+        return false;
+      continue;
+    }
+    
+    // getelementptr is also ok, but only a simple form.
+    if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) {
+      // Must index into the array and into the struct.
+      if (GEPI->getNumOperands() < 3)
+        return false;
+      
+      // Otherwise the GEP is ok.
+      continue;
+    }
+    
+    if (PHINode *PN = dyn_cast<PHINode>(User)) {
+      if (!LoadUsingPHIsPerLoad.insert(PN))
+        // This means some phi nodes are dependent on each other.
+        // Avoid infinite looping!
+        return false;
+      if (!LoadUsingPHIs.insert(PN))
+        // If we have already analyzed this PHI, then it is safe.
+        continue;
+      
+      // Make sure all uses of the PHI are simple enough to transform.
+      if (!LoadUsesSimpleEnoughForHeapSRA(PN,
+                                          LoadUsingPHIs, LoadUsingPHIsPerLoad))
+        return false;
+      
+      continue;
+    }
+    
+    // Otherwise we don't know what this is, not ok.
+    return false;
+  }
+  
+  return true;
+}
+
+
+/// AllGlobalLoadUsesSimpleEnoughForHeapSRA - If all users of values loaded from
+/// GV are simple enough to perform HeapSRA, return true.
+static bool AllGlobalLoadUsesSimpleEnoughForHeapSRA(GlobalVariable *GV,
+                                                    MallocInst *MI) {
+  SmallPtrSet<PHINode*, 32> LoadUsingPHIs;
+  SmallPtrSet<PHINode*, 32> LoadUsingPHIsPerLoad;
+  for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end(); UI != E; 
+       ++UI)
+    if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) {
+      if (!LoadUsesSimpleEnoughForHeapSRA(LI, LoadUsingPHIs,
+                                          LoadUsingPHIsPerLoad))
+        return false;
+      LoadUsingPHIsPerLoad.clear();
+    }
+  
+  // If we reach here, we know that all uses of the loads and transitive uses
+  // (through PHI nodes) are simple enough to transform.  However, we don't know
+  // that all inputs the to the PHI nodes are in the same equivalence sets. 
+  // Check to verify that all operands of the PHIs are either PHIS that can be
+  // transformed, loads from GV, or MI itself.
+  for (SmallPtrSet<PHINode*, 32>::iterator I = LoadUsingPHIs.begin(),
+       E = LoadUsingPHIs.end(); I != E; ++I) {
+    PHINode *PN = *I;
+    for (unsigned op = 0, e = PN->getNumIncomingValues(); op != e; ++op) {
+      Value *InVal = PN->getIncomingValue(op);
+      
+      // PHI of the stored value itself is ok.
+      if (InVal == MI) continue;
+      
+      if (PHINode *InPN = dyn_cast<PHINode>(InVal)) {
+        // One of the PHIs in our set is (optimistically) ok.
+        if (LoadUsingPHIs.count(InPN))
+          continue;
+        return false;
+      }
+      
+      // Load from GV is ok.
+      if (LoadInst *LI = dyn_cast<LoadInst>(InVal))
+        if (LI->getOperand(0) == GV)
+          continue;
+      
+      // UNDEF? NULL?
+      
+      // Anything else is rejected.
+      return false;
+    }
+  }
+  
+  return true;
+}
+
+static Value *GetHeapSROAValue(Value *V, unsigned FieldNo,
+               DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues,
+                   std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) {
+  std::vector<Value*> &FieldVals = InsertedScalarizedValues[V];
+  
+  if (FieldNo >= FieldVals.size())
+    FieldVals.resize(FieldNo+1);
+  
+  // If we already have this value, just reuse the previously scalarized
+  // version.
+  if (Value *FieldVal = FieldVals[FieldNo])
+    return FieldVal;
+  
+  // Depending on what instruction this is, we have several cases.
+  Value *Result;
+  if (LoadInst *LI = dyn_cast<LoadInst>(V)) {
+    // This is a scalarized version of the load from the global.  Just create
+    // a new Load of the scalarized global.
+    Result = new LoadInst(GetHeapSROAValue(LI->getOperand(0), FieldNo,
+                                           InsertedScalarizedValues,
+                                           PHIsToRewrite),
+                          LI->getName()+".f" + utostr(FieldNo), LI);
+  } else if (PHINode *PN = dyn_cast<PHINode>(V)) {
+    // PN's type is pointer to struct.  Make a new PHI of pointer to struct
+    // field.
+    const StructType *ST = 
+      cast<StructType>(cast<PointerType>(PN->getType())->getElementType());
+    
+    Result =PHINode::Create(PointerType::getUnqual(ST->getElementType(FieldNo)),
+                            PN->getName()+".f"+utostr(FieldNo), PN);
+    PHIsToRewrite.push_back(std::make_pair(PN, FieldNo));
+  } else {
+    assert(0 && "Unknown usable value");
+    Result = 0;
+  }
+  
+  return FieldVals[FieldNo] = Result;
+}
+
+/// RewriteHeapSROALoadUser - Given a load instruction and a value derived from
+/// the load, rewrite the derived value to use the HeapSRoA'd load.
+static void RewriteHeapSROALoadUser(Instruction *LoadUser, 
+             DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues,
+                   std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) {
+  // If this is a comparison against null, handle it.
+  if (ICmpInst *SCI = dyn_cast<ICmpInst>(LoadUser)) {
+    assert(isa<ConstantPointerNull>(SCI->getOperand(1)));
+    // If we have a setcc of the loaded pointer, we can use a setcc of any
+    // field.
+    Value *NPtr = GetHeapSROAValue(SCI->getOperand(0), 0,
+                                   InsertedScalarizedValues, PHIsToRewrite);
+    
+    Value *New = new ICmpInst(SCI->getPredicate(), NPtr,
+                              Constant::getNullValue(NPtr->getType()),
+                              SCI->getName(), SCI);
+    SCI->replaceAllUsesWith(New);
+    SCI->eraseFromParent();
+    return;
+  }
+  
+  // Handle 'getelementptr Ptr, Idx, i32 FieldNo ...'
+  if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(LoadUser)) {
+    assert(GEPI->getNumOperands() >= 3 && isa<ConstantInt>(GEPI->getOperand(2))
+           && "Unexpected GEPI!");
+  
+    // Load the pointer for this field.
+    unsigned FieldNo = cast<ConstantInt>(GEPI->getOperand(2))->getZExtValue();
+    Value *NewPtr = GetHeapSROAValue(GEPI->getOperand(0), FieldNo,
+                                     InsertedScalarizedValues, PHIsToRewrite);
+    
+    // Create the new GEP idx vector.
+    SmallVector<Value*, 8> GEPIdx;
+    GEPIdx.push_back(GEPI->getOperand(1));
+    GEPIdx.append(GEPI->op_begin()+3, GEPI->op_end());
+    
+    Value *NGEPI = GetElementPtrInst::Create(NewPtr,
+                                             GEPIdx.begin(), GEPIdx.end(),
+                                             GEPI->getName(), GEPI);
+    GEPI->replaceAllUsesWith(NGEPI);
+    GEPI->eraseFromParent();
+    return;
+  }
+
+  // Recursively transform the users of PHI nodes.  This will lazily create the
+  // PHIs that are needed for individual elements.  Keep track of what PHIs we
+  // see in InsertedScalarizedValues so that we don't get infinite loops (very
+  // antisocial).  If the PHI is already in InsertedScalarizedValues, it has
+  // already been seen first by another load, so its uses have already been
+  // processed.
+  PHINode *PN = cast<PHINode>(LoadUser);
+  bool Inserted;
+  DenseMap<Value*, std::vector<Value*> >::iterator InsertPos;
+  tie(InsertPos, Inserted) =
+    InsertedScalarizedValues.insert(std::make_pair(PN, std::vector<Value*>()));
+  if (!Inserted) return;
+  
+  // If this is the first time we've seen this PHI, recursively process all
+  // users.
+  for (Value::use_iterator UI = PN->use_begin(), E = PN->use_end(); UI != E; ) {
+    Instruction *User = cast<Instruction>(*UI++);
+    RewriteHeapSROALoadUser(User, InsertedScalarizedValues, PHIsToRewrite);
+  }
+}
+
+/// RewriteUsesOfLoadForHeapSRoA - We are performing Heap SRoA on a global.  Ptr
+/// is a value loaded from the global.  Eliminate all uses of Ptr, making them
+/// use FieldGlobals instead.  All uses of loaded values satisfy
+/// AllGlobalLoadUsesSimpleEnoughForHeapSRA.
+static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load, 
+               DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues,
+                   std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) {
+  for (Value::use_iterator UI = Load->use_begin(), E = Load->use_end();
+       UI != E; ) {
+    Instruction *User = cast<Instruction>(*UI++);
+    RewriteHeapSROALoadUser(User, InsertedScalarizedValues, PHIsToRewrite);
+  }
+  
+  if (Load->use_empty()) {
+    Load->eraseFromParent();
+    InsertedScalarizedValues.erase(Load);
+  }
+}
+
+/// PerformHeapAllocSRoA - MI is an allocation of an array of structures.  Break
+/// it up into multiple allocations of arrays of the fields.
+static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, MallocInst *MI){
+  DOUT << "SROA HEAP ALLOC: " << *GV << "  MALLOC = " << *MI;
+  const StructType *STy = cast<StructType>(MI->getAllocatedType());
+
+  // There is guaranteed to be at least one use of the malloc (storing
+  // it into GV).  If there are other uses, change them to be uses of
+  // the global to simplify later code.  This also deletes the store
+  // into GV.
+  ReplaceUsesOfMallocWithGlobal(MI, GV);
+  
+  // Okay, at this point, there are no users of the malloc.  Insert N
+  // new mallocs at the same place as MI, and N globals.
+  std::vector<Value*> FieldGlobals;
+  std::vector<MallocInst*> FieldMallocs;
+  
+  for (unsigned FieldNo = 0, e = STy->getNumElements(); FieldNo != e;++FieldNo){
+    const Type *FieldTy = STy->getElementType(FieldNo);
+    const Type *PFieldTy = PointerType::getUnqual(FieldTy);
+    
+    GlobalVariable *NGV =
+      new GlobalVariable(PFieldTy, false, GlobalValue::InternalLinkage,
+                         Constant::getNullValue(PFieldTy),
+                         GV->getName() + ".f" + utostr(FieldNo), GV,
+                         GV->isThreadLocal());
+    FieldGlobals.push_back(NGV);
+    
+    MallocInst *NMI = new MallocInst(FieldTy, MI->getArraySize(),
+                                     MI->getName() + ".f" + utostr(FieldNo),MI);
+    FieldMallocs.push_back(NMI);
+    new StoreInst(NMI, NGV, MI);
+  }
+  
+  // The tricky aspect of this transformation is handling the case when malloc
+  // fails.  In the original code, malloc failing would set the result pointer
+  // of malloc to null.  In this case, some mallocs could succeed and others
+  // could fail.  As such, we emit code that looks like this:
+  //    F0 = malloc(field0)
+  //    F1 = malloc(field1)
+  //    F2 = malloc(field2)
+  //    if (F0 == 0 || F1 == 0 || F2 == 0) {
+  //      if (F0) { free(F0); F0 = 0; }
+  //      if (F1) { free(F1); F1 = 0; }
+  //      if (F2) { free(F2); F2 = 0; }
+  //    }
+  Value *RunningOr = 0;
+  for (unsigned i = 0, e = FieldMallocs.size(); i != e; ++i) {
+    Value *Cond = new ICmpInst(ICmpInst::ICMP_EQ, FieldMallocs[i],
+                             Constant::getNullValue(FieldMallocs[i]->getType()),
+                                  "isnull", MI);
+    if (!RunningOr)
+      RunningOr = Cond;   // First seteq
+    else
+      RunningOr = BinaryOperator::CreateOr(RunningOr, Cond, "tmp", MI);
+  }
+
+  // Split the basic block at the old malloc.
+  BasicBlock *OrigBB = MI->getParent();
+  BasicBlock *ContBB = OrigBB->splitBasicBlock(MI, "malloc_cont");
+  
+  // Create the block to check the first condition.  Put all these blocks at the
+  // end of the function as they are unlikely to be executed.
+  BasicBlock *NullPtrBlock = BasicBlock::Create("malloc_ret_null",
+                                                OrigBB->getParent());
+  
+  // Remove the uncond branch from OrigBB to ContBB, turning it into a cond
+  // branch on RunningOr.
+  OrigBB->getTerminator()->eraseFromParent();
+  BranchInst::Create(NullPtrBlock, ContBB, RunningOr, OrigBB);
+  
+  // Within the NullPtrBlock, we need to emit a comparison and branch for each
+  // pointer, because some may be null while others are not.
+  for (unsigned i = 0, e = FieldGlobals.size(); i != e; ++i) {
+    Value *GVVal = new LoadInst(FieldGlobals[i], "tmp", NullPtrBlock);
+    Value *Cmp = new ICmpInst(ICmpInst::ICMP_NE, GVVal, 
+                              Constant::getNullValue(GVVal->getType()),
+                              "tmp", NullPtrBlock);
+    BasicBlock *FreeBlock = BasicBlock::Create("free_it", OrigBB->getParent());
+    BasicBlock *NextBlock = BasicBlock::Create("next", OrigBB->getParent());
+    BranchInst::Create(FreeBlock, NextBlock, Cmp, NullPtrBlock);
+
+    // Fill in FreeBlock.
+    new FreeInst(GVVal, FreeBlock);
+    new StoreInst(Constant::getNullValue(GVVal->getType()), FieldGlobals[i],
+                  FreeBlock);
+    BranchInst::Create(NextBlock, FreeBlock);
+    
+    NullPtrBlock = NextBlock;
+  }
+  
+  BranchInst::Create(ContBB, NullPtrBlock);
+  
+  // MI is no longer needed, remove it.
+  MI->eraseFromParent();
+
+  /// InsertedScalarizedLoads - As we process loads, if we can't immediately
+  /// update all uses of the load, keep track of what scalarized loads are
+  /// inserted for a given load.
+  DenseMap<Value*, std::vector<Value*> > InsertedScalarizedValues;
+  InsertedScalarizedValues[GV] = FieldGlobals;
+  
+  std::vector<std::pair<PHINode*, unsigned> > PHIsToRewrite;
+  
+  // Okay, the malloc site is completely handled.  All of the uses of GV are now
+  // loads, and all uses of those loads are simple.  Rewrite them to use loads
+  // of the per-field globals instead.
+  for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end(); UI != E;) {
+    Instruction *User = cast<Instruction>(*UI++);
+    
+    if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+      RewriteUsesOfLoadForHeapSRoA(LI, InsertedScalarizedValues, PHIsToRewrite);
+      continue;
+    }
+    
+    // Must be a store of null.
+    StoreInst *SI = cast<StoreInst>(User);
+    assert(isa<ConstantPointerNull>(SI->getOperand(0)) &&
+           "Unexpected heap-sra user!");
+    
+    // Insert a store of null into each global.
+    for (unsigned i = 0, e = FieldGlobals.size(); i != e; ++i) {
+      const PointerType *PT = cast<PointerType>(FieldGlobals[i]->getType());
+      Constant *Null = Constant::getNullValue(PT->getElementType());
+      new StoreInst(Null, FieldGlobals[i], SI);
+    }
+    // Erase the original store.
+    SI->eraseFromParent();
+  }
+
+  // While we have PHIs that are interesting to rewrite, do it.
+  while (!PHIsToRewrite.empty()) {
+    PHINode *PN = PHIsToRewrite.back().first;
+    unsigned FieldNo = PHIsToRewrite.back().second;
+    PHIsToRewrite.pop_back();
+    PHINode *FieldPN = cast<PHINode>(InsertedScalarizedValues[PN][FieldNo]);
+    assert(FieldPN->getNumIncomingValues() == 0 &&"Already processed this phi");
+
+    // Add all the incoming values.  This can materialize more phis.
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+      Value *InVal = PN->getIncomingValue(i);
+      InVal = GetHeapSROAValue(InVal, FieldNo, InsertedScalarizedValues,
+                               PHIsToRewrite);
+      FieldPN->addIncoming(InVal, PN->getIncomingBlock(i));
+    }
+  }
+  
+  // Drop all inter-phi links and any loads that made it this far.
+  for (DenseMap<Value*, std::vector<Value*> >::iterator
+       I = InsertedScalarizedValues.begin(), E = InsertedScalarizedValues.end();
+       I != E; ++I) {
+    if (PHINode *PN = dyn_cast<PHINode>(I->first))
+      PN->dropAllReferences();
+    else if (LoadInst *LI = dyn_cast<LoadInst>(I->first))
+      LI->dropAllReferences();
+  }
+  
+  // Delete all the phis and loads now that inter-references are dead.
+  for (DenseMap<Value*, std::vector<Value*> >::iterator
+       I = InsertedScalarizedValues.begin(), E = InsertedScalarizedValues.end();
+       I != E; ++I) {
+    if (PHINode *PN = dyn_cast<PHINode>(I->first))
+      PN->eraseFromParent();
+    else if (LoadInst *LI = dyn_cast<LoadInst>(I->first))
+      LI->eraseFromParent();
+  }
+  
+  // The old global is now dead, remove it.
+  GV->eraseFromParent();
+
+  ++NumHeapSRA;
+  return cast<GlobalVariable>(FieldGlobals[0]);
+}
+
+/// TryToOptimizeStoreOfMallocToGlobal - This function is called when we see a
+/// pointer global variable with a single value stored it that is a malloc or
+/// cast of malloc.
+static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
+                                               MallocInst *MI,
+                                               Module::global_iterator &GVI,
+                                               TargetData &TD) {
+  // If this is a malloc of an abstract type, don't touch it.
+  if (!MI->getAllocatedType()->isSized())
+    return false;
+  
+  // We can't optimize this global unless all uses of it are *known* to be
+  // of the malloc value, not of the null initializer value (consider a use
+  // that compares the global's value against zero to see if the malloc has
+  // been reached).  To do this, we check to see if all uses of the global
+  // would trap if the global were null: this proves that they must all
+  // happen after the malloc.
+  if (!AllUsesOfLoadedValueWillTrapIfNull(GV))
+    return false;
+  
+  // We can't optimize this if the malloc itself is used in a complex way,
+  // for example, being stored into multiple globals.  This allows the
+  // malloc to be stored into the specified global, loaded setcc'd, and
+  // GEP'd.  These are all things we could transform to using the global
+  // for.
+  {
+    SmallPtrSet<PHINode*, 8> PHIs;
+    if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(MI, GV, PHIs))
+      return false;
+  }
+  
+  
+  // If we have a global that is only initialized with a fixed size malloc,
+  // transform the program to use global memory instead of malloc'd memory.
+  // This eliminates dynamic allocation, avoids an indirection accessing the
+  // data, and exposes the resultant global to further GlobalOpt.
+  if (ConstantInt *NElements = dyn_cast<ConstantInt>(MI->getArraySize())) {
+    // Restrict this transformation to only working on small allocations
+    // (2048 bytes currently), as we don't want to introduce a 16M global or
+    // something.
+    if (NElements->getZExtValue()*
+        TD.getTypeAllocSize(MI->getAllocatedType()) < 2048) {
+      GVI = OptimizeGlobalAddressOfMalloc(GV, MI);
+      return true;
+    }
+  }
+  
+  // If the allocation is an array of structures, consider transforming this
+  // into multiple malloc'd arrays, one for each field.  This is basically
+  // SRoA for malloc'd memory.
+  const Type *AllocTy = MI->getAllocatedType();
+  
+  // If this is an allocation of a fixed size array of structs, analyze as a
+  // variable size array.  malloc [100 x struct],1 -> malloc struct, 100
+  if (!MI->isArrayAllocation())
+    if (const ArrayType *AT = dyn_cast<ArrayType>(AllocTy))
+      AllocTy = AT->getElementType();
+  
+  if (const StructType *AllocSTy = dyn_cast<StructType>(AllocTy)) {
+    // This the structure has an unreasonable number of fields, leave it
+    // alone.
+    if (AllocSTy->getNumElements() <= 16 && AllocSTy->getNumElements() != 0 &&
+        AllGlobalLoadUsesSimpleEnoughForHeapSRA(GV, MI)) {
+      
+      // If this is a fixed size array, transform the Malloc to be an alloc of
+      // structs.  malloc [100 x struct],1 -> malloc struct, 100
+      if (const ArrayType *AT = dyn_cast<ArrayType>(MI->getAllocatedType())) {
+        MallocInst *NewMI = 
+          new MallocInst(AllocSTy, 
+                         ConstantInt::get(Type::Int32Ty, AT->getNumElements()),
+                         "", MI);
+        NewMI->takeName(MI);
+        Value *Cast = new BitCastInst(NewMI, MI->getType(), "tmp", MI);
+        MI->replaceAllUsesWith(Cast);
+        MI->eraseFromParent();
+        MI = NewMI;
+      }
+      
+      GVI = PerformHeapAllocSRoA(GV, MI);
+      return true;
+    }
+  }
+  
+  return false;
+}  
+
+// OptimizeOnceStoredGlobal - Try to optimize globals based on the knowledge
+// that only one value (besides its initializer) is ever stored to the global.
+static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
+                                     Module::global_iterator &GVI,
+                                     TargetData &TD) {
+  // Ignore no-op GEPs and bitcasts.
+  StoredOnceVal = StoredOnceVal->stripPointerCasts();
+
+  // If we are dealing with a pointer global that is initialized to null and
+  // only has one (non-null) value stored into it, then we can optimize any
+  // users of the loaded value (often calls and loads) that would trap if the
+  // value was null.
+  if (isa<PointerType>(GV->getInitializer()->getType()) &&
+      GV->getInitializer()->isNullValue()) {
+    if (Constant *SOVC = dyn_cast<Constant>(StoredOnceVal)) {
+      if (GV->getInitializer()->getType() != SOVC->getType())
+        SOVC = ConstantExpr::getBitCast(SOVC, GV->getInitializer()->getType());
+
+      // Optimize away any trapping uses of the loaded value.
+      if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC))
+        return true;
+    } else if (MallocInst *MI = dyn_cast<MallocInst>(StoredOnceVal)) {
+      if (TryToOptimizeStoreOfMallocToGlobal(GV, MI, GVI, TD))
+        return true;
+    }
+  }
+
+  return false;
+}
+
+/// TryToShrinkGlobalToBoolean - At this point, we have learned that the only
+/// two values ever stored into GV are its initializer and OtherVal.  See if we
+/// can shrink the global into a boolean and select between the two values
+/// whenever it is used.  This exposes the values to other scalar optimizations.
+static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
+  const Type *GVElType = GV->getType()->getElementType();
+  
+  // If GVElType is already i1, it is already shrunk.  If the type of the GV is
+  // an FP value, pointer or vector, don't do this optimization because a select
+  // between them is very expensive and unlikely to lead to later
+  // simplification.  In these cases, we typically end up with "cond ? v1 : v2"
+  // where v1 and v2 both require constant pool loads, a big loss.
+  if (GVElType == Type::Int1Ty || GVElType->isFloatingPoint() ||
+      isa<PointerType>(GVElType) || isa<VectorType>(GVElType))
+    return false;
+  
+  // Walk the use list of the global seeing if all the uses are load or store.
+  // If there is anything else, bail out.
+  for (Value::use_iterator I = GV->use_begin(), E = GV->use_end(); I != E; ++I)
+    if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
+      return false;
+  
+  DOUT << "   *** SHRINKING TO BOOL: " << *GV;
+  
+  // Create the new global, initializing it to false.
+  GlobalVariable *NewGV = new GlobalVariable(Type::Int1Ty, false,
+         GlobalValue::InternalLinkage, ConstantInt::getFalse(),
+                                             GV->getName()+".b",
+                                             (Module *)NULL,
+                                             GV->isThreadLocal());
+  GV->getParent()->getGlobalList().insert(GV, NewGV);
+
+  Constant *InitVal = GV->getInitializer();
+  assert(InitVal->getType() != Type::Int1Ty && "No reason to shrink to bool!");
+
+  // If initialized to zero and storing one into the global, we can use a cast
+  // instead of a select to synthesize the desired value.
+  bool IsOneZero = false;
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal))
+    IsOneZero = InitVal->isNullValue() && CI->isOne();
+
+  while (!GV->use_empty()) {
+    Instruction *UI = cast<Instruction>(GV->use_back());
+    if (StoreInst *SI = dyn_cast<StoreInst>(UI)) {
+      // Change the store into a boolean store.
+      bool StoringOther = SI->getOperand(0) == OtherVal;
+      // Only do this if we weren't storing a loaded value.
+      Value *StoreVal;
+      if (StoringOther || SI->getOperand(0) == InitVal)
+        StoreVal = ConstantInt::get(Type::Int1Ty, StoringOther);
+      else {
+        // Otherwise, we are storing a previously loaded copy.  To do this,
+        // change the copy from copying the original value to just copying the
+        // bool.
+        Instruction *StoredVal = cast<Instruction>(SI->getOperand(0));
+
+        // If we're already replaced the input, StoredVal will be a cast or
+        // select instruction.  If not, it will be a load of the original
+        // global.
+        if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) {
+          assert(LI->getOperand(0) == GV && "Not a copy!");
+          // Insert a new load, to preserve the saved value.
+          StoreVal = new LoadInst(NewGV, LI->getName()+".b", LI);
+        } else {
+          assert((isa<CastInst>(StoredVal) || isa<SelectInst>(StoredVal)) &&
+                 "This is not a form that we understand!");
+          StoreVal = StoredVal->getOperand(0);
+          assert(isa<LoadInst>(StoreVal) && "Not a load of NewGV!");
+        }
+      }
+      new StoreInst(StoreVal, NewGV, SI);
+    } else {
+      // Change the load into a load of bool then a select.
+      LoadInst *LI = cast<LoadInst>(UI);
+      LoadInst *NLI = new LoadInst(NewGV, LI->getName()+".b", LI);
+      Value *NSI;
+      if (IsOneZero)
+        NSI = new ZExtInst(NLI, LI->getType(), "", LI);
+      else
+        NSI = SelectInst::Create(NLI, OtherVal, InitVal, "", LI);
+      NSI->takeName(LI);
+      LI->replaceAllUsesWith(NSI);
+    }
+    UI->eraseFromParent();
+  }
+
+  GV->eraseFromParent();
+  return true;
+}
+
+
+/// ProcessInternalGlobal - Analyze the specified global variable and optimize
+/// it if possible.  If we make a change, return true.
+bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
+                                      Module::global_iterator &GVI) {
+  SmallPtrSet<PHINode*, 16> PHIUsers;
+  GlobalStatus GS;
+  GV->removeDeadConstantUsers();
+
+  if (GV->use_empty()) {
+    DOUT << "GLOBAL DEAD: " << *GV;
+    GV->eraseFromParent();
+    ++NumDeleted;
+    return true;
+  }
+
+  if (!AnalyzeGlobal(GV, GS, PHIUsers)) {
+#if 0
+    cerr << "Global: " << *GV;
+    cerr << "  isLoaded = " << GS.isLoaded << "\n";
+    cerr << "  StoredType = ";
+    switch (GS.StoredType) {
+    case GlobalStatus::NotStored: cerr << "NEVER STORED\n"; break;
+    case GlobalStatus::isInitializerStored: cerr << "INIT STORED\n"; break;
+    case GlobalStatus::isStoredOnce: cerr << "STORED ONCE\n"; break;
+    case GlobalStatus::isStored: cerr << "stored\n"; break;
+    }
+    if (GS.StoredType == GlobalStatus::isStoredOnce && GS.StoredOnceValue)
+      cerr << "  StoredOnceValue = " << *GS.StoredOnceValue << "\n";
+    if (GS.AccessingFunction && !GS.HasMultipleAccessingFunctions)
+      cerr << "  AccessingFunction = " << GS.AccessingFunction->getName()
+                << "\n";
+    cerr << "  HasMultipleAccessingFunctions =  "
+              << GS.HasMultipleAccessingFunctions << "\n";
+    cerr << "  HasNonInstructionUser = " << GS.HasNonInstructionUser<<"\n";
+    cerr << "\n";
+#endif
+    
+    // If this is a first class global and has only one accessing function
+    // and this function is main (which we know is not recursive we can make
+    // this global a local variable) we replace the global with a local alloca
+    // in this function.
+    //
+    // NOTE: It doesn't make sense to promote non single-value types since we
+    // are just replacing static memory to stack memory.
+    if (!GS.HasMultipleAccessingFunctions &&
+        GS.AccessingFunction && !GS.HasNonInstructionUser &&
+        GV->getType()->getElementType()->isSingleValueType() &&
+        GS.AccessingFunction->getName() == "main" &&
+        GS.AccessingFunction->hasExternalLinkage()) {
+      DOUT << "LOCALIZING GLOBAL: " << *GV;
+      Instruction* FirstI = GS.AccessingFunction->getEntryBlock().begin();
+      const Type* ElemTy = GV->getType()->getElementType();
+      // FIXME: Pass Global's alignment when globals have alignment
+      AllocaInst* Alloca = new AllocaInst(ElemTy, NULL, GV->getName(), FirstI);
+      if (!isa<UndefValue>(GV->getInitializer()))
+        new StoreInst(GV->getInitializer(), Alloca, FirstI);
+
+      GV->replaceAllUsesWith(Alloca);
+      GV->eraseFromParent();
+      ++NumLocalized;
+      return true;
+    }
+    
+    // If the global is never loaded (but may be stored to), it is dead.
+    // Delete it now.
+    if (!GS.isLoaded) {
+      DOUT << "GLOBAL NEVER LOADED: " << *GV;
+
+      // Delete any stores we can find to the global.  We may not be able to
+      // make it completely dead though.
+      bool Changed = CleanupConstantGlobalUsers(GV, GV->getInitializer());
+
+      // If the global is dead now, delete it.
+      if (GV->use_empty()) {
+        GV->eraseFromParent();
+        ++NumDeleted;
+        Changed = true;
+      }
+      return Changed;
+
+    } else if (GS.StoredType <= GlobalStatus::isInitializerStored) {
+      DOUT << "MARKING CONSTANT: " << *GV;
+      GV->setConstant(true);
+
+      // Clean up any obviously simplifiable users now.
+      CleanupConstantGlobalUsers(GV, GV->getInitializer());
+
+      // If the global is dead now, just nuke it.
+      if (GV->use_empty()) {
+        DOUT << "   *** Marking constant allowed us to simplify "
+             << "all users and delete global!\n";
+        GV->eraseFromParent();
+        ++NumDeleted;
+      }
+
+      ++NumMarked;
+      return true;
+    } else if (!GV->getInitializer()->getType()->isSingleValueType()) {
+      if (GlobalVariable *FirstNewGV = SRAGlobal(GV, 
+                                                 getAnalysis<TargetData>())) {
+        GVI = FirstNewGV;  // Don't skip the newly produced globals!
+        return true;
+      }
+    } else if (GS.StoredType == GlobalStatus::isStoredOnce) {
+      // If the initial value for the global was an undef value, and if only
+      // one other value was stored into it, we can just change the
+      // initializer to be the stored value, then delete all stores to the
+      // global.  This allows us to mark it constant.
+      if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue))
+        if (isa<UndefValue>(GV->getInitializer())) {
+          // Change the initial value here.
+          GV->setInitializer(SOVConstant);
+
+          // Clean up any obviously simplifiable users now.
+          CleanupConstantGlobalUsers(GV, GV->getInitializer());
+
+          if (GV->use_empty()) {
+            DOUT << "   *** Substituting initializer allowed us to "
+                 << "simplify all users and delete global!\n";
+            GV->eraseFromParent();
+            ++NumDeleted;
+          } else {
+            GVI = GV;
+          }
+          ++NumSubstitute;
+          return true;
+        }
+
+      // Try to optimize globals based on the knowledge that only one value
+      // (besides its initializer) is ever stored to the global.
+      if (OptimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GVI,
+                                   getAnalysis<TargetData>()))
+        return true;
+
+      // Otherwise, if the global was not a boolean, we can shrink it to be a
+      // boolean.
+      if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue))
+        if (TryToShrinkGlobalToBoolean(GV, SOVConstant)) {
+          ++NumShrunkToBool;
+          return true;
+        }
+    }
+  }
+  return false;
+}
+
+/// OnlyCalledDirectly - Return true if the specified function is only called
+/// directly.  In other words, its address is never taken.
+static bool OnlyCalledDirectly(Function *F) {
+  for (Value::use_iterator UI = F->use_begin(), E = F->use_end(); UI != E;++UI){
+    Instruction *User = dyn_cast<Instruction>(*UI);
+    if (!User) return false;
+    if (!isa<CallInst>(User) && !isa<InvokeInst>(User)) return false;
+
+    // See if the function address is passed as an argument.
+    for (User::op_iterator i = User->op_begin() + 1, e = User->op_end();
+         i != e; ++i)
+      if (*i == F) return false;
+  }
+  return true;
+}
+
+/// ChangeCalleesToFastCall - Walk all of the direct calls of the specified
+/// function, changing them to FastCC.
+static void ChangeCalleesToFastCall(Function *F) {
+  for (Value::use_iterator UI = F->use_begin(), E = F->use_end(); UI != E;++UI){
+    CallSite User(cast<Instruction>(*UI));
+    User.setCallingConv(CallingConv::Fast);
+  }
+}
+
+static AttrListPtr StripNest(const AttrListPtr &Attrs) {
+  for (unsigned i = 0, e = Attrs.getNumSlots(); i != e; ++i) {
+    if ((Attrs.getSlot(i).Attrs & Attribute::Nest) == 0)
+      continue;
+
+    // There can be only one.
+    return Attrs.removeAttr(Attrs.getSlot(i).Index, Attribute::Nest);
+  }
+
+  return Attrs;
+}
+
+static void RemoveNestAttribute(Function *F) {
+  F->setAttributes(StripNest(F->getAttributes()));
+  for (Value::use_iterator UI = F->use_begin(), E = F->use_end(); UI != E;++UI){
+    CallSite User(cast<Instruction>(*UI));
+    User.setAttributes(StripNest(User.getAttributes()));
+  }
+}
+
+bool GlobalOpt::OptimizeFunctions(Module &M) {
+  bool Changed = false;
+  // Optimize functions.
+  for (Module::iterator FI = M.begin(), E = M.end(); FI != E; ) {
+    Function *F = FI++;
+    // Functions without names cannot be referenced outside this module.
+    if (!F->hasName() && !F->isDeclaration())
+      F->setLinkage(GlobalValue::InternalLinkage);
+    F->removeDeadConstantUsers();
+    if (F->use_empty() && (F->hasLocalLinkage() ||
+                           F->hasLinkOnceLinkage())) {
+      M.getFunctionList().erase(F);
+      Changed = true;
+      ++NumFnDeleted;
+    } else if (F->hasLocalLinkage()) {
+      if (F->getCallingConv() == CallingConv::C && !F->isVarArg() &&
+          OnlyCalledDirectly(F)) {
+        // If this function has C calling conventions, is not a varargs
+        // function, and is only called directly, promote it to use the Fast
+        // calling convention.
+        F->setCallingConv(CallingConv::Fast);
+        ChangeCalleesToFastCall(F);
+        ++NumFastCallFns;
+        Changed = true;
+      }
+
+      if (F->getAttributes().hasAttrSomewhere(Attribute::Nest) &&
+          OnlyCalledDirectly(F)) {
+        // The function is not used by a trampoline intrinsic, so it is safe
+        // to remove the 'nest' attribute.
+        RemoveNestAttribute(F);
+        ++NumNestRemoved;
+        Changed = true;
+      }
+    }
+  }
+  return Changed;
+}
+
+bool GlobalOpt::OptimizeGlobalVars(Module &M) {
+  bool Changed = false;
+  for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
+       GVI != E; ) {
+    GlobalVariable *GV = GVI++;
+    // Global variables without names cannot be referenced outside this module.
+    if (!GV->hasName() && !GV->isDeclaration())
+      GV->setLinkage(GlobalValue::InternalLinkage);
+    if (!GV->isConstant() && GV->hasLocalLinkage() &&
+        GV->hasInitializer())
+      Changed |= ProcessInternalGlobal(GV, GVI);
+  }
+  return Changed;
+}
+
+/// FindGlobalCtors - Find the llvm.globalctors list, verifying that all
+/// initializers have an init priority of 65535.
+GlobalVariable *GlobalOpt::FindGlobalCtors(Module &M) {
+  for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ++I)
+    if (I->getName() == "llvm.global_ctors") {
+      // Found it, verify it's an array of { int, void()* }.
+      const ArrayType *ATy =dyn_cast<ArrayType>(I->getType()->getElementType());
+      if (!ATy) return 0;
+      const StructType *STy = dyn_cast<StructType>(ATy->getElementType());
+      if (!STy || STy->getNumElements() != 2 ||
+          STy->getElementType(0) != Type::Int32Ty) return 0;
+      const PointerType *PFTy = dyn_cast<PointerType>(STy->getElementType(1));
+      if (!PFTy) return 0;
+      const FunctionType *FTy = dyn_cast<FunctionType>(PFTy->getElementType());
+      if (!FTy || FTy->getReturnType() != Type::VoidTy || FTy->isVarArg() ||
+          FTy->getNumParams() != 0)
+        return 0;
+      
+      // Verify that the initializer is simple enough for us to handle.
+      if (!I->hasInitializer()) return 0;
+      ConstantArray *CA = dyn_cast<ConstantArray>(I->getInitializer());
+      if (!CA) return 0;
+      for (User::op_iterator i = CA->op_begin(), e = CA->op_end(); i != e; ++i)
+        if (ConstantStruct *CS = dyn_cast<ConstantStruct>(*i)) {
+          if (isa<ConstantPointerNull>(CS->getOperand(1)))
+            continue;
+
+          // Must have a function or null ptr.
+          if (!isa<Function>(CS->getOperand(1)))
+            return 0;
+          
+          // Init priority must be standard.
+          ConstantInt *CI = dyn_cast<ConstantInt>(CS->getOperand(0));
+          if (!CI || CI->getZExtValue() != 65535)
+            return 0;
+        } else {
+          return 0;
+        }
+      
+      return I;
+    }
+  return 0;
+}
+
+/// ParseGlobalCtors - Given a llvm.global_ctors list that we can understand,
+/// return a list of the functions and null terminator as a vector.
+static std::vector<Function*> ParseGlobalCtors(GlobalVariable *GV) {
+  ConstantArray *CA = cast<ConstantArray>(GV->getInitializer());
+  std::vector<Function*> Result;
+  Result.reserve(CA->getNumOperands());
+  for (User::op_iterator i = CA->op_begin(), e = CA->op_end(); i != e; ++i) {
+    ConstantStruct *CS = cast<ConstantStruct>(*i);
+    Result.push_back(dyn_cast<Function>(CS->getOperand(1)));
+  }
+  return Result;
+}
+
+/// InstallGlobalCtors - Given a specified llvm.global_ctors list, install the
+/// specified array, returning the new global to use.
+static GlobalVariable *InstallGlobalCtors(GlobalVariable *GCL, 
+                                          const std::vector<Function*> &Ctors) {
+  // If we made a change, reassemble the initializer list.
+  std::vector<Constant*> CSVals;
+  CSVals.push_back(ConstantInt::get(Type::Int32Ty, 65535));
+  CSVals.push_back(0);
+  
+  // Create the new init list.
+  std::vector<Constant*> CAList;
+  for (unsigned i = 0, e = Ctors.size(); i != e; ++i) {
+    if (Ctors[i]) {
+      CSVals[1] = Ctors[i];
+    } else {
+      const Type *FTy = FunctionType::get(Type::VoidTy,
+                                          std::vector<const Type*>(), false);
+      const PointerType *PFTy = PointerType::getUnqual(FTy);
+      CSVals[1] = Constant::getNullValue(PFTy);
+      CSVals[0] = ConstantInt::get(Type::Int32Ty, 2147483647);
+    }
+    CAList.push_back(ConstantStruct::get(CSVals));
+  }
+  
+  // Create the array initializer.
+  const Type *StructTy =
+    cast<ArrayType>(GCL->getType()->getElementType())->getElementType();
+  Constant *CA = ConstantArray::get(ArrayType::get(StructTy, CAList.size()),
+                                    CAList);
+  
+  // If we didn't change the number of elements, don't create a new GV.
+  if (CA->getType() == GCL->getInitializer()->getType()) {
+    GCL->setInitializer(CA);
+    return GCL;
+  }
+  
+  // Create the new global and insert it next to the existing list.
+  GlobalVariable *NGV = new GlobalVariable(CA->getType(), GCL->isConstant(),
+                                           GCL->getLinkage(), CA, "",
+                                           (Module *)NULL,
+                                           GCL->isThreadLocal());
+  GCL->getParent()->getGlobalList().insert(GCL, NGV);
+  NGV->takeName(GCL);
+  
+  // Nuke the old list, replacing any uses with the new one.
+  if (!GCL->use_empty()) {
+    Constant *V = NGV;
+    if (V->getType() != GCL->getType())
+      V = ConstantExpr::getBitCast(V, GCL->getType());
+    GCL->replaceAllUsesWith(V);
+  }
+  GCL->eraseFromParent();
+  
+  if (Ctors.size())
+    return NGV;
+  else
+    return 0;
+}
+
+
+static Constant *getVal(DenseMap<Value*, Constant*> &ComputedValues,
+                        Value *V) {
+  if (Constant *CV = dyn_cast<Constant>(V)) return CV;
+  Constant *R = ComputedValues[V];
+  assert(R && "Reference to an uncomputed value!");
+  return R;
+}
+
+/// isSimpleEnoughPointerToCommit - Return true if this constant is simple
+/// enough for us to understand.  In particular, if it is a cast of something,
+/// we punt.  We basically just support direct accesses to globals and GEP's of
+/// globals.  This should be kept up to date with CommitValueTo.
+static bool isSimpleEnoughPointerToCommit(Constant *C) {
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
+    if (!GV->hasExternalLinkage() && !GV->hasLocalLinkage())
+      return false;  // do not allow weak/linkonce/dllimport/dllexport linkage.
+    return !GV->isDeclaration();  // reject external globals.
+  }
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
+    // Handle a constantexpr gep.
+    if (CE->getOpcode() == Instruction::GetElementPtr &&
+        isa<GlobalVariable>(CE->getOperand(0))) {
+      GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
+      if (!GV->hasExternalLinkage() && !GV->hasLocalLinkage())
+        return false;  // do not allow weak/linkonce/dllimport/dllexport linkage.
+      return GV->hasInitializer() &&
+             ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE);
+    }
+  return false;
+}
+
+/// EvaluateStoreInto - Evaluate a piece of a constantexpr store into a global
+/// initializer.  This returns 'Init' modified to reflect 'Val' stored into it.
+/// At this point, the GEP operands of Addr [0, OpNo) have been stepped into.
+static Constant *EvaluateStoreInto(Constant *Init, Constant *Val,
+                                   ConstantExpr *Addr, unsigned OpNo) {
+  // Base case of the recursion.
+  if (OpNo == Addr->getNumOperands()) {
+    assert(Val->getType() == Init->getType() && "Type mismatch!");
+    return Val;
+  }
+  
+  if (const StructType *STy = dyn_cast<StructType>(Init->getType())) {
+    std::vector<Constant*> Elts;
+
+    // Break up the constant into its elements.
+    if (ConstantStruct *CS = dyn_cast<ConstantStruct>(Init)) {
+      for (User::op_iterator i = CS->op_begin(), e = CS->op_end(); i != e; ++i)
+        Elts.push_back(cast<Constant>(*i));
+    } else if (isa<ConstantAggregateZero>(Init)) {
+      for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+        Elts.push_back(Constant::getNullValue(STy->getElementType(i)));
+    } else if (isa<UndefValue>(Init)) {
+      for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+        Elts.push_back(UndefValue::get(STy->getElementType(i)));
+    } else {
+      assert(0 && "This code is out of sync with "
+             " ConstantFoldLoadThroughGEPConstantExpr");
+    }
+    
+    // Replace the element that we are supposed to.
+    ConstantInt *CU = cast<ConstantInt>(Addr->getOperand(OpNo));
+    unsigned Idx = CU->getZExtValue();
+    assert(Idx < STy->getNumElements() && "Struct index out of range!");
+    Elts[Idx] = EvaluateStoreInto(Elts[Idx], Val, Addr, OpNo+1);
+    
+    // Return the modified struct.
+    return ConstantStruct::get(&Elts[0], Elts.size(), STy->isPacked());
+  } else {
+    ConstantInt *CI = cast<ConstantInt>(Addr->getOperand(OpNo));
+    const ArrayType *ATy = cast<ArrayType>(Init->getType());
+
+    // Break up the array into elements.
+    std::vector<Constant*> Elts;
+    if (ConstantArray *CA = dyn_cast<ConstantArray>(Init)) {
+      for (User::op_iterator i = CA->op_begin(), e = CA->op_end(); i != e; ++i)
+        Elts.push_back(cast<Constant>(*i));
+    } else if (isa<ConstantAggregateZero>(Init)) {
+      Constant *Elt = Constant::getNullValue(ATy->getElementType());
+      Elts.assign(ATy->getNumElements(), Elt);
+    } else if (isa<UndefValue>(Init)) {
+      Constant *Elt = UndefValue::get(ATy->getElementType());
+      Elts.assign(ATy->getNumElements(), Elt);
+    } else {
+      assert(0 && "This code is out of sync with "
+             " ConstantFoldLoadThroughGEPConstantExpr");
+    }
+    
+    assert(CI->getZExtValue() < ATy->getNumElements());
+    Elts[CI->getZExtValue()] =
+      EvaluateStoreInto(Elts[CI->getZExtValue()], Val, Addr, OpNo+1);
+    return ConstantArray::get(ATy, Elts);
+  }    
+}
+
+/// CommitValueTo - We have decided that Addr (which satisfies the predicate
+/// isSimpleEnoughPointerToCommit) should get Val as its value.  Make it happen.
+static void CommitValueTo(Constant *Val, Constant *Addr) {
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
+    assert(GV->hasInitializer());
+    GV->setInitializer(Val);
+    return;
+  }
+  
+  ConstantExpr *CE = cast<ConstantExpr>(Addr);
+  GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
+  
+  Constant *Init = GV->getInitializer();
+  Init = EvaluateStoreInto(Init, Val, CE, 2);
+  GV->setInitializer(Init);
+}
+
+/// ComputeLoadResult - Return the value that would be computed by a load from
+/// P after the stores reflected by 'memory' have been performed.  If we can't
+/// decide, return null.
+static Constant *ComputeLoadResult(Constant *P,
+                                const DenseMap<Constant*, Constant*> &Memory) {
+  // If this memory location has been recently stored, use the stored value: it
+  // is the most up-to-date.
+  DenseMap<Constant*, Constant*>::const_iterator I = Memory.find(P);
+  if (I != Memory.end()) return I->second;
+ 
+  // Access it.
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) {
+    if (GV->hasInitializer())
+      return GV->getInitializer();
+    return 0;
+  }
+  
+  // Handle a constantexpr getelementptr.
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(P))
+    if (CE->getOpcode() == Instruction::GetElementPtr &&
+        isa<GlobalVariable>(CE->getOperand(0))) {
+      GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
+      if (GV->hasInitializer())
+        return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE);
+    }
+
+  return 0;  // don't know how to evaluate.
+}
+
+/// EvaluateFunction - Evaluate a call to function F, returning true if
+/// successful, false if we can't evaluate it.  ActualArgs contains the formal
+/// arguments for the function.
+static bool EvaluateFunction(Function *F, Constant *&RetVal,
+                             const std::vector<Constant*> &ActualArgs,
+                             std::vector<Function*> &CallStack,
+                             DenseMap<Constant*, Constant*> &MutatedMemory,
+                             std::vector<GlobalVariable*> &AllocaTmps) {
+  // Check to see if this function is already executing (recursion).  If so,
+  // bail out.  TODO: we might want to accept limited recursion.
+  if (std::find(CallStack.begin(), CallStack.end(), F) != CallStack.end())
+    return false;
+  
+  CallStack.push_back(F);
+  
+  /// Values - As we compute SSA register values, we store their contents here.
+  DenseMap<Value*, Constant*> Values;
+  
+  // Initialize arguments to the incoming values specified.
+  unsigned ArgNo = 0;
+  for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E;
+       ++AI, ++ArgNo)
+    Values[AI] = ActualArgs[ArgNo];
+
+  /// ExecutedBlocks - We only handle non-looping, non-recursive code.  As such,
+  /// we can only evaluate any one basic block at most once.  This set keeps
+  /// track of what we have executed so we can detect recursive cases etc.
+  SmallPtrSet<BasicBlock*, 32> ExecutedBlocks;
+  
+  // CurInst - The current instruction we're evaluating.
+  BasicBlock::iterator CurInst = F->begin()->begin();
+  
+  // This is the main evaluation loop.
+  while (1) {
+    Constant *InstResult = 0;
+    
+    if (StoreInst *SI = dyn_cast<StoreInst>(CurInst)) {
+      if (SI->isVolatile()) return false;  // no volatile accesses.
+      Constant *Ptr = getVal(Values, SI->getOperand(1));
+      if (!isSimpleEnoughPointerToCommit(Ptr))
+        // If this is too complex for us to commit, reject it.
+        return false;
+      Constant *Val = getVal(Values, SI->getOperand(0));
+      MutatedMemory[Ptr] = Val;
+    } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(CurInst)) {
+      InstResult = ConstantExpr::get(BO->getOpcode(),
+                                     getVal(Values, BO->getOperand(0)),
+                                     getVal(Values, BO->getOperand(1)));
+    } else if (CmpInst *CI = dyn_cast<CmpInst>(CurInst)) {
+      InstResult = ConstantExpr::getCompare(CI->getPredicate(),
+                                            getVal(Values, CI->getOperand(0)),
+                                            getVal(Values, CI->getOperand(1)));
+    } else if (CastInst *CI = dyn_cast<CastInst>(CurInst)) {
+      InstResult = ConstantExpr::getCast(CI->getOpcode(),
+                                         getVal(Values, CI->getOperand(0)),
+                                         CI->getType());
+    } else if (SelectInst *SI = dyn_cast<SelectInst>(CurInst)) {
+      InstResult = ConstantExpr::getSelect(getVal(Values, SI->getOperand(0)),
+                                           getVal(Values, SI->getOperand(1)),
+                                           getVal(Values, SI->getOperand(2)));
+    } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurInst)) {
+      Constant *P = getVal(Values, GEP->getOperand(0));
+      SmallVector<Constant*, 8> GEPOps;
+      for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end();
+           i != e; ++i)
+        GEPOps.push_back(getVal(Values, *i));
+      InstResult = ConstantExpr::getGetElementPtr(P, &GEPOps[0], GEPOps.size());
+    } else if (LoadInst *LI = dyn_cast<LoadInst>(CurInst)) {
+      if (LI->isVolatile()) return false;  // no volatile accesses.
+      InstResult = ComputeLoadResult(getVal(Values, LI->getOperand(0)),
+                                     MutatedMemory);
+      if (InstResult == 0) return false; // Could not evaluate load.
+    } else if (AllocaInst *AI = dyn_cast<AllocaInst>(CurInst)) {
+      if (AI->isArrayAllocation()) return false;  // Cannot handle array allocs.
+      const Type *Ty = AI->getType()->getElementType();
+      AllocaTmps.push_back(new GlobalVariable(Ty, false,
+                                              GlobalValue::InternalLinkage,
+                                              UndefValue::get(Ty),
+                                              AI->getName()));
+      InstResult = AllocaTmps.back();     
+    } else if (CallInst *CI = dyn_cast<CallInst>(CurInst)) {
+
+      // Debug info can safely be ignored here.
+      if (isa<DbgInfoIntrinsic>(CI)) {
+        ++CurInst;
+        continue;
+      }
+
+      // Cannot handle inline asm.
+      if (isa<InlineAsm>(CI->getOperand(0))) return false;
+
+      // Resolve function pointers.
+      Function *Callee = dyn_cast<Function>(getVal(Values, CI->getOperand(0)));
+      if (!Callee) return false;  // Cannot resolve.
+
+      std::vector<Constant*> Formals;
+      for (User::op_iterator i = CI->op_begin() + 1, e = CI->op_end();
+           i != e; ++i)
+        Formals.push_back(getVal(Values, *i));
+      
+      if (Callee->isDeclaration()) {
+        // If this is a function we can constant fold, do it.
+        if (Constant *C = ConstantFoldCall(Callee, &Formals[0],
+                                           Formals.size())) {
+          InstResult = C;
+        } else {
+          return false;
+        }
+      } else {
+        if (Callee->getFunctionType()->isVarArg())
+          return false;
+        
+        Constant *RetVal;
+        // Execute the call, if successful, use the return value.
+        if (!EvaluateFunction(Callee, RetVal, Formals, CallStack,
+                              MutatedMemory, AllocaTmps))
+          return false;
+        InstResult = RetVal;
+      }
+    } else if (isa<TerminatorInst>(CurInst)) {
+      BasicBlock *NewBB = 0;
+      if (BranchInst *BI = dyn_cast<BranchInst>(CurInst)) {
+        if (BI->isUnconditional()) {
+          NewBB = BI->getSuccessor(0);
+        } else {
+          ConstantInt *Cond =
+            dyn_cast<ConstantInt>(getVal(Values, BI->getCondition()));
+          if (!Cond) return false;  // Cannot determine.
+
+          NewBB = BI->getSuccessor(!Cond->getZExtValue());          
+        }
+      } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurInst)) {
+        ConstantInt *Val =
+          dyn_cast<ConstantInt>(getVal(Values, SI->getCondition()));
+        if (!Val) return false;  // Cannot determine.
+        NewBB = SI->getSuccessor(SI->findCaseValue(Val));
+      } else if (ReturnInst *RI = dyn_cast<ReturnInst>(CurInst)) {
+        if (RI->getNumOperands())
+          RetVal = getVal(Values, RI->getOperand(0));
+        
+        CallStack.pop_back();  // return from fn.
+        return true;  // We succeeded at evaluating this ctor!
+      } else {
+        // invoke, unwind, unreachable.
+        return false;  // Cannot handle this terminator.
+      }
+      
+      // Okay, we succeeded in evaluating this control flow.  See if we have
+      // executed the new block before.  If so, we have a looping function,
+      // which we cannot evaluate in reasonable time.
+      if (!ExecutedBlocks.insert(NewBB))
+        return false;  // looped!
+      
+      // Okay, we have never been in this block before.  Check to see if there
+      // are any PHI nodes.  If so, evaluate them with information about where
+      // we came from.
+      BasicBlock *OldBB = CurInst->getParent();
+      CurInst = NewBB->begin();
+      PHINode *PN;
+      for (; (PN = dyn_cast<PHINode>(CurInst)); ++CurInst)
+        Values[PN] = getVal(Values, PN->getIncomingValueForBlock(OldBB));
+
+      // Do NOT increment CurInst.  We know that the terminator had no value.
+      continue;
+    } else {
+      // Did not know how to evaluate this!
+      return false;
+    }
+    
+    if (!CurInst->use_empty())
+      Values[CurInst] = InstResult;
+    
+    // Advance program counter.
+    ++CurInst;
+  }
+}
+
+/// EvaluateStaticConstructor - Evaluate static constructors in the function, if
+/// we can.  Return true if we can, false otherwise.
+static bool EvaluateStaticConstructor(Function *F) {
+  /// MutatedMemory - For each store we execute, we update this map.  Loads
+  /// check this to get the most up-to-date value.  If evaluation is successful,
+  /// this state is committed to the process.
+  DenseMap<Constant*, Constant*> MutatedMemory;
+
+  /// AllocaTmps - To 'execute' an alloca, we create a temporary global variable
+  /// to represent its body.  This vector is needed so we can delete the
+  /// temporary globals when we are done.
+  std::vector<GlobalVariable*> AllocaTmps;
+  
+  /// CallStack - This is used to detect recursion.  In pathological situations
+  /// we could hit exponential behavior, but at least there is nothing
+  /// unbounded.
+  std::vector<Function*> CallStack;
+
+  // Call the function.
+  Constant *RetValDummy;
+  bool EvalSuccess = EvaluateFunction(F, RetValDummy, std::vector<Constant*>(),
+                                       CallStack, MutatedMemory, AllocaTmps);
+  if (EvalSuccess) {
+    // We succeeded at evaluation: commit the result.
+    DOUT << "FULLY EVALUATED GLOBAL CTOR FUNCTION '"
+         << F->getName() << "' to " << MutatedMemory.size()
+         << " stores.\n";
+    for (DenseMap<Constant*, Constant*>::iterator I = MutatedMemory.begin(),
+         E = MutatedMemory.end(); I != E; ++I)
+      CommitValueTo(I->second, I->first);
+  }
+  
+  // At this point, we are done interpreting.  If we created any 'alloca'
+  // temporaries, release them now.
+  while (!AllocaTmps.empty()) {
+    GlobalVariable *Tmp = AllocaTmps.back();
+    AllocaTmps.pop_back();
+    
+    // If there are still users of the alloca, the program is doing something
+    // silly, e.g. storing the address of the alloca somewhere and using it
+    // later.  Since this is undefined, we'll just make it be null.
+    if (!Tmp->use_empty())
+      Tmp->replaceAllUsesWith(Constant::getNullValue(Tmp->getType()));
+    delete Tmp;
+  }
+  
+  return EvalSuccess;
+}
+
+
+
+/// OptimizeGlobalCtorsList - Simplify and evaluation global ctors if possible.
+/// Return true if anything changed.
+bool GlobalOpt::OptimizeGlobalCtorsList(GlobalVariable *&GCL) {
+  std::vector<Function*> Ctors = ParseGlobalCtors(GCL);
+  bool MadeChange = false;
+  if (Ctors.empty()) return false;
+  
+  // Loop over global ctors, optimizing them when we can.
+  for (unsigned i = 0; i != Ctors.size(); ++i) {
+    Function *F = Ctors[i];
+    // Found a null terminator in the middle of the list, prune off the rest of
+    // the list.
+    if (F == 0) {
+      if (i != Ctors.size()-1) {
+        Ctors.resize(i+1);
+        MadeChange = true;
+      }
+      break;
+    }
+    
+    // We cannot simplify external ctor functions.
+    if (F->empty()) continue;
+    
+    // If we can evaluate the ctor at compile time, do.
+    if (EvaluateStaticConstructor(F)) {
+      Ctors.erase(Ctors.begin()+i);
+      MadeChange = true;
+      --i;
+      ++NumCtorsEvaluated;
+      continue;
+    }
+  }
+  
+  if (!MadeChange) return false;
+  
+  GCL = InstallGlobalCtors(GCL, Ctors);
+  return true;
+}
+
+bool GlobalOpt::OptimizeGlobalAliases(Module &M) {
+  bool Changed = false;
+
+  for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
+       I != E;) {
+    Module::alias_iterator J = I++;
+    // Aliases without names cannot be referenced outside this module.
+    if (!J->hasName() && !J->isDeclaration())
+      J->setLinkage(GlobalValue::InternalLinkage);
+    // If the aliasee may change at link time, nothing can be done - bail out.
+    if (J->mayBeOverridden())
+      continue;
+
+    Constant *Aliasee = J->getAliasee();
+    GlobalValue *Target = cast<GlobalValue>(Aliasee->stripPointerCasts());
+    Target->removeDeadConstantUsers();
+    bool hasOneUse = Target->hasOneUse() && Aliasee->hasOneUse();
+
+    // Make all users of the alias use the aliasee instead.
+    if (!J->use_empty()) {
+      J->replaceAllUsesWith(Aliasee);
+      ++NumAliasesResolved;
+      Changed = true;
+    }
+
+    // If the aliasee has internal linkage, give it the name and linkage
+    // of the alias, and delete the alias.  This turns:
+    //   define internal ... @f(...)
+    //   @a = alias ... @f
+    // into:
+    //   define ... @a(...)
+    if (!Target->hasLocalLinkage())
+      continue;
+
+    // The transform is only useful if the alias does not have internal linkage.
+    if (J->hasLocalLinkage())
+      continue;
+
+    // Do not perform the transform if multiple aliases potentially target the
+    // aliasee.  This check also ensures that it is safe to replace the section
+    // and other attributes of the aliasee with those of the alias.
+    if (!hasOneUse)
+      continue;
+
+    // Give the aliasee the name, linkage and other attributes of the alias.
+    Target->takeName(J);
+    Target->setLinkage(J->getLinkage());
+    Target->GlobalValue::copyAttributesFrom(J);
+
+    // Delete the alias.
+    M.getAliasList().erase(J);
+    ++NumAliasesRemoved;
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+bool GlobalOpt::runOnModule(Module &M) {
+  bool Changed = false;
+  
+  // Try to find the llvm.globalctors list.
+  GlobalVariable *GlobalCtors = FindGlobalCtors(M);
+
+  bool LocalChange = true;
+  while (LocalChange) {
+    LocalChange = false;
+    
+    // Delete functions that are trivially dead, ccc -> fastcc
+    LocalChange |= OptimizeFunctions(M);
+    
+    // Optimize global_ctors list.
+    if (GlobalCtors)
+      LocalChange |= OptimizeGlobalCtorsList(GlobalCtors);
+    
+    // Optimize non-address-taken globals.
+    LocalChange |= OptimizeGlobalVars(M);
+
+    // Resolve aliases, when possible.
+    LocalChange |= OptimizeGlobalAliases(M);
+    Changed |= LocalChange;
+  }
+  
+  // TODO: Move all global ctors functions to the end of the module for code
+  // layout.
+  
+  return Changed;
+}
diff --git a/lib/Transforms/IPO/IPConstantPropagation.cpp b/lib/Transforms/IPO/IPConstantPropagation.cpp
new file mode 100644
index 0000000..2dc8558
--- /dev/null
+++ b/lib/Transforms/IPO/IPConstantPropagation.cpp
@@ -0,0 +1,277 @@
+//===-- IPConstantPropagation.cpp - Propagate constants through calls -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements an _extremely_ simple interprocedural constant
+// propagation pass.  It could certainly be improved in many different ways,
+// like using a worklist.  This pass makes arguments dead, but does not remove
+// them.  The existing dead argument elimination pass should be run after this
+// to clean up the mess.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ipconstprop"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/SmallVector.h"
+using namespace llvm;
+
+STATISTIC(NumArgumentsProped, "Number of args turned into constants");
+STATISTIC(NumReturnValProped, "Number of return values turned into constants");
+
+namespace {
+  /// IPCP - The interprocedural constant propagation pass
+  ///
+  struct VISIBILITY_HIDDEN IPCP : public ModulePass {
+    static char ID; // Pass identification, replacement for typeid
+    IPCP() : ModulePass(&ID) {}
+
+    bool runOnModule(Module &M);
+  private:
+    bool PropagateConstantsIntoArguments(Function &F);
+    bool PropagateConstantReturn(Function &F);
+  };
+}
+
+char IPCP::ID = 0;
+static RegisterPass<IPCP>
+X("ipconstprop", "Interprocedural constant propagation");
+
+ModulePass *llvm::createIPConstantPropagationPass() { return new IPCP(); }
+
+bool IPCP::runOnModule(Module &M) {
+  bool Changed = false;
+  bool LocalChange = true;
+
+  // FIXME: instead of using smart algorithms, we just iterate until we stop
+  // making changes.
+  while (LocalChange) {
+    LocalChange = false;
+    for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+      if (!I->isDeclaration()) {
+        // Delete any klingons.
+        I->removeDeadConstantUsers();
+        if (I->hasLocalLinkage())
+          LocalChange |= PropagateConstantsIntoArguments(*I);
+        Changed |= PropagateConstantReturn(*I);
+      }
+    Changed |= LocalChange;
+  }
+  return Changed;
+}
+
+/// PropagateConstantsIntoArguments - Look at all uses of the specified
+/// function.  If all uses are direct call sites, and all pass a particular
+/// constant in for an argument, propagate that constant in as the argument.
+///
+bool IPCP::PropagateConstantsIntoArguments(Function &F) {
+  if (F.arg_empty() || F.use_empty()) return false; // No arguments? Early exit.
+
+  // For each argument, keep track of its constant value and whether it is a
+  // constant or not.  The bool is driven to true when found to be non-constant.
+  SmallVector<std::pair<Constant*, bool>, 16> ArgumentConstants;
+  ArgumentConstants.resize(F.arg_size());
+
+  unsigned NumNonconstant = 0;
+  for (Value::use_iterator UI = F.use_begin(), E = F.use_end(); UI != E; ++UI) {
+    // Used by a non-instruction, or not the callee of a function, do not
+    // transform.
+    if (!isa<CallInst>(*UI) && !isa<InvokeInst>(*UI))
+      return false;
+    
+    CallSite CS = CallSite::get(cast<Instruction>(*UI));
+    if (!CS.isCallee(UI))
+      return false;
+
+    // Check out all of the potentially constant arguments.  Note that we don't
+    // inspect varargs here.
+    CallSite::arg_iterator AI = CS.arg_begin();
+    Function::arg_iterator Arg = F.arg_begin();
+    for (unsigned i = 0, e = ArgumentConstants.size(); i != e;
+         ++i, ++AI, ++Arg) {
+      
+      // If this argument is known non-constant, ignore it.
+      if (ArgumentConstants[i].second)
+        continue;
+      
+      Constant *C = dyn_cast<Constant>(*AI);
+      if (C && ArgumentConstants[i].first == 0) {
+        ArgumentConstants[i].first = C;   // First constant seen.
+      } else if (C && ArgumentConstants[i].first == C) {
+        // Still the constant value we think it is.
+      } else if (*AI == &*Arg) {
+        // Ignore recursive calls passing argument down.
+      } else {
+        // Argument became non-constant.  If all arguments are non-constant now,
+        // give up on this function.
+        if (++NumNonconstant == ArgumentConstants.size())
+          return false;
+        ArgumentConstants[i].second = true;
+      }
+    }
+  }
+
+  // If we got to this point, there is a constant argument!
+  assert(NumNonconstant != ArgumentConstants.size());
+  bool MadeChange = false;
+  Function::arg_iterator AI = F.arg_begin();
+  for (unsigned i = 0, e = ArgumentConstants.size(); i != e; ++i, ++AI) {
+    // Do we have a constant argument?
+    if (ArgumentConstants[i].second || AI->use_empty())
+      continue;
+  
+    Value *V = ArgumentConstants[i].first;
+    if (V == 0) V = UndefValue::get(AI->getType());
+    AI->replaceAllUsesWith(V);
+    ++NumArgumentsProped;
+    MadeChange = true;
+  }
+  return MadeChange;
+}
+
+
+// Check to see if this function returns one or more constants. If so, replace
+// all callers that use those return values with the constant value. This will
+// leave in the actual return values and instructions, but deadargelim will
+// clean that up.
+//
+// Additionally if a function always returns one of its arguments directly,
+// callers will be updated to use the value they pass in directly instead of
+// using the return value.
+bool IPCP::PropagateConstantReturn(Function &F) {
+  if (F.getReturnType() == Type::VoidTy)
+    return false; // No return value.
+
+  // If this function could be overridden later in the link stage, we can't
+  // propagate information about its results into callers.
+  if (F.mayBeOverridden())
+    return false;
+  
+  // Check to see if this function returns a constant.
+  SmallVector<Value *,4> RetVals;
+  const StructType *STy = dyn_cast<StructType>(F.getReturnType());
+  if (STy)
+    for (unsigned i = 0, e = STy->getNumElements(); i < e; ++i) 
+      RetVals.push_back(UndefValue::get(STy->getElementType(i)));
+  else
+    RetVals.push_back(UndefValue::get(F.getReturnType()));
+
+  unsigned NumNonConstant = 0;
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+    if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
+      for (unsigned i = 0, e = RetVals.size(); i != e; ++i) {
+        // Already found conflicting return values?
+        Value *RV = RetVals[i];
+        if (!RV)
+          continue;
+
+        // Find the returned value
+        Value *V;
+        if (!STy)
+          V = RI->getOperand(i);
+        else
+          V = FindInsertedValue(RI->getOperand(0), i);
+
+        if (V) {
+          // Ignore undefs, we can change them into anything
+          if (isa<UndefValue>(V))
+            continue;
+          
+          // Try to see if all the rets return the same constant or argument.
+          if (isa<Constant>(V) || isa<Argument>(V)) {
+            if (isa<UndefValue>(RV)) {
+              // No value found yet? Try the current one.
+              RetVals[i] = V;
+              continue;
+            }
+            // Returning the same value? Good.
+            if (RV == V)
+              continue;
+          }
+        }
+        // Different or no known return value? Don't propagate this return
+        // value.
+        RetVals[i] = 0;
+        // All values non constant? Stop looking.
+        if (++NumNonConstant == RetVals.size())
+          return false;
+      }
+    }
+
+  // If we got here, the function returns at least one constant value.  Loop
+  // over all users, replacing any uses of the return value with the returned
+  // constant.
+  bool MadeChange = false;
+  for (Value::use_iterator UI = F.use_begin(), E = F.use_end(); UI != E; ++UI) {
+    CallSite CS = CallSite::get(*UI);
+    Instruction* Call = CS.getInstruction();
+
+    // Not a call instruction or a call instruction that's not calling F
+    // directly?
+    if (!Call || !CS.isCallee(UI))
+      continue;
+    
+    // Call result not used?
+    if (Call->use_empty())
+      continue;
+
+    MadeChange = true;
+
+    if (STy == 0) {
+      Value* New = RetVals[0];
+      if (Argument *A = dyn_cast<Argument>(New))
+        // Was an argument returned? Then find the corresponding argument in
+        // the call instruction and use that.
+        New = CS.getArgument(A->getArgNo());
+      Call->replaceAllUsesWith(New);
+      continue;
+    }
+   
+    for (Value::use_iterator I = Call->use_begin(), E = Call->use_end();
+         I != E;) {
+      Instruction *Ins = dyn_cast<Instruction>(*I);
+
+      // Increment now, so we can remove the use
+      ++I;
+
+      // Not an instruction? Ignore
+      if (!Ins)
+        continue;
+
+      // Find the index of the retval to replace with
+      int index = -1;
+      if (ExtractValueInst *EV = dyn_cast<ExtractValueInst>(Ins))
+        if (EV->hasIndices())
+          index = *EV->idx_begin();
+
+      // If this use uses a specific return value, and we have a replacement,
+      // replace it.
+      if (index != -1) {
+        Value *New = RetVals[index];
+        if (New) {
+          if (Argument *A = dyn_cast<Argument>(New))
+            // Was an argument returned? Then find the corresponding argument in
+            // the call instruction and use that.
+            New = CS.getArgument(A->getArgNo());
+          Ins->replaceAllUsesWith(New);
+          Ins->eraseFromParent();
+        }
+      }
+    }
+  }
+
+  if (MadeChange) ++NumReturnValProped;
+  return MadeChange;
+}
diff --git a/lib/Transforms/IPO/IPO.cpp b/lib/Transforms/IPO/IPO.cpp
new file mode 100644
index 0000000..43066076
--- /dev/null
+++ b/lib/Transforms/IPO/IPO.cpp
@@ -0,0 +1,75 @@
+//===-- Scalar.cpp --------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the C bindings for libLLVMIPO.a, which implements
+// several transformations over the LLVM intermediate representation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/Transforms/IPO.h"
+#include "llvm/PassManager.h"
+#include "llvm/Transforms/IPO.h"
+
+using namespace llvm;
+
+void LLVMAddArgumentPromotionPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createArgumentPromotionPass());
+}
+
+void LLVMAddConstantMergePass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createConstantMergePass());
+}
+
+void LLVMAddDeadArgEliminationPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createDeadArgEliminationPass());
+}
+
+void LLVMAddDeadTypeEliminationPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createDeadTypeEliminationPass());
+}
+
+void LLVMAddFunctionAttrsPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createFunctionAttrsPass());
+}
+
+void LLVMAddFunctionInliningPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createFunctionInliningPass());
+}
+
+void LLVMAddGlobalDCEPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createGlobalDCEPass());
+}
+
+void LLVMAddGlobalOptimizerPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createGlobalOptimizerPass());
+}
+
+void LLVMAddIPConstantPropagationPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createIPConstantPropagationPass());
+}
+
+void LLVMAddLowerSetJmpPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLowerSetJmpPass());
+}
+
+void LLVMAddPruneEHPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createPruneEHPass());
+}
+
+void LLVMAddRaiseAllocationsPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createRaiseAllocationsPass());
+}
+
+void LLVMAddStripDeadPrototypesPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createStripDeadPrototypesPass());
+}
+
+void LLVMAddStripSymbolsPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createStripSymbolsPass());
+}
diff --git a/lib/Transforms/IPO/IndMemRemoval.cpp b/lib/Transforms/IPO/IndMemRemoval.cpp
new file mode 100644
index 0000000..b55dea2
--- /dev/null
+++ b/lib/Transforms/IPO/IndMemRemoval.cpp
@@ -0,0 +1,89 @@
+//===-- IndMemRemoval.cpp - Remove indirect allocations and frees ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass finds places where memory allocation functions may escape into
+// indirect land.  Some transforms are much easier (aka possible) only if free 
+// or malloc are not called indirectly.
+// Thus find places where the address of memory functions are taken and construct
+// bounce functions with direct calls of those functions.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "indmemrem"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Pass.h"
+#include "llvm/Module.h"
+#include "llvm/Instructions.h"
+#include "llvm/Type.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+using namespace llvm;
+
+STATISTIC(NumBounceSites, "Number of sites modified");
+STATISTIC(NumBounce     , "Number of bounce functions created");
+
+namespace {
+  class VISIBILITY_HIDDEN IndMemRemPass : public ModulePass {
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    IndMemRemPass() : ModulePass(&ID) {}
+
+    virtual bool runOnModule(Module &M);
+  };
+} // end anonymous namespace
+
+char IndMemRemPass::ID = 0;
+static RegisterPass<IndMemRemPass>
+X("indmemrem","Indirect Malloc and Free Removal");
+
+bool IndMemRemPass::runOnModule(Module &M) {
+  // In theory, all direct calls of malloc and free should be promoted
+  // to intrinsics.  Therefore, this goes through and finds where the
+  // address of free or malloc are taken and replaces those with bounce
+  // functions, ensuring that all malloc and free that might happen
+  // happen through intrinsics.
+  bool changed = false;
+  if (Function* F = M.getFunction("free")) {
+    if (F->isDeclaration() && F->arg_size() == 1 && !F->use_empty()) {
+      Function* FN = Function::Create(F->getFunctionType(),
+                                      GlobalValue::LinkOnceAnyLinkage,
+                                      "free_llvm_bounce", &M);
+      BasicBlock* bb = BasicBlock::Create("entry",FN);
+      Instruction* R = ReturnInst::Create(bb);
+      new FreeInst(FN->arg_begin(), R);
+      ++NumBounce;
+      NumBounceSites += F->getNumUses();
+      F->replaceAllUsesWith(FN);
+      changed = true;
+    }
+  }
+  if (Function* F = M.getFunction("malloc")) {
+    if (F->isDeclaration() && F->arg_size() == 1 && !F->use_empty()) {
+      Function* FN = Function::Create(F->getFunctionType(), 
+                                      GlobalValue::LinkOnceAnyLinkage,
+                                      "malloc_llvm_bounce", &M);
+      FN->setDoesNotAlias(0);
+      BasicBlock* bb = BasicBlock::Create("entry",FN);
+      Instruction* c = CastInst::CreateIntegerCast(
+          FN->arg_begin(), Type::Int32Ty, false, "c", bb);
+      Instruction* a = new MallocInst(Type::Int8Ty, c, "m", bb);
+      ReturnInst::Create(a, bb);
+      ++NumBounce;
+      NumBounceSites += F->getNumUses();
+      F->replaceAllUsesWith(FN);
+      changed = true;
+    }
+  }
+  return changed;
+}
+
+ModulePass *llvm::createIndMemRemPass() {
+  return new IndMemRemPass();
+}
diff --git a/lib/Transforms/IPO/InlineAlways.cpp b/lib/Transforms/IPO/InlineAlways.cpp
new file mode 100644
index 0000000..5f9ea54
--- /dev/null
+++ b/lib/Transforms/IPO/InlineAlways.cpp
@@ -0,0 +1,75 @@
+//===- InlineAlways.cpp - Code to inline always_inline functions ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a custom inliner that handles only functions that
+// are marked as "always inline".
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "inline"
+#include "llvm/CallingConv.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#include "llvm/Type.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/InlinerPass.h"
+#include "llvm/Transforms/Utils/InlineCost.h"
+#include "llvm/ADT/SmallPtrSet.h"
+
+using namespace llvm;
+
+namespace {
+
+  // AlwaysInliner only inlines functions that are mark as "always inline".
+  class VISIBILITY_HIDDEN AlwaysInliner : public Inliner {
+    // Functions that are never inlined
+    SmallPtrSet<const Function*, 16> NeverInline; 
+    InlineCostAnalyzer CA;
+  public:
+    // Use extremely low threshold. 
+    AlwaysInliner() : Inliner(&ID, -2000000000) {}
+    static char ID; // Pass identification, replacement for typeid
+    InlineCost getInlineCost(CallSite CS) {
+      return CA.getInlineCost(CS, NeverInline);
+    }
+    float getInlineFudgeFactor(CallSite CS) {
+      return CA.getInlineFudgeFactor(CS);
+    }
+    void resetCachedCostInfo(Function *Caller) {
+      return CA.resetCachedCostInfo(Caller);
+    }
+    virtual bool doFinalization(CallGraph &CG) { 
+      return removeDeadFunctions(CG, &NeverInline); 
+    }
+    virtual bool doInitialization(CallGraph &CG);
+  };
+}
+
+char AlwaysInliner::ID = 0;
+static RegisterPass<AlwaysInliner>
+X("always-inline", "Inliner for always_inline functions");
+
+Pass *llvm::createAlwaysInlinerPass() { return new AlwaysInliner(); }
+
+// doInitialization - Initializes the vector of functions that have not 
+// been annotated with the "always inline" attribute.
+bool AlwaysInliner::doInitialization(CallGraph &CG) {
+  Module &M = CG.getModule();
+  
+  for (Module::iterator I = M.begin(), E = M.end();
+       I != E; ++I)
+    if (!I->isDeclaration() && !I->hasFnAttr(Attribute::AlwaysInline))
+      NeverInline.insert(I);
+
+  return false;
+}
diff --git a/lib/Transforms/IPO/InlineSimple.cpp b/lib/Transforms/IPO/InlineSimple.cpp
new file mode 100644
index 0000000..e107a00
--- /dev/null
+++ b/lib/Transforms/IPO/InlineSimple.cpp
@@ -0,0 +1,106 @@
+//===- InlineSimple.cpp - Code to perform simple function inlining --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements bottom-up inlining of functions into callees.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "inline"
+#include "llvm/CallingConv.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#include "llvm/Type.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/InlinerPass.h"
+#include "llvm/Transforms/Utils/InlineCost.h"
+#include "llvm/ADT/SmallPtrSet.h"
+
+using namespace llvm;
+
+namespace {
+
+  class VISIBILITY_HIDDEN SimpleInliner : public Inliner {
+    // Functions that are never inlined
+    SmallPtrSet<const Function*, 16> NeverInline; 
+    InlineCostAnalyzer CA;
+  public:
+    SimpleInliner() : Inliner(&ID) {}
+    SimpleInliner(int Threshold) : Inliner(&ID, Threshold) {}
+    static char ID; // Pass identification, replacement for typeid
+    InlineCost getInlineCost(CallSite CS) {
+      return CA.getInlineCost(CS, NeverInline);
+    }
+    float getInlineFudgeFactor(CallSite CS) {
+      return CA.getInlineFudgeFactor(CS);
+    }
+    void resetCachedCostInfo(Function *Caller) {
+      CA.resetCachedCostInfo(Caller);
+    }
+    virtual bool doInitialization(CallGraph &CG);
+  };
+}
+
+char SimpleInliner::ID = 0;
+static RegisterPass<SimpleInliner>
+X("inline", "Function Integration/Inlining");
+
+Pass *llvm::createFunctionInliningPass() { return new SimpleInliner(); }
+
+Pass *llvm::createFunctionInliningPass(int Threshold) { 
+  return new SimpleInliner(Threshold);
+}
+
+// doInitialization - Initializes the vector of functions that have been
+// annotated with the noinline attribute.
+bool SimpleInliner::doInitialization(CallGraph &CG) {
+  
+  Module &M = CG.getModule();
+  
+  for (Module::iterator I = M.begin(), E = M.end();
+       I != E; ++I)
+    if (!I->isDeclaration() && I->hasFnAttr(Attribute::NoInline))
+      NeverInline.insert(I);
+
+  // Get llvm.noinline
+  GlobalVariable *GV = M.getNamedGlobal("llvm.noinline");
+  
+  if (GV == 0)
+    return false;
+
+  // Don't crash on invalid code
+  if (!GV->hasInitializer())
+    return false;
+  
+  const ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer());
+  
+  if (InitList == 0)
+    return false;
+
+  // Iterate over each element and add to the NeverInline set
+  for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) {
+        
+    // Get Source
+    const Constant *Elt = InitList->getOperand(i);
+        
+    if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(Elt))
+      if (CE->getOpcode() == Instruction::BitCast) 
+        Elt = CE->getOperand(0);
+    
+    // Insert into set of functions to never inline
+    if (const Function *F = dyn_cast<Function>(Elt))
+      NeverInline.insert(F);
+  }
+  
+  return false;
+}
+
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
new file mode 100644
index 0000000..b382837
--- /dev/null
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -0,0 +1,278 @@
+//===- Inliner.cpp - Code common to all inliners --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the mechanics required to implement inlining without
+// missing any calls and updating the call graph.  The decisions of which calls
+// are profitable to inline are implemented elsewhere.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "inline"
+#include "llvm/Module.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Transforms/IPO/InlinerPass.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/Statistic.h"
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumInlined, "Number of functions inlined");
+STATISTIC(NumDeleted, "Number of functions deleted because all callers found");
+
+static cl::opt<int>
+InlineLimit("inline-threshold", cl::Hidden, cl::init(200),
+        cl::desc("Control the amount of inlining to perform (default = 200)"));
+
+Inliner::Inliner(void *ID) 
+  : CallGraphSCCPass(ID), InlineThreshold(InlineLimit) {}
+
+Inliner::Inliner(void *ID, int Threshold) 
+  : CallGraphSCCPass(ID), InlineThreshold(Threshold) {}
+
+/// getAnalysisUsage - For this class, we declare that we require and preserve
+/// the call graph.  If the derived class implements this method, it should
+/// always explicitly call the implementation here.
+void Inliner::getAnalysisUsage(AnalysisUsage &Info) const {
+  Info.addRequired<TargetData>();
+  CallGraphSCCPass::getAnalysisUsage(Info);
+}
+
+// InlineCallIfPossible - If it is possible to inline the specified call site,
+// do so and update the CallGraph for this operation.
+bool Inliner::InlineCallIfPossible(CallSite CS, CallGraph &CG,
+                                 const SmallPtrSet<Function*, 8> &SCCFunctions,
+                                 const TargetData &TD) {
+  Function *Callee = CS.getCalledFunction();
+  Function *Caller = CS.getCaller();
+
+  if (!InlineFunction(CS, &CG, &TD)) return false;
+
+  // If the inlined function had a higher stack protection level than the
+  // calling function, then bump up the caller's stack protection level.
+  if (Callee->hasFnAttr(Attribute::StackProtectReq))
+    Caller->addFnAttr(Attribute::StackProtectReq);
+  else if (Callee->hasFnAttr(Attribute::StackProtect) &&
+           !Caller->hasFnAttr(Attribute::StackProtectReq))
+    Caller->addFnAttr(Attribute::StackProtect);
+
+  // If we inlined the last possible call site to the function, delete the
+  // function body now.
+  if (Callee->use_empty() && (Callee->hasLocalLinkage() ||
+                              Callee->hasAvailableExternallyLinkage()) &&
+      !SCCFunctions.count(Callee)) {
+    DOUT << "    -> Deleting dead function: " << Callee->getName() << "\n";
+    CallGraphNode *CalleeNode = CG[Callee];
+
+    // Remove any call graph edges from the callee to its callees.
+    CalleeNode->removeAllCalledFunctions();
+
+    resetCachedCostInfo(CalleeNode->getFunction());
+
+    // Removing the node for callee from the call graph and delete it.
+    delete CG.removeFunctionFromModule(CalleeNode);
+    ++NumDeleted;
+  }
+  return true;
+}
+        
+/// shouldInline - Return true if the inliner should attempt to inline
+/// at the given CallSite.
+bool Inliner::shouldInline(CallSite CS) {
+  InlineCost IC = getInlineCost(CS);
+  float FudgeFactor = getInlineFudgeFactor(CS);
+  
+  if (IC.isAlways()) {
+    DOUT << "    Inlining: cost=always"
+         << ", Call: " << *CS.getInstruction();
+    return true;
+  }
+  
+  if (IC.isNever()) {
+    DOUT << "    NOT Inlining: cost=never"
+         << ", Call: " << *CS.getInstruction();
+    return false;
+  }
+  
+  int Cost = IC.getValue();
+  int CurrentThreshold = InlineThreshold;
+  Function *Fn = CS.getCaller();
+  if (Fn && !Fn->isDeclaration() 
+      && Fn->hasFnAttr(Attribute::OptimizeForSize)
+      && InlineThreshold != 50) {
+    CurrentThreshold = 50;
+  }
+  
+  if (Cost >= (int)(CurrentThreshold * FudgeFactor)) {
+    DOUT << "    NOT Inlining: cost=" << Cost
+         << ", Call: " << *CS.getInstruction();
+    return false;
+  } else {
+    DOUT << "    Inlining: cost=" << Cost
+         << ", Call: " << *CS.getInstruction();
+    return true;
+  }
+}
+
+bool Inliner::runOnSCC(const std::vector<CallGraphNode*> &SCC) {
+  CallGraph &CG = getAnalysis<CallGraph>();
+  TargetData &TD = getAnalysis<TargetData>();
+
+  SmallPtrSet<Function*, 8> SCCFunctions;
+  DOUT << "Inliner visiting SCC:";
+  for (unsigned i = 0, e = SCC.size(); i != e; ++i) {
+    Function *F = SCC[i]->getFunction();
+    if (F) SCCFunctions.insert(F);
+    DOUT << " " << (F ? F->getName() : "INDIRECTNODE");
+  }
+
+  // Scan through and identify all call sites ahead of time so that we only
+  // inline call sites in the original functions, not call sites that result
+  // from inlining other functions.
+  std::vector<CallSite> CallSites;
+
+  for (unsigned i = 0, e = SCC.size(); i != e; ++i)
+    if (Function *F = SCC[i]->getFunction())
+      for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+        for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) {
+          CallSite CS = CallSite::get(I);
+          if (CS.getInstruction() && !isa<DbgInfoIntrinsic>(I) &&
+                                     (!CS.getCalledFunction() ||
+                                      !CS.getCalledFunction()->isDeclaration()))
+            CallSites.push_back(CS);
+        }
+
+  DOUT << ": " << CallSites.size() << " call sites.\n";
+
+  // Now that we have all of the call sites, move the ones to functions in the
+  // current SCC to the end of the list.
+  unsigned FirstCallInSCC = CallSites.size();
+  for (unsigned i = 0; i < FirstCallInSCC; ++i)
+    if (Function *F = CallSites[i].getCalledFunction())
+      if (SCCFunctions.count(F))
+        std::swap(CallSites[i--], CallSites[--FirstCallInSCC]);
+
+  // Now that we have all of the call sites, loop over them and inline them if
+  // it looks profitable to do so.
+  bool Changed = false;
+  bool LocalChange;
+  do {
+    LocalChange = false;
+    // Iterate over the outer loop because inlining functions can cause indirect
+    // calls to become direct calls.
+    for (unsigned CSi = 0; CSi != CallSites.size(); ++CSi)
+      if (Function *Callee = CallSites[CSi].getCalledFunction()) {
+        // Calls to external functions are never inlinable.
+        if (Callee->isDeclaration()) {
+          if (SCC.size() == 1) {
+            std::swap(CallSites[CSi], CallSites.back());
+            CallSites.pop_back();
+          } else {
+            // Keep the 'in SCC / not in SCC' boundary correct.
+            CallSites.erase(CallSites.begin()+CSi);
+          }
+          --CSi;
+          continue;
+        }
+
+        // If the policy determines that we should inline this function,
+        // try to do so.
+        CallSite CS = CallSites[CSi];
+        if (shouldInline(CS)) {
+          Function *Caller = CS.getCaller();
+          // Attempt to inline the function...
+          if (InlineCallIfPossible(CS, CG, SCCFunctions, TD)) {
+            // Remove any cached cost info for this caller, as inlining the
+            // callee has increased the size of the caller (which may be the
+            // same as the callee).
+            resetCachedCostInfo(Caller);
+
+            // Remove this call site from the list.  If possible, use 
+            // swap/pop_back for efficiency, but do not use it if doing so would
+            // move a call site to a function in this SCC before the
+            // 'FirstCallInSCC' barrier.
+            if (SCC.size() == 1) {
+              std::swap(CallSites[CSi], CallSites.back());
+              CallSites.pop_back();
+            } else {
+              CallSites.erase(CallSites.begin()+CSi);
+            }
+            --CSi;
+
+            ++NumInlined;
+            Changed = true;
+            LocalChange = true;
+          }
+        }
+      }
+  } while (LocalChange);
+
+  return Changed;
+}
+
+// doFinalization - Remove now-dead linkonce functions at the end of
+// processing to avoid breaking the SCC traversal.
+bool Inliner::doFinalization(CallGraph &CG) {
+  return removeDeadFunctions(CG);
+}
+
+  /// removeDeadFunctions - Remove dead functions that are not included in
+  /// DNR (Do Not Remove) list.
+bool Inliner::removeDeadFunctions(CallGraph &CG, 
+                                 SmallPtrSet<const Function *, 16> *DNR) {
+  std::set<CallGraphNode*> FunctionsToRemove;
+
+  // Scan for all of the functions, looking for ones that should now be removed
+  // from the program.  Insert the dead ones in the FunctionsToRemove set.
+  for (CallGraph::iterator I = CG.begin(), E = CG.end(); I != E; ++I) {
+    CallGraphNode *CGN = I->second;
+    if (Function *F = CGN ? CGN->getFunction() : 0) {
+      // If the only remaining users of the function are dead constants, remove
+      // them.
+      F->removeDeadConstantUsers();
+
+      if (DNR && DNR->count(F))
+        continue;
+
+      if ((F->hasLinkOnceLinkage() || F->hasLocalLinkage()) &&
+          F->use_empty()) {
+
+        // Remove any call graph edges from the function to its callees.
+        CGN->removeAllCalledFunctions();
+
+        // Remove any edges from the external node to the function's call graph
+        // node.  These edges might have been made irrelegant due to
+        // optimization of the program.
+        CG.getExternalCallingNode()->removeAnyCallEdgeTo(CGN);
+
+        // Removing the node for callee from the call graph and delete it.
+        FunctionsToRemove.insert(CGN);
+      }
+    }
+  }
+
+  // Now that we know which functions to delete, do so.  We didn't want to do
+  // this inline, because that would invalidate our CallGraph::iterator
+  // objects. :(
+  bool Changed = false;
+  for (std::set<CallGraphNode*>::iterator I = FunctionsToRemove.begin(),
+         E = FunctionsToRemove.end(); I != E; ++I) {
+    resetCachedCostInfo((*I)->getFunction());
+    delete CG.removeFunctionFromModule(*I);
+    ++NumDeleted;
+    Changed = true;
+  }
+
+  return Changed;
+}
diff --git a/lib/Transforms/IPO/Internalize.cpp b/lib/Transforms/IPO/Internalize.cpp
new file mode 100644
index 0000000..5093ae9
--- /dev/null
+++ b/lib/Transforms/IPO/Internalize.cpp
@@ -0,0 +1,184 @@
+//===-- Internalize.cpp - Mark functions internal -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass loops over all of the functions in the input module, looking for a
+// main function.  If a main function is found, all other functions and all
+// global variables with initializers are marked as internal.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "internalize"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Pass.h"
+#include "llvm/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/Statistic.h"
+#include <fstream>
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumAliases  , "Number of aliases internalized");
+STATISTIC(NumFunctions, "Number of functions internalized");
+STATISTIC(NumGlobals  , "Number of global vars internalized");
+
+// APIFile - A file which contains a list of symbols that should not be marked
+// external.
+static cl::opt<std::string>
+APIFile("internalize-public-api-file", cl::value_desc("filename"),
+        cl::desc("A file containing list of symbol names to preserve"));
+
+// APIList - A list of symbols that should not be marked internal.
+static cl::list<std::string>
+APIList("internalize-public-api-list", cl::value_desc("list"),
+        cl::desc("A list of symbol names to preserve"),
+        cl::CommaSeparated);
+
+namespace {
+  class VISIBILITY_HIDDEN InternalizePass : public ModulePass {
+    std::set<std::string> ExternalNames;
+    /// If no api symbols were specified and a main function is defined,
+    /// assume the main function is the only API
+    bool AllButMain;
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    explicit InternalizePass(bool AllButMain = true);
+    explicit InternalizePass(const std::vector <const char *>& exportList);
+    void LoadFile(const char *Filename);
+    virtual bool runOnModule(Module &M);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addPreserved<CallGraph>();
+    }
+  };
+} // end anonymous namespace
+
+char InternalizePass::ID = 0;
+static RegisterPass<InternalizePass>
+X("internalize", "Internalize Global Symbols");
+
+InternalizePass::InternalizePass(bool AllButMain)
+  : ModulePass(&ID), AllButMain(AllButMain){
+  if (!APIFile.empty())           // If a filename is specified, use it.
+    LoadFile(APIFile.c_str());
+  if (!APIList.empty())           // If a list is specified, use it as well.
+    ExternalNames.insert(APIList.begin(), APIList.end());
+}
+
+InternalizePass::InternalizePass(const std::vector<const char *>&exportList)
+  : ModulePass(&ID), AllButMain(false){
+  for(std::vector<const char *>::const_iterator itr = exportList.begin();
+        itr != exportList.end(); itr++) {
+    ExternalNames.insert(*itr);
+  }
+}
+
+void InternalizePass::LoadFile(const char *Filename) {
+  // Load the APIFile...
+  std::ifstream In(Filename);
+  if (!In.good()) {
+    cerr << "WARNING: Internalize couldn't load file '" << Filename
+         << "'! Continuing as if it's empty.\n";
+    return; // Just continue as if the file were empty
+  }
+  while (In) {
+    std::string Symbol;
+    In >> Symbol;
+    if (!Symbol.empty())
+      ExternalNames.insert(Symbol);
+  }
+}
+
+bool InternalizePass::runOnModule(Module &M) {
+  CallGraph *CG = getAnalysisIfAvailable<CallGraph>();
+  CallGraphNode *ExternalNode = CG ? CG->getExternalCallingNode() : 0;
+
+  if (ExternalNames.empty()) {
+    // Return if we're not in 'all but main' mode and have no external api
+    if (!AllButMain)
+      return false;
+    // If no list or file of symbols was specified, check to see if there is a
+    // "main" symbol defined in the module.  If so, use it, otherwise do not
+    // internalize the module, it must be a library or something.
+    //
+    Function *MainFunc = M.getFunction("main");
+    if (MainFunc == 0 || MainFunc->isDeclaration())
+      return false;  // No main found, must be a library...
+
+    // Preserve main, internalize all else.
+    ExternalNames.insert(MainFunc->getName());
+  }
+
+  bool Changed = false;
+
+  // Mark all functions not in the api as internal.
+  // FIXME: maybe use private linkage?
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+    if (!I->isDeclaration() &&         // Function must be defined here
+        !I->hasLocalLinkage() &&  // Can't already have internal linkage
+        !ExternalNames.count(I->getName())) {// Not marked to keep external?
+      I->setLinkage(GlobalValue::InternalLinkage);
+      // Remove a callgraph edge from the external node to this function.
+      if (ExternalNode) ExternalNode->removeOneAbstractEdgeTo((*CG)[I]);
+      Changed = true;
+      ++NumFunctions;
+      DOUT << "Internalizing func " << I->getName() << "\n";
+    }
+
+  // Never internalize the llvm.used symbol.  It is used to implement
+  // attribute((used)).
+  ExternalNames.insert("llvm.used");
+
+  // Never internalize anchors used by the machine module info, else the info
+  // won't find them.  (see MachineModuleInfo.)
+  ExternalNames.insert("llvm.dbg.compile_units");
+  ExternalNames.insert("llvm.dbg.global_variables");
+  ExternalNames.insert("llvm.dbg.subprograms");
+  ExternalNames.insert("llvm.global_ctors");
+  ExternalNames.insert("llvm.global_dtors");
+  ExternalNames.insert("llvm.noinline");
+  ExternalNames.insert("llvm.global.annotations");
+
+  // Mark all global variables with initializers that are not in the api as
+  // internal as well.
+  // FIXME: maybe use private linkage?
+  for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ++I)
+    if (!I->isDeclaration() && !I->hasLocalLinkage() &&
+        !ExternalNames.count(I->getName())) {
+      I->setLinkage(GlobalValue::InternalLinkage);
+      Changed = true;
+      ++NumGlobals;
+      DOUT << "Internalized gvar " << I->getName() << "\n";
+    }
+
+  // Mark all aliases that are not in the api as internal as well.
+  for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
+       I != E; ++I)
+    if (!I->isDeclaration() && !I->hasInternalLinkage() &&
+        !ExternalNames.count(I->getName())) {
+      I->setLinkage(GlobalValue::InternalLinkage);
+      Changed = true;
+      ++NumAliases;
+      DOUT << "Internalized alias " << I->getName() << "\n";
+    }
+
+  return Changed;
+}
+
+ModulePass *llvm::createInternalizePass(bool AllButMain) {
+  return new InternalizePass(AllButMain);
+}
+
+ModulePass *llvm::createInternalizePass(const std::vector <const char *> &el) {
+  return new InternalizePass(el);
+}
diff --git a/lib/Transforms/IPO/LoopExtractor.cpp b/lib/Transforms/IPO/LoopExtractor.cpp
new file mode 100644
index 0000000..0c65443
--- /dev/null
+++ b/lib/Transforms/IPO/LoopExtractor.cpp
@@ -0,0 +1,261 @@
+//===- LoopExtractor.cpp - Extract each loop into a new function ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// A pass wrapper around the ExtractLoop() scalar transformation to extract each
+// top-level loop into its own new function. If the loop is the ONLY loop in a
+// given function, it is not touched. This is a pass most useful for debugging
+// via bugpoint.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-extract"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/FunctionUtils.h"
+#include "llvm/ADT/Statistic.h"
+#include <fstream>
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumExtracted, "Number of loops extracted");
+
+namespace {
+  // FIXME: This is not a function pass, but the PassManager doesn't allow
+  // Module passes to require FunctionPasses, so we can't get loop info if we're
+  // not a function pass.
+  struct VISIBILITY_HIDDEN LoopExtractor : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    unsigned NumLoops;
+
+    explicit LoopExtractor(unsigned numLoops = ~0) 
+      : FunctionPass(&ID), NumLoops(numLoops) {}
+
+    virtual bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequiredID(BreakCriticalEdgesID);
+      AU.addRequiredID(LoopSimplifyID);
+      AU.addRequired<DominatorTree>();
+      AU.addRequired<LoopInfo>();
+    }
+  };
+}
+
+char LoopExtractor::ID = 0;
+static RegisterPass<LoopExtractor>
+X("loop-extract", "Extract loops into new functions");
+
+namespace {
+  /// SingleLoopExtractor - For bugpoint.
+  struct SingleLoopExtractor : public LoopExtractor {
+    static char ID; // Pass identification, replacement for typeid
+    SingleLoopExtractor() : LoopExtractor(1) {}
+  };
+} // End anonymous namespace
+
+char SingleLoopExtractor::ID = 0;
+static RegisterPass<SingleLoopExtractor>
+Y("loop-extract-single", "Extract at most one loop into a new function");
+
+// createLoopExtractorPass - This pass extracts all natural loops from the
+// program into a function if it can.
+//
+FunctionPass *llvm::createLoopExtractorPass() { return new LoopExtractor(); }
+
+bool LoopExtractor::runOnFunction(Function &F) {
+  LoopInfo &LI = getAnalysis<LoopInfo>();
+
+  // If this function has no loops, there is nothing to do.
+  if (LI.empty())
+    return false;
+
+  DominatorTree &DT = getAnalysis<DominatorTree>();
+
+  // If there is more than one top-level loop in this function, extract all of
+  // the loops.
+  bool Changed = false;
+  if (LI.end()-LI.begin() > 1) {
+    for (LoopInfo::iterator i = LI.begin(), e = LI.end(); i != e; ++i) {
+      if (NumLoops == 0) return Changed;
+      --NumLoops;
+      Changed |= ExtractLoop(DT, *i) != 0;
+      ++NumExtracted;
+    }
+  } else {
+    // Otherwise there is exactly one top-level loop.  If this function is more
+    // than a minimal wrapper around the loop, extract the loop.
+    Loop *TLL = *LI.begin();
+    bool ShouldExtractLoop = false;
+
+    // Extract the loop if the entry block doesn't branch to the loop header.
+    TerminatorInst *EntryTI = F.getEntryBlock().getTerminator();
+    if (!isa<BranchInst>(EntryTI) ||
+        !cast<BranchInst>(EntryTI)->isUnconditional() ||
+        EntryTI->getSuccessor(0) != TLL->getHeader())
+      ShouldExtractLoop = true;
+    else {
+      // Check to see if any exits from the loop are more than just return
+      // blocks.
+      SmallVector<BasicBlock*, 8> ExitBlocks;
+      TLL->getExitBlocks(ExitBlocks);
+      for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
+        if (!isa<ReturnInst>(ExitBlocks[i]->getTerminator())) {
+          ShouldExtractLoop = true;
+          break;
+        }
+    }
+
+    if (ShouldExtractLoop) {
+      if (NumLoops == 0) return Changed;
+      --NumLoops;
+      Changed |= ExtractLoop(DT, TLL) != 0;
+      ++NumExtracted;
+    } else {
+      // Okay, this function is a minimal container around the specified loop.
+      // If we extract the loop, we will continue to just keep extracting it
+      // infinitely... so don't extract it.  However, if the loop contains any
+      // subloops, extract them.
+      for (Loop::iterator i = TLL->begin(), e = TLL->end(); i != e; ++i) {
+        if (NumLoops == 0) return Changed;
+        --NumLoops;
+        Changed |= ExtractLoop(DT, *i) != 0;
+        ++NumExtracted;
+      }
+    }
+  }
+
+  return Changed;
+}
+
+// createSingleLoopExtractorPass - This pass extracts one natural loop from the
+// program into a function if it can.  This is used by bugpoint.
+//
+FunctionPass *llvm::createSingleLoopExtractorPass() {
+  return new SingleLoopExtractor();
+}
+
+
+// BlockFile - A file which contains a list of blocks that should not be
+// extracted.
+static cl::opt<std::string>
+BlockFile("extract-blocks-file", cl::value_desc("filename"),
+          cl::desc("A file containing list of basic blocks to not extract"),
+          cl::Hidden);
+
+namespace {
+  /// BlockExtractorPass - This pass is used by bugpoint to extract all blocks
+  /// from the module into their own functions except for those specified by the
+  /// BlocksToNotExtract list.
+  class BlockExtractorPass : public ModulePass {
+    void LoadFile(const char *Filename);
+
+    std::vector<BasicBlock*> BlocksToNotExtract;
+    std::vector<std::pair<std::string, std::string> > BlocksToNotExtractByName;
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    explicit BlockExtractorPass(const std::vector<BasicBlock*> &B) 
+      : ModulePass(&ID), BlocksToNotExtract(B) {
+      if (!BlockFile.empty())
+        LoadFile(BlockFile.c_str());
+    }
+    BlockExtractorPass() : ModulePass(&ID) {}
+
+    bool runOnModule(Module &M);
+  };
+}
+
+char BlockExtractorPass::ID = 0;
+static RegisterPass<BlockExtractorPass>
+XX("extract-blocks", "Extract Basic Blocks From Module (for bugpoint use)");
+
+// createBlockExtractorPass - This pass extracts all blocks (except those
+// specified in the argument list) from the functions in the module.
+//
+ModulePass *llvm::createBlockExtractorPass(const std::vector<BasicBlock*> &BTNE)
+{
+  return new BlockExtractorPass(BTNE);
+}
+
+void BlockExtractorPass::LoadFile(const char *Filename) {
+  // Load the BlockFile...
+  std::ifstream In(Filename);
+  if (!In.good()) {
+    cerr << "WARNING: BlockExtractor couldn't load file '" << Filename
+         << "'!\n";
+    return;
+  }
+  while (In) {
+    std::string FunctionName, BlockName;
+    In >> FunctionName;
+    In >> BlockName;
+    if (!BlockName.empty())
+      BlocksToNotExtractByName.push_back(
+          std::make_pair(FunctionName, BlockName));
+  }
+}
+
+bool BlockExtractorPass::runOnModule(Module &M) {
+  std::set<BasicBlock*> TranslatedBlocksToNotExtract;
+  for (unsigned i = 0, e = BlocksToNotExtract.size(); i != e; ++i) {
+    BasicBlock *BB = BlocksToNotExtract[i];
+    Function *F = BB->getParent();
+
+    // Map the corresponding function in this module.
+    Function *MF = M.getFunction(F->getName());
+    assert(MF->getFunctionType() == F->getFunctionType() && "Wrong function?");
+
+    // Figure out which index the basic block is in its function.
+    Function::iterator BBI = MF->begin();
+    std::advance(BBI, std::distance(F->begin(), Function::iterator(BB)));
+    TranslatedBlocksToNotExtract.insert(BBI);
+  }
+
+  while (!BlocksToNotExtractByName.empty()) {
+    // There's no way to find BBs by name without looking at every BB inside
+    // every Function. Fortunately, this is always empty except when used by
+    // bugpoint in which case correctness is more important than performance.
+
+    std::string &FuncName  = BlocksToNotExtractByName.back().first;
+    std::string &BlockName = BlocksToNotExtractByName.back().second;
+
+    for (Module::iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) {
+      Function &F = *FI;
+      if (F.getName() != FuncName) continue;
+
+      for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
+        BasicBlock &BB = *BI;
+        if (BB.getName() != BlockName) continue;
+
+        TranslatedBlocksToNotExtract.insert(BI);
+      }
+    }
+
+    BlocksToNotExtractByName.pop_back();
+  }
+
+  // Now that we know which blocks to not extract, figure out which ones we WANT
+  // to extract.
+  std::vector<BasicBlock*> BlocksToExtract;
+  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F)
+    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+      if (!TranslatedBlocksToNotExtract.count(BB))
+        BlocksToExtract.push_back(BB);
+
+  for (unsigned i = 0, e = BlocksToExtract.size(); i != e; ++i)
+    ExtractBasicBlock(BlocksToExtract[i]);
+
+  return !BlocksToExtract.empty();
+}
diff --git a/lib/Transforms/IPO/LowerSetJmp.cpp b/lib/Transforms/IPO/LowerSetJmp.cpp
new file mode 100644
index 0000000..dfc040b
--- /dev/null
+++ b/lib/Transforms/IPO/LowerSetJmp.cpp
@@ -0,0 +1,536 @@
+//===- LowerSetJmp.cpp - Code pertaining to lowering set/long jumps -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file implements the lowering of setjmp and longjmp to use the
+//  LLVM invoke and unwind instructions as necessary.
+//
+//  Lowering of longjmp is fairly trivial. We replace the call with a
+//  call to the LLVM library function "__llvm_sjljeh_throw_longjmp()".
+//  This unwinds the stack for us calling all of the destructors for
+//  objects allocated on the stack.
+//
+//  At a setjmp call, the basic block is split and the setjmp removed.
+//  The calls in a function that have a setjmp are converted to invoke
+//  where the except part checks to see if it's a longjmp exception and,
+//  if so, if it's handled in the function. If it is, then it gets the
+//  value returned by the longjmp and goes to where the basic block was
+//  split. Invoke instructions are handled in a similar fashion with the
+//  original except block being executed if it isn't a longjmp except
+//  that is handled by that function.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// FIXME: This pass doesn't deal with PHI statements just yet. That is,
+// we expect this to occur before SSAification is done. This would seem
+// to make sense, but in general, it might be a good idea to make this
+// pass invokable via the "opt" command at will.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "lowersetjmp"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/InstVisitor.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/VectorExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include <map>
+using namespace llvm;
+
+STATISTIC(LongJmpsTransformed, "Number of longjmps transformed");
+STATISTIC(SetJmpsTransformed , "Number of setjmps transformed");
+STATISTIC(CallsTransformed   , "Number of calls invokified");
+STATISTIC(InvokesTransformed , "Number of invokes modified");
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  // LowerSetJmp pass implementation.
+  class VISIBILITY_HIDDEN LowerSetJmp : public ModulePass,
+                      public InstVisitor<LowerSetJmp> {
+    // LLVM library functions...
+    Constant *InitSJMap;        // __llvm_sjljeh_init_setjmpmap
+    Constant *DestroySJMap;     // __llvm_sjljeh_destroy_setjmpmap
+    Constant *AddSJToMap;       // __llvm_sjljeh_add_setjmp_to_map
+    Constant *ThrowLongJmp;     // __llvm_sjljeh_throw_longjmp
+    Constant *TryCatchLJ;       // __llvm_sjljeh_try_catching_longjmp_exception
+    Constant *IsLJException;    // __llvm_sjljeh_is_longjmp_exception
+    Constant *GetLJValue;       // __llvm_sjljeh_get_longjmp_value
+
+    typedef std::pair<SwitchInst*, CallInst*> SwitchValuePair;
+
+    // Keep track of those basic blocks reachable via a depth-first search of
+    // the CFG from a setjmp call. We only need to transform those "call" and
+    // "invoke" instructions that are reachable from the setjmp call site.
+    std::set<BasicBlock*> DFSBlocks;
+
+    // The setjmp map is going to hold information about which setjmps
+    // were called (each setjmp gets its own number) and with which
+    // buffer it was called.
+    std::map<Function*, AllocaInst*>            SJMap;
+
+    // The rethrow basic block map holds the basic block to branch to if
+    // the exception isn't handled in the current function and needs to
+    // be rethrown.
+    std::map<const Function*, BasicBlock*>      RethrowBBMap;
+
+    // The preliminary basic block map holds a basic block that grabs the
+    // exception and determines if it's handled by the current function.
+    std::map<const Function*, BasicBlock*>      PrelimBBMap;
+
+    // The switch/value map holds a switch inst/call inst pair. The
+    // switch inst controls which handler (if any) gets called and the
+    // value is the value returned to that handler by the call to
+    // __llvm_sjljeh_get_longjmp_value.
+    std::map<const Function*, SwitchValuePair>  SwitchValMap;
+
+    // A map of which setjmps we've seen so far in a function.
+    std::map<const Function*, unsigned>         SetJmpIDMap;
+
+    AllocaInst*     GetSetJmpMap(Function* Func);
+    BasicBlock*     GetRethrowBB(Function* Func);
+    SwitchValuePair GetSJSwitch(Function* Func, BasicBlock* Rethrow);
+
+    void TransformLongJmpCall(CallInst* Inst);
+    void TransformSetJmpCall(CallInst* Inst);
+
+    bool IsTransformableFunction(const std::string& Name);
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    LowerSetJmp() : ModulePass(&ID) {}
+
+    void visitCallInst(CallInst& CI);
+    void visitInvokeInst(InvokeInst& II);
+    void visitReturnInst(ReturnInst& RI);
+    void visitUnwindInst(UnwindInst& UI);
+
+    bool runOnModule(Module& M);
+    bool doInitialization(Module& M);
+  };
+} // end anonymous namespace
+
+char LowerSetJmp::ID = 0;
+static RegisterPass<LowerSetJmp> X("lowersetjmp", "Lower Set Jump");
+
+// run - Run the transformation on the program. We grab the function
+// prototypes for longjmp and setjmp. If they are used in the program,
+// then we can go directly to the places they're at and transform them.
+bool LowerSetJmp::runOnModule(Module& M) {
+  bool Changed = false;
+
+  // These are what the functions are called.
+  Function* SetJmp = M.getFunction("llvm.setjmp");
+  Function* LongJmp = M.getFunction("llvm.longjmp");
+
+  // This program doesn't have longjmp and setjmp calls.
+  if ((!LongJmp || LongJmp->use_empty()) &&
+        (!SetJmp || SetJmp->use_empty())) return false;
+
+  // Initialize some values and functions we'll need to transform the
+  // setjmp/longjmp functions.
+  doInitialization(M);
+
+  if (SetJmp) {
+    for (Value::use_iterator B = SetJmp->use_begin(), E = SetJmp->use_end();
+         B != E; ++B) {
+      BasicBlock* BB = cast<Instruction>(*B)->getParent();
+      for (df_ext_iterator<BasicBlock*> I = df_ext_begin(BB, DFSBlocks),
+             E = df_ext_end(BB, DFSBlocks); I != E; ++I)
+        /* empty */;
+    }
+
+    while (!SetJmp->use_empty()) {
+      assert(isa<CallInst>(SetJmp->use_back()) &&
+             "User of setjmp intrinsic not a call?");
+      TransformSetJmpCall(cast<CallInst>(SetJmp->use_back()));
+      Changed = true;
+    }
+  }
+
+  if (LongJmp)
+    while (!LongJmp->use_empty()) {
+      assert(isa<CallInst>(LongJmp->use_back()) &&
+             "User of longjmp intrinsic not a call?");
+      TransformLongJmpCall(cast<CallInst>(LongJmp->use_back()));
+      Changed = true;
+    }
+
+  // Now go through the affected functions and convert calls and invokes
+  // to new invokes...
+  for (std::map<Function*, AllocaInst*>::iterator
+      B = SJMap.begin(), E = SJMap.end(); B != E; ++B) {
+    Function* F = B->first;
+    for (Function::iterator BB = F->begin(), BE = F->end(); BB != BE; ++BB)
+      for (BasicBlock::iterator IB = BB->begin(), IE = BB->end(); IB != IE; ) {
+        visit(*IB++);
+        if (IB != BB->end() && IB->getParent() != BB)
+          break;  // The next instruction got moved to a different block!
+      }
+  }
+
+  DFSBlocks.clear();
+  SJMap.clear();
+  RethrowBBMap.clear();
+  PrelimBBMap.clear();
+  SwitchValMap.clear();
+  SetJmpIDMap.clear();
+
+  return Changed;
+}
+
+// doInitialization - For the lower long/setjmp pass, this ensures that a
+// module contains a declaration for the intrisic functions we are going
+// to call to convert longjmp and setjmp calls.
+//
+// This function is always successful, unless it isn't.
+bool LowerSetJmp::doInitialization(Module& M)
+{
+  const Type *SBPTy = PointerType::getUnqual(Type::Int8Ty);
+  const Type *SBPPTy = PointerType::getUnqual(SBPTy);
+
+  // N.B. See llvm/runtime/GCCLibraries/libexception/SJLJ-Exception.h for
+  // a description of the following library functions.
+
+  // void __llvm_sjljeh_init_setjmpmap(void**)
+  InitSJMap = M.getOrInsertFunction("__llvm_sjljeh_init_setjmpmap",
+                                    Type::VoidTy, SBPPTy, (Type *)0);
+  // void __llvm_sjljeh_destroy_setjmpmap(void**)
+  DestroySJMap = M.getOrInsertFunction("__llvm_sjljeh_destroy_setjmpmap",
+                                       Type::VoidTy, SBPPTy, (Type *)0);
+
+  // void __llvm_sjljeh_add_setjmp_to_map(void**, void*, unsigned)
+  AddSJToMap = M.getOrInsertFunction("__llvm_sjljeh_add_setjmp_to_map",
+                                     Type::VoidTy, SBPPTy, SBPTy,
+                                     Type::Int32Ty, (Type *)0);
+
+  // void __llvm_sjljeh_throw_longjmp(int*, int)
+  ThrowLongJmp = M.getOrInsertFunction("__llvm_sjljeh_throw_longjmp",
+                                       Type::VoidTy, SBPTy, Type::Int32Ty,
+                                       (Type *)0);
+
+  // unsigned __llvm_sjljeh_try_catching_longjmp_exception(void **)
+  TryCatchLJ =
+    M.getOrInsertFunction("__llvm_sjljeh_try_catching_longjmp_exception",
+                          Type::Int32Ty, SBPPTy, (Type *)0);
+
+  // bool __llvm_sjljeh_is_longjmp_exception()
+  IsLJException = M.getOrInsertFunction("__llvm_sjljeh_is_longjmp_exception",
+                                        Type::Int1Ty, (Type *)0);
+
+  // int __llvm_sjljeh_get_longjmp_value()
+  GetLJValue = M.getOrInsertFunction("__llvm_sjljeh_get_longjmp_value",
+                                     Type::Int32Ty, (Type *)0);
+  return true;
+}
+
+// IsTransformableFunction - Return true if the function name isn't one
+// of the ones we don't want transformed. Currently, don't transform any
+// "llvm.{setjmp,longjmp}" functions and none of the setjmp/longjmp error
+// handling functions (beginning with __llvm_sjljeh_...they don't throw
+// exceptions).
+bool LowerSetJmp::IsTransformableFunction(const std::string& Name) {
+  std::string SJLJEh("__llvm_sjljeh");
+
+  if (Name.size() > SJLJEh.size())
+    return std::string(Name.begin(), Name.begin() + SJLJEh.size()) != SJLJEh;
+
+  return true;
+}
+
+// TransformLongJmpCall - Transform a longjmp call into a call to the
+// internal __llvm_sjljeh_throw_longjmp function. It then takes care of
+// throwing the exception for us.
+void LowerSetJmp::TransformLongJmpCall(CallInst* Inst)
+{
+  const Type* SBPTy = PointerType::getUnqual(Type::Int8Ty);
+
+  // Create the call to "__llvm_sjljeh_throw_longjmp". This takes the
+  // same parameters as "longjmp", except that the buffer is cast to a
+  // char*. It returns "void", so it doesn't need to replace any of
+  // Inst's uses and doesn't get a name.
+  CastInst* CI = 
+    new BitCastInst(Inst->getOperand(1), SBPTy, "LJBuf", Inst);
+  SmallVector<Value *, 2> Args;
+  Args.push_back(CI);
+  Args.push_back(Inst->getOperand(2));
+  CallInst::Create(ThrowLongJmp, Args.begin(), Args.end(), "", Inst);
+
+  SwitchValuePair& SVP = SwitchValMap[Inst->getParent()->getParent()];
+
+  // If the function has a setjmp call in it (they are transformed first)
+  // we should branch to the basic block that determines if this longjmp
+  // is applicable here. Otherwise, issue an unwind.
+  if (SVP.first)
+    BranchInst::Create(SVP.first->getParent(), Inst);
+  else
+    new UnwindInst(Inst);
+
+  // Remove all insts after the branch/unwind inst.  Go from back to front to
+  // avoid replaceAllUsesWith if possible.
+  BasicBlock *BB = Inst->getParent();
+  Instruction *Removed;
+  do {
+    Removed = &BB->back();
+    // If the removed instructions have any users, replace them now.
+    if (!Removed->use_empty())
+      Removed->replaceAllUsesWith(UndefValue::get(Removed->getType()));
+    Removed->eraseFromParent();
+  } while (Removed != Inst);
+
+  ++LongJmpsTransformed;
+}
+
+// GetSetJmpMap - Retrieve (create and initialize, if necessary) the
+// setjmp map. This map is going to hold information about which setjmps
+// were called (each setjmp gets its own number) and with which buffer it
+// was called. There can be only one!
+AllocaInst* LowerSetJmp::GetSetJmpMap(Function* Func)
+{
+  if (SJMap[Func]) return SJMap[Func];
+
+  // Insert the setjmp map initialization before the first instruction in
+  // the function.
+  Instruction* Inst = Func->getEntryBlock().begin();
+  assert(Inst && "Couldn't find even ONE instruction in entry block!");
+
+  // Fill in the alloca and call to initialize the SJ map.
+  const Type *SBPTy = PointerType::getUnqual(Type::Int8Ty);
+  AllocaInst* Map = new AllocaInst(SBPTy, 0, "SJMap", Inst);
+  CallInst::Create(InitSJMap, Map, "", Inst);
+  return SJMap[Func] = Map;
+}
+
+// GetRethrowBB - Only one rethrow basic block is needed per function.
+// If this is a longjmp exception but not handled in this block, this BB
+// performs the rethrow.
+BasicBlock* LowerSetJmp::GetRethrowBB(Function* Func)
+{
+  if (RethrowBBMap[Func]) return RethrowBBMap[Func];
+
+  // The basic block we're going to jump to if we need to rethrow the
+  // exception.
+  BasicBlock* Rethrow = BasicBlock::Create("RethrowExcept", Func);
+
+  // Fill in the "Rethrow" BB with a call to rethrow the exception. This
+  // is the last instruction in the BB since at this point the runtime
+  // should exit this function and go to the next function.
+  new UnwindInst(Rethrow);
+  return RethrowBBMap[Func] = Rethrow;
+}
+
+// GetSJSwitch - Return the switch statement that controls which handler
+// (if any) gets called and the value returned to that handler.
+LowerSetJmp::SwitchValuePair LowerSetJmp::GetSJSwitch(Function* Func,
+                                                      BasicBlock* Rethrow)
+{
+  if (SwitchValMap[Func].first) return SwitchValMap[Func];
+
+  BasicBlock* LongJmpPre = BasicBlock::Create("LongJmpBlkPre", Func);
+
+  // Keep track of the preliminary basic block for some of the other
+  // transformations.
+  PrelimBBMap[Func] = LongJmpPre;
+
+  // Grab the exception.
+  CallInst* Cond = CallInst::Create(IsLJException, "IsLJExcept", LongJmpPre);
+
+  // The "decision basic block" gets the number associated with the
+  // setjmp call returning to switch on and the value returned by
+  // longjmp.
+  BasicBlock* DecisionBB = BasicBlock::Create("LJDecisionBB", Func);
+
+  BranchInst::Create(DecisionBB, Rethrow, Cond, LongJmpPre);
+
+  // Fill in the "decision" basic block.
+  CallInst* LJVal = CallInst::Create(GetLJValue, "LJVal", DecisionBB);
+  CallInst* SJNum = CallInst::Create(TryCatchLJ, GetSetJmpMap(Func), "SJNum",
+                                     DecisionBB);
+
+  SwitchInst* SI = SwitchInst::Create(SJNum, Rethrow, 0, DecisionBB);
+  return SwitchValMap[Func] = SwitchValuePair(SI, LJVal);
+}
+
+// TransformSetJmpCall - The setjmp call is a bit trickier to transform.
+// We're going to convert all setjmp calls to nops. Then all "call" and
+// "invoke" instructions in the function are converted to "invoke" where
+// the "except" branch is used when returning from a longjmp call.
+void LowerSetJmp::TransformSetJmpCall(CallInst* Inst)
+{
+  BasicBlock* ABlock = Inst->getParent();
+  Function* Func = ABlock->getParent();
+
+  // Add this setjmp to the setjmp map.
+  const Type* SBPTy = PointerType::getUnqual(Type::Int8Ty);
+  CastInst* BufPtr = 
+    new BitCastInst(Inst->getOperand(1), SBPTy, "SBJmpBuf", Inst);
+  std::vector<Value*> Args = 
+    make_vector<Value*>(GetSetJmpMap(Func), BufPtr,
+                        ConstantInt::get(Type::Int32Ty,
+                                         SetJmpIDMap[Func]++), 0);
+  CallInst::Create(AddSJToMap, Args.begin(), Args.end(), "", Inst);
+
+  // We are guaranteed that there are no values live across basic blocks
+  // (because we are "not in SSA form" yet), but there can still be values live
+  // in basic blocks.  Because of this, splitting the setjmp block can cause
+  // values above the setjmp to not dominate uses which are after the setjmp
+  // call.  For all of these occasions, we must spill the value to the stack.
+  //
+  std::set<Instruction*> InstrsAfterCall;
+
+  // The call is probably very close to the end of the basic block, for the
+  // common usage pattern of: 'if (setjmp(...))', so keep track of the
+  // instructions after the call.
+  for (BasicBlock::iterator I = ++BasicBlock::iterator(Inst), E = ABlock->end();
+       I != E; ++I)
+    InstrsAfterCall.insert(I);
+
+  for (BasicBlock::iterator II = ABlock->begin();
+       II != BasicBlock::iterator(Inst); ++II)
+    // Loop over all of the uses of instruction.  If any of them are after the
+    // call, "spill" the value to the stack.
+    for (Value::use_iterator UI = II->use_begin(), E = II->use_end();
+         UI != E; ++UI)
+      if (cast<Instruction>(*UI)->getParent() != ABlock ||
+          InstrsAfterCall.count(cast<Instruction>(*UI))) {
+        DemoteRegToStack(*II);
+        break;
+      }
+  InstrsAfterCall.clear();
+
+  // Change the setjmp call into a branch statement. We'll remove the
+  // setjmp call in a little bit. No worries.
+  BasicBlock* SetJmpContBlock = ABlock->splitBasicBlock(Inst);
+  assert(SetJmpContBlock && "Couldn't split setjmp BB!!");
+
+  SetJmpContBlock->setName(ABlock->getName()+"SetJmpCont");
+
+  // Add the SetJmpContBlock to the set of blocks reachable from a setjmp.
+  DFSBlocks.insert(SetJmpContBlock);
+
+  // This PHI node will be in the new block created from the
+  // splitBasicBlock call.
+  PHINode* PHI = PHINode::Create(Type::Int32Ty, "SetJmpReturn", Inst);
+
+  // Coming from a call to setjmp, the return is 0.
+  PHI->addIncoming(ConstantInt::getNullValue(Type::Int32Ty), ABlock);
+
+  // Add the case for this setjmp's number...
+  SwitchValuePair SVP = GetSJSwitch(Func, GetRethrowBB(Func));
+  SVP.first->addCase(ConstantInt::get(Type::Int32Ty, SetJmpIDMap[Func] - 1),
+                     SetJmpContBlock);
+
+  // Value coming from the handling of the exception.
+  PHI->addIncoming(SVP.second, SVP.second->getParent());
+
+  // Replace all uses of this instruction with the PHI node created by
+  // the eradication of setjmp.
+  Inst->replaceAllUsesWith(PHI);
+  Inst->eraseFromParent();
+
+  ++SetJmpsTransformed;
+}
+
+// visitCallInst - This converts all LLVM call instructions into invoke
+// instructions. The except part of the invoke goes to the "LongJmpBlkPre"
+// that grabs the exception and proceeds to determine if it's a longjmp
+// exception or not.
+void LowerSetJmp::visitCallInst(CallInst& CI)
+{
+  if (CI.getCalledFunction())
+    if (!IsTransformableFunction(CI.getCalledFunction()->getName()) ||
+        CI.getCalledFunction()->isIntrinsic()) return;
+
+  BasicBlock* OldBB = CI.getParent();
+
+  // If not reachable from a setjmp call, don't transform.
+  if (!DFSBlocks.count(OldBB)) return;
+
+  BasicBlock* NewBB = OldBB->splitBasicBlock(CI);
+  assert(NewBB && "Couldn't split BB of \"call\" instruction!!");
+  DFSBlocks.insert(NewBB);
+  NewBB->setName("Call2Invoke");
+
+  Function* Func = OldBB->getParent();
+
+  // Construct the new "invoke" instruction.
+  TerminatorInst* Term = OldBB->getTerminator();
+  std::vector<Value*> Params(CI.op_begin() + 1, CI.op_end());
+  InvokeInst* II =
+    InvokeInst::Create(CI.getCalledValue(), NewBB, PrelimBBMap[Func],
+                       Params.begin(), Params.end(), CI.getName(), Term);
+  II->setCallingConv(CI.getCallingConv());
+  II->setAttributes(CI.getAttributes());
+
+  // Replace the old call inst with the invoke inst and remove the call.
+  CI.replaceAllUsesWith(II);
+  CI.eraseFromParent();
+
+  // The old terminator is useless now that we have the invoke inst.
+  Term->eraseFromParent();
+  ++CallsTransformed;
+}
+
+// visitInvokeInst - Converting the "invoke" instruction is fairly
+// straight-forward. The old exception part is replaced by a query asking
+// if this is a longjmp exception. If it is, then it goes to the longjmp
+// exception blocks. Otherwise, control is passed the old exception.
+void LowerSetJmp::visitInvokeInst(InvokeInst& II)
+{
+  if (II.getCalledFunction())
+    if (!IsTransformableFunction(II.getCalledFunction()->getName()) ||
+        II.getCalledFunction()->isIntrinsic()) return;
+
+  BasicBlock* BB = II.getParent();
+
+  // If not reachable from a setjmp call, don't transform.
+  if (!DFSBlocks.count(BB)) return;
+
+  BasicBlock* ExceptBB = II.getUnwindDest();
+
+  Function* Func = BB->getParent();
+  BasicBlock* NewExceptBB = BasicBlock::Create("InvokeExcept", Func);
+
+  // If this is a longjmp exception, then branch to the preliminary BB of
+  // the longjmp exception handling. Otherwise, go to the old exception.
+  CallInst* IsLJExcept = CallInst::Create(IsLJException, "IsLJExcept",
+                                          NewExceptBB);
+
+  BranchInst::Create(PrelimBBMap[Func], ExceptBB, IsLJExcept, NewExceptBB);
+
+  II.setUnwindDest(NewExceptBB);
+  ++InvokesTransformed;
+}
+
+// visitReturnInst - We want to destroy the setjmp map upon exit from the
+// function.
+void LowerSetJmp::visitReturnInst(ReturnInst &RI) {
+  Function* Func = RI.getParent()->getParent();
+  CallInst::Create(DestroySJMap, GetSetJmpMap(Func), "", &RI);
+}
+
+// visitUnwindInst - We want to destroy the setjmp map upon exit from the
+// function.
+void LowerSetJmp::visitUnwindInst(UnwindInst &UI) {
+  Function* Func = UI.getParent()->getParent();
+  CallInst::Create(DestroySJMap, GetSetJmpMap(Func), "", &UI);
+}
+
+ModulePass *llvm::createLowerSetJmpPass() {
+  return new LowerSetJmp();
+}
+
diff --git a/lib/Transforms/IPO/Makefile b/lib/Transforms/IPO/Makefile
new file mode 100644
index 0000000..5c42374
--- /dev/null
+++ b/lib/Transforms/IPO/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Transforms/IPO/Makefile -------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMipo
+BUILD_ARCHIVE = 1
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp
new file mode 100644
index 0000000..17bc2d4
--- /dev/null
+++ b/lib/Transforms/IPO/MergeFunctions.cpp
@@ -0,0 +1,377 @@
+//===- MergeFunctions.cpp - Merge identical functions ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass looks for equivalent functions that are mergable and folds them.
+//
+// A Function will not be analyzed if:
+// * it is overridable at runtime (except for weak linkage), or
+// * it is used by anything other than the callee parameter of a call/invoke
+//
+// A hash is computed from the function, based on its type and number of
+// basic blocks.
+//
+// Once all hashes are computed, we perform an expensive equality comparison
+// on each function pair. This takes n^2/2 comparisons per bucket, so it's
+// important that the hash function be high quality. The equality comparison
+// iterates through each instruction in each basic block.
+//
+// When a match is found, the functions are folded. We can only fold two
+// functions when we know that the definition of one of them is not
+// overridable.
+// * fold a function marked internal by replacing all of its users.
+// * fold extern or weak functions by replacing them with a global alias
+//
+//===----------------------------------------------------------------------===//
+//
+// Future work:
+//
+// * fold vector<T*>::push_back and vector<S*>::push_back.
+//
+// These two functions have different types, but in a way that doesn't matter
+// to us. As long as we never see an S or T itself, using S* and S** is the
+// same as using a T* and T**.
+//
+// * virtual functions.
+//
+// Many functions have their address taken by the virtual function table for
+// the object they belong to. However, as long as it's only used for a lookup
+// and call, this is irrelevant, and we'd like to fold such implementations.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mergefunc"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Constants.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include <map>
+#include <vector>
+using namespace llvm;
+
+STATISTIC(NumFunctionsMerged, "Number of functions merged");
+STATISTIC(NumMergeFails, "Number of identical function pairings not merged");
+
+namespace {
+  struct VISIBILITY_HIDDEN MergeFunctions : public ModulePass {
+    static char ID; // Pass identification, replacement for typeid
+    MergeFunctions() : ModulePass((intptr_t)&ID) {}
+
+    bool runOnModule(Module &M);
+  };
+}
+
+char MergeFunctions::ID = 0;
+static RegisterPass<MergeFunctions>
+X("mergefunc", "Merge Functions");
+
+ModulePass *llvm::createMergeFunctionsPass() {
+  return new MergeFunctions();
+}
+
+static unsigned long hash(const Function *F) {
+  return F->size() ^ reinterpret_cast<unsigned long>(F->getType());
+  //return F->size() ^ F->arg_size() ^ F->getReturnType();
+}
+
+static bool compare(const Value *V, const Value *U) {
+  assert(!isa<BasicBlock>(V) && !isa<BasicBlock>(U) &&
+         "Must not compare basic blocks.");
+
+  assert(V->getType() == U->getType() &&
+        "Two of the same operation have operands of different type.");
+
+  // TODO: If the constant is an expression of F, we should accept that it's
+  // equal to the same expression in terms of G.
+  if (isa<Constant>(V))
+    return V == U;
+
+  // The caller has ensured that ValueMap[V] != U. Since Arguments are
+  // pre-loaded into the ValueMap, and Instructions are added as we go, we know
+  // that this can only be a mis-match.
+  if (isa<Instruction>(V) || isa<Argument>(V))
+    return false;
+
+  if (isa<InlineAsm>(V) && isa<InlineAsm>(U)) {
+    const InlineAsm *IAF = cast<InlineAsm>(V);
+    const InlineAsm *IAG = cast<InlineAsm>(U);
+    return IAF->getAsmString() == IAG->getAsmString() &&
+           IAF->getConstraintString() == IAG->getConstraintString();
+  }
+
+  return false;
+}
+
+static bool equals(const BasicBlock *BB1, const BasicBlock *BB2,
+                   DenseMap<const Value *, const Value *> &ValueMap,
+                   DenseMap<const Value *, const Value *> &SpeculationMap) {
+  // Specutively add it anyways. If it's false, we'll notice a difference later, and
+  // this won't matter.
+  ValueMap[BB1] = BB2;
+
+  BasicBlock::const_iterator FI = BB1->begin(), FE = BB1->end();
+  BasicBlock::const_iterator GI = BB2->begin(), GE = BB2->end();
+
+  do {
+    if (!FI->isSameOperationAs(const_cast<Instruction *>(&*GI)))
+      return false;
+
+    if (FI->getNumOperands() != GI->getNumOperands())
+      return false;
+
+    if (ValueMap[FI] == GI) {
+      ++FI, ++GI;
+      continue;
+    }
+
+    if (ValueMap[FI] != NULL)
+      return false;
+
+    for (unsigned i = 0, e = FI->getNumOperands(); i != e; ++i) {
+      Value *OpF = FI->getOperand(i);
+      Value *OpG = GI->getOperand(i);
+
+      if (ValueMap[OpF] == OpG)
+        continue;
+
+      if (ValueMap[OpF] != NULL)
+        return false;
+
+      assert(OpF->getType() == OpG->getType() &&
+             "Two of the same operation has operands of different type.");
+
+      if (OpF->getValueID() != OpG->getValueID())
+        return false;
+
+      if (isa<PHINode>(FI)) {
+        if (SpeculationMap[OpF] == NULL)
+          SpeculationMap[OpF] = OpG;
+        else if (SpeculationMap[OpF] != OpG)
+          return false;
+        continue;
+      } else if (isa<BasicBlock>(OpF)) {
+        assert(isa<TerminatorInst>(FI) &&
+               "BasicBlock referenced by non-Terminator non-PHI");
+        // This call changes the ValueMap, hence we can't use
+        // Value *& = ValueMap[...]
+        if (!equals(cast<BasicBlock>(OpF), cast<BasicBlock>(OpG), ValueMap,
+                    SpeculationMap))
+          return false;
+      } else {
+        if (!compare(OpF, OpG))
+          return false;
+      }
+
+      ValueMap[OpF] = OpG;
+    }
+
+    ValueMap[FI] = GI;
+    ++FI, ++GI;
+  } while (FI != FE && GI != GE);
+
+  return FI == FE && GI == GE;
+}
+
+static bool equals(const Function *F, const Function *G) {
+  // We need to recheck everything, but check the things that weren't included
+  // in the hash first.
+
+  if (F->getAttributes() != G->getAttributes())
+    return false;
+
+  if (F->hasGC() != G->hasGC())
+    return false;
+
+  if (F->hasGC() && F->getGC() != G->getGC())
+    return false;
+
+  if (F->hasSection() != G->hasSection())
+    return false;
+
+  if (F->hasSection() && F->getSection() != G->getSection())
+    return false;
+
+  // TODO: if it's internal and only used in direct calls, we could handle this
+  // case too.
+  if (F->getCallingConv() != G->getCallingConv())
+    return false;
+
+  // TODO: We want to permit cases where two functions take T* and S* but
+  // only load or store them into T** and S**.
+  if (F->getType() != G->getType())
+    return false;
+
+  DenseMap<const Value *, const Value *> ValueMap;
+  DenseMap<const Value *, const Value *> SpeculationMap;
+  ValueMap[F] = G;
+
+  assert(F->arg_size() == G->arg_size() &&
+         "Identical functions have a different number of args.");
+
+  for (Function::const_arg_iterator fi = F->arg_begin(), gi = G->arg_begin(),
+         fe = F->arg_end(); fi != fe; ++fi, ++gi)
+    ValueMap[fi] = gi;
+
+  if (!equals(&F->getEntryBlock(), &G->getEntryBlock(), ValueMap,
+              SpeculationMap))
+    return false;
+
+  for (DenseMap<const Value *, const Value *>::iterator
+         I = SpeculationMap.begin(), E = SpeculationMap.end(); I != E; ++I) {
+    if (ValueMap[I->first] != I->second)
+      return false;
+  }
+
+  return true;
+}
+
+static bool fold(std::vector<Function *> &FnVec, unsigned i, unsigned j) {
+  if (FnVec[i]->mayBeOverridden() && !FnVec[j]->mayBeOverridden())
+    std::swap(FnVec[i], FnVec[j]);
+
+  Function *F = FnVec[i];
+  Function *G = FnVec[j];
+
+  if (!F->mayBeOverridden()) {
+    if (G->hasLocalLinkage()) {
+      F->setAlignment(std::max(F->getAlignment(), G->getAlignment()));
+      G->replaceAllUsesWith(F);
+      G->eraseFromParent();
+      ++NumFunctionsMerged;
+      return true;
+    }
+
+    if (G->hasExternalLinkage() || G->hasWeakLinkage()) {
+      GlobalAlias *GA = new GlobalAlias(G->getType(), G->getLinkage(), "",
+                                        F, G->getParent());
+      F->setAlignment(std::max(F->getAlignment(), G->getAlignment()));
+      GA->takeName(G);
+      GA->setVisibility(G->getVisibility());
+      G->replaceAllUsesWith(GA);
+      G->eraseFromParent();
+      ++NumFunctionsMerged;
+      return true;
+    }
+  }
+
+  if (F->hasWeakLinkage() && G->hasWeakLinkage()) {
+    GlobalAlias *GA_F = new GlobalAlias(F->getType(), F->getLinkage(), "",
+                                        0, F->getParent());
+    GA_F->takeName(F);
+    GA_F->setVisibility(F->getVisibility());
+    F->setAlignment(std::max(F->getAlignment(), G->getAlignment()));
+    F->replaceAllUsesWith(GA_F);
+    F->setName("folded." + GA_F->getName());
+    F->setLinkage(GlobalValue::ExternalLinkage);
+    GA_F->setAliasee(F);
+
+    GlobalAlias *GA_G = new GlobalAlias(G->getType(), G->getLinkage(), "",
+                                        F, G->getParent());
+    GA_G->takeName(G);
+    GA_G->setVisibility(G->getVisibility());
+    G->replaceAllUsesWith(GA_G);
+    G->eraseFromParent();
+
+    ++NumFunctionsMerged;
+    return true;
+  }
+
+  DOUT << "Failed on " << F->getName() << " and " << G->getName() << "\n";
+
+  ++NumMergeFails;
+  return false;
+}
+
+static bool hasAddressTaken(User *U) {
+  for (User::use_iterator I = U->use_begin(), E = U->use_end(); I != E; ++I) {
+    User *Use = *I;
+
+    // 'call (bitcast @F to ...)' happens a lot.
+    while (isa<ConstantExpr>(Use) && Use->hasOneUse()) {
+      Use = *Use->use_begin();
+    }
+
+    if (isa<ConstantExpr>(Use)) {
+      if (hasAddressTaken(Use))
+        return true;
+    }
+
+    if (!isa<CallInst>(Use) && !isa<InvokeInst>(Use))
+      return true;
+
+    // Make sure we aren't passing U as a parameter to call instead of the
+    // callee.
+    if (CallSite(cast<Instruction>(Use)).hasArgument(U))
+      return true;
+  }
+
+  return false;
+}
+
+bool MergeFunctions::runOnModule(Module &M) {
+  bool Changed = false;
+
+  std::map<unsigned long, std::vector<Function *> > FnMap;
+
+  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+    if (F->isDeclaration() || F->isIntrinsic())
+      continue;
+
+    if (!F->hasLocalLinkage() && !F->hasExternalLinkage() &&
+        !F->hasWeakLinkage())
+      continue;
+
+    if (hasAddressTaken(F))
+      continue;
+
+    FnMap[hash(F)].push_back(F);
+  }
+
+  // TODO: instead of running in a loop, we could also fold functions in callgraph
+  // order. Constructing the CFG probably isn't cheaper than just running in a loop.
+
+  bool LocalChanged;
+  do {
+    LocalChanged = false;
+    for (std::map<unsigned long, std::vector<Function *> >::iterator
+         I = FnMap.begin(), E = FnMap.end(); I != E; ++I) {
+      DOUT << "size: " << FnMap.size() << "\n";
+      std::vector<Function *> &FnVec = I->second;
+      DOUT << "hash (" << I->first << "): " << FnVec.size() << "\n";
+
+      for (int i = 0, e = FnVec.size(); i != e; ++i) {
+        for (int j = i + 1; j != e; ++j) {
+          bool isEqual = equals(FnVec[i], FnVec[j]);
+
+          DOUT << "  " << FnVec[i]->getName()
+               << (isEqual ? " == " : " != ")
+               << FnVec[j]->getName() << "\n";
+
+          if (isEqual) {
+            if (fold(FnVec, i, j)) {
+              LocalChanged = true;
+              FnVec.erase(FnVec.begin() + j);
+              --j, --e;
+            }
+          }
+        }
+      }
+
+    }
+    Changed |= LocalChanged;
+  } while (LocalChanged);
+
+  return Changed;
+}
diff --git a/lib/Transforms/IPO/PartialSpecialization.cpp b/lib/Transforms/IPO/PartialSpecialization.cpp
new file mode 100644
index 0000000..0e1fdb9
--- /dev/null
+++ b/lib/Transforms/IPO/PartialSpecialization.cpp
@@ -0,0 +1,191 @@
+//===-- PartialSpecialization.cpp - Specialize for common constants--------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass finds function arguments that are often a common constant and 
+// specializes a version of the called function for that constant.
+//
+// This pass simply does the cloning for functions it specializes.  It depends
+// on IPSCCP and DAE to clean up the results.
+//
+// The initial heuristic favors constant arguments that are used in control 
+// flow.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "partialspecialization"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Constant.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/DenseSet.h"
+#include <map>
+using namespace llvm;
+
+STATISTIC(numSpecialized, "Number of specialized functions created");
+
+// Call must be used at least occasionally
+static const int CallsMin = 5;
+
+// Must have 10% of calls having the same constant to specialize on
+static const double ConstValPercent = .1;
+
+namespace {
+  class VISIBILITY_HIDDEN PartSpec : public ModulePass {
+    void scanForInterest(Function&, SmallVector<int, 6>&);
+    int scanDistribution(Function&, int, std::map<Constant*, int>&);
+  public :
+    static char ID; // Pass identification, replacement for typeid
+    PartSpec() : ModulePass(&ID) {}
+    bool runOnModule(Module &M);
+  };
+}
+
+char PartSpec::ID = 0;
+static RegisterPass<PartSpec>
+X("partialspecialization", "Partial Specialization");
+
+// Specialize F by replacing the arguments (keys) in replacements with the 
+// constants (values).  Replace all calls to F with those constants with
+// a call to the specialized function.  Returns the specialized function
+static Function* 
+SpecializeFunction(Function* F, 
+                   DenseMap<const Value*, Value*>& replacements) {
+  // arg numbers of deleted arguments
+  DenseSet<unsigned> deleted;
+  for (DenseMap<const Value*, Value*>::iterator 
+         repb = replacements.begin(), repe = replacements.end();
+       repb != repe; ++repb)
+    deleted.insert(cast<Argument>(repb->first)->getArgNo());
+
+  Function* NF = CloneFunction(F, replacements);
+  NF->setLinkage(GlobalValue::InternalLinkage);
+  F->getParent()->getFunctionList().push_back(NF);
+
+  for (Value::use_iterator ii = F->use_begin(), ee = F->use_end(); 
+       ii != ee; ) {
+    Value::use_iterator i = ii;
+    ++ii;
+    if (isa<CallInst>(i) || isa<InvokeInst>(i)) {
+      CallSite CS(cast<Instruction>(i));
+      if (CS.getCalledFunction() == F) {
+        
+        SmallVector<Value*, 6> args;
+        for (unsigned x = 0; x < CS.arg_size(); ++x)
+          if (!deleted.count(x))
+            args.push_back(CS.getArgument(x));
+        Value* NCall;
+        if (CallInst *CI = dyn_cast<CallInst>(i)) {
+          NCall = CallInst::Create(NF, args.begin(), args.end(), 
+                                   CI->getName(), CI);
+          cast<CallInst>(NCall)->setTailCall(CI->isTailCall());
+          cast<CallInst>(NCall)->setCallingConv(CI->getCallingConv());
+        } else {
+          InvokeInst *II = cast<InvokeInst>(i);
+          NCall = InvokeInst::Create(NF, II->getNormalDest(),
+                                     II->getUnwindDest(),
+                                     args.begin(), args.end(), 
+                                     II->getName(), II);
+          cast<InvokeInst>(NCall)->setCallingConv(II->getCallingConv());
+        }
+        CS.getInstruction()->replaceAllUsesWith(NCall);
+        CS.getInstruction()->eraseFromParent();
+      }
+    }
+  }
+  return NF;
+}
+
+
+bool PartSpec::runOnModule(Module &M) {
+  bool Changed = false;
+  for (Module::iterator I = M.begin(); I != M.end(); ++I) {
+    Function &F = *I;
+    if (F.isDeclaration() || F.mayBeOverridden()) continue;
+    SmallVector<int, 6> interestingArgs;
+    scanForInterest(F, interestingArgs);
+
+    // Find the first interesting Argument that we can specialize on
+    // If there are multiple interesting Arguments, then those will be found
+    // when processing the cloned function.
+    bool breakOuter = false;
+    for (unsigned int x = 0; !breakOuter && x < interestingArgs.size(); ++x) {
+      std::map<Constant*, int> distribution;
+      int total = scanDistribution(F, interestingArgs[x], distribution);
+      if (total > CallsMin) 
+        for (std::map<Constant*, int>::iterator ii = distribution.begin(),
+               ee = distribution.end(); ii != ee; ++ii)
+          if (total > ii->second && ii->first &&
+               ii->second > total * ConstValPercent) {
+            DenseMap<const Value*, Value*> m;
+            Function::arg_iterator arg = F.arg_begin();
+            for (int y = 0; y < interestingArgs[x]; ++y)
+              ++arg;
+            m[&*arg] = ii->first;
+            SpecializeFunction(&F, m);
+            ++numSpecialized;
+            breakOuter = true;
+            Changed = true;
+          }
+    }
+  }
+  return Changed;
+}
+
+/// scanForInterest - This function decides which arguments would be worth
+/// specializing on.
+void PartSpec::scanForInterest(Function& F, SmallVector<int, 6>& args) {
+  for(Function::arg_iterator ii = F.arg_begin(), ee = F.arg_end();
+      ii != ee; ++ii) {
+    for(Value::use_iterator ui = ii->use_begin(), ue = ii->use_end();
+        ui != ue; ++ui) {
+
+      bool interesting = false;
+
+      if (isa<CmpInst>(ui)) interesting = true;
+      else if (isa<CallInst>(ui))
+        interesting = ui->getOperand(0) == ii;
+      else if (isa<InvokeInst>(ui))
+        interesting = ui->getOperand(0) == ii;
+      else if (isa<SwitchInst>(ui)) interesting = true;
+      else if (isa<BranchInst>(ui)) interesting = true;
+
+      if (interesting) {
+        args.push_back(std::distance(F.arg_begin(), ii));
+        break;
+      }
+    }
+  }
+}
+
+/// scanDistribution - Construct a histogram of constants for arg of F at arg.
+int PartSpec::scanDistribution(Function& F, int arg, 
+                               std::map<Constant*, int>& dist) {
+  bool hasIndirect = false;
+  int total = 0;
+  for(Value::use_iterator ii = F.use_begin(), ee = F.use_end();
+      ii != ee; ++ii)
+    if ((isa<CallInst>(ii) || isa<InvokeInst>(ii))
+        && ii->getOperand(0) == &F) {
+      ++dist[dyn_cast<Constant>(ii->getOperand(arg + 1))];
+      ++total;
+    } else
+      hasIndirect = true;
+
+  // Preserve the original address taken function even if all other uses
+  // will be specialized.
+  if (hasIndirect) ++total;
+  return total;
+}
+
+ModulePass* llvm::createPartialSpecializationPass() { return new PartSpec(); }
diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp
new file mode 100644
index 0000000..2b52f46
--- /dev/null
+++ b/lib/Transforms/IPO/PruneEH.cpp
@@ -0,0 +1,255 @@
+//===- PruneEH.cpp - Pass which deletes unused exception handlers ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simple interprocedural pass which walks the
+// call-graph, turning invoke instructions into calls, iff the callee cannot
+// throw an exception, and marking functions 'nounwind' if they cannot throw.
+// It implements this as a bottom-up traversal of the call-graph.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "prune-eh"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/CallGraphSCCPass.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include <set>
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumRemoved, "Number of invokes removed");
+STATISTIC(NumUnreach, "Number of noreturn calls optimized");
+
+namespace {
+  struct VISIBILITY_HIDDEN PruneEH : public CallGraphSCCPass {
+    static char ID; // Pass identification, replacement for typeid
+    PruneEH() : CallGraphSCCPass(&ID) {}
+
+    // runOnSCC - Analyze the SCC, performing the transformation if possible.
+    bool runOnSCC(const std::vector<CallGraphNode *> &SCC);
+
+    bool SimplifyFunction(Function *F);
+    void DeleteBasicBlock(BasicBlock *BB);
+  };
+}
+
+char PruneEH::ID = 0;
+static RegisterPass<PruneEH>
+X("prune-eh", "Remove unused exception handling info");
+
+Pass *llvm::createPruneEHPass() { return new PruneEH(); }
+
+
+bool PruneEH::runOnSCC(const std::vector<CallGraphNode *> &SCC) {
+  SmallPtrSet<CallGraphNode *, 8> SCCNodes;
+  CallGraph &CG = getAnalysis<CallGraph>();
+  bool MadeChange = false;
+
+  // Fill SCCNodes with the elements of the SCC.  Used for quickly
+  // looking up whether a given CallGraphNode is in this SCC.
+  for (unsigned i = 0, e = SCC.size(); i != e; ++i)
+    SCCNodes.insert(SCC[i]);
+
+  // First pass, scan all of the functions in the SCC, simplifying them
+  // according to what we know.
+  for (unsigned i = 0, e = SCC.size(); i != e; ++i)
+    if (Function *F = SCC[i]->getFunction())
+      MadeChange |= SimplifyFunction(F);
+
+  // Next, check to see if any callees might throw or if there are any external
+  // functions in this SCC: if so, we cannot prune any functions in this SCC.
+  // Definitions that are weak and not declared non-throwing might be 
+  // overridden at linktime with something that throws, so assume that.
+  // If this SCC includes the unwind instruction, we KNOW it throws, so
+  // obviously the SCC might throw.
+  //
+  bool SCCMightUnwind = false, SCCMightReturn = false;
+  for (unsigned i = 0, e = SCC.size();
+       (!SCCMightUnwind || !SCCMightReturn) && i != e; ++i) {
+    Function *F = SCC[i]->getFunction();
+    if (F == 0) {
+      SCCMightUnwind = true;
+      SCCMightReturn = true;
+    } else if (F->isDeclaration() || F->mayBeOverridden()) {
+      SCCMightUnwind |= !F->doesNotThrow();
+      SCCMightReturn |= !F->doesNotReturn();
+    } else {
+      bool CheckUnwind = !SCCMightUnwind && !F->doesNotThrow();
+      bool CheckReturn = !SCCMightReturn && !F->doesNotReturn();
+
+      if (!CheckUnwind && !CheckReturn)
+        continue;
+
+      // Check to see if this function performs an unwind or calls an
+      // unwinding function.
+      for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+        if (CheckUnwind && isa<UnwindInst>(BB->getTerminator())) {
+          // Uses unwind!
+          SCCMightUnwind = true;
+        } else if (CheckReturn && isa<ReturnInst>(BB->getTerminator())) {
+          SCCMightReturn = true;
+        }
+
+        // Invoke instructions don't allow unwinding to continue, so we are
+        // only interested in call instructions.
+        if (CheckUnwind && !SCCMightUnwind)
+          for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+            if (CallInst *CI = dyn_cast<CallInst>(I)) {
+              if (CI->doesNotThrow()) {
+                // This call cannot throw.
+              } else if (Function *Callee = CI->getCalledFunction()) {
+                CallGraphNode *CalleeNode = CG[Callee];
+                // If the callee is outside our current SCC then we may
+                // throw because it might.
+                if (!SCCNodes.count(CalleeNode)) {
+                  SCCMightUnwind = true;
+                  break;
+                }
+              } else {
+                // Indirect call, it might throw.
+                SCCMightUnwind = true;
+                break;
+              }
+            }
+        if (SCCMightUnwind && SCCMightReturn) break;
+      }
+    }
+  }
+
+  // If the SCC doesn't unwind or doesn't throw, note this fact.
+  if (!SCCMightUnwind || !SCCMightReturn)
+    for (unsigned i = 0, e = SCC.size(); i != e; ++i) {
+      Attributes NewAttributes = Attribute::None;
+
+      if (!SCCMightUnwind)
+        NewAttributes |= Attribute::NoUnwind;
+      if (!SCCMightReturn)
+        NewAttributes |= Attribute::NoReturn;
+
+      const AttrListPtr &PAL = SCC[i]->getFunction()->getAttributes();
+      const AttrListPtr &NPAL = PAL.addAttr(~0, NewAttributes);
+      if (PAL != NPAL) {
+        MadeChange = true;
+        SCC[i]->getFunction()->setAttributes(NPAL);
+      }
+    }
+
+  for (unsigned i = 0, e = SCC.size(); i != e; ++i) {
+    // Convert any invoke instructions to non-throwing functions in this node
+    // into call instructions with a branch.  This makes the exception blocks
+    // dead.
+    if (Function *F = SCC[i]->getFunction())
+      MadeChange |= SimplifyFunction(F);
+  }
+
+  return MadeChange;
+}
+
+
+// SimplifyFunction - Given information about callees, simplify the specified
+// function if we have invokes to non-unwinding functions or code after calls to
+// no-return functions.
+bool PruneEH::SimplifyFunction(Function *F) {
+  CallGraph &CG = getAnalysis<CallGraph>();
+  CallGraphNode *CGN = CG[F];
+
+  bool MadeChange = false;
+  for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+    if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator()))
+      if (II->doesNotThrow()) {
+        SmallVector<Value*, 8> Args(II->op_begin()+3, II->op_end());
+        // Insert a call instruction before the invoke.
+        CallInst *Call = CallInst::Create(II->getCalledValue(),
+                                          Args.begin(), Args.end(), "", II);
+        Call->takeName(II);
+        Call->setCallingConv(II->getCallingConv());
+        Call->setAttributes(II->getAttributes());
+
+        // Anything that used the value produced by the invoke instruction
+        // now uses the value produced by the call instruction.
+        II->replaceAllUsesWith(Call);
+        BasicBlock *UnwindBlock = II->getUnwindDest();
+        UnwindBlock->removePredecessor(II->getParent());
+
+        // Fix up the call graph.
+        CGN->replaceCallSite(II, Call);
+
+        // Insert a branch to the normal destination right before the
+        // invoke.
+        BranchInst::Create(II->getNormalDest(), II);
+
+        // Finally, delete the invoke instruction!
+        BB->getInstList().pop_back();
+
+        // If the unwind block is now dead, nuke it.
+        if (pred_begin(UnwindBlock) == pred_end(UnwindBlock))
+          DeleteBasicBlock(UnwindBlock);  // Delete the new BB.
+
+        ++NumRemoved;
+        MadeChange = true;
+      }
+
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; )
+      if (CallInst *CI = dyn_cast<CallInst>(I++))
+        if (CI->doesNotReturn() && !isa<UnreachableInst>(I)) {
+          // This call calls a function that cannot return.  Insert an
+          // unreachable instruction after it and simplify the code.  Do this
+          // by splitting the BB, adding the unreachable, then deleting the
+          // new BB.
+          BasicBlock *New = BB->splitBasicBlock(I);
+
+          // Remove the uncond branch and add an unreachable.
+          BB->getInstList().pop_back();
+          new UnreachableInst(BB);
+
+          DeleteBasicBlock(New);  // Delete the new BB.
+          MadeChange = true;
+          ++NumUnreach;
+          break;
+        }
+  }
+
+  return MadeChange;
+}
+
+/// DeleteBasicBlock - remove the specified basic block from the program,
+/// updating the callgraph to reflect any now-obsolete edges due to calls that
+/// exist in the BB.
+void PruneEH::DeleteBasicBlock(BasicBlock *BB) {
+  assert(pred_begin(BB) == pred_end(BB) && "BB is not dead!");
+  CallGraph &CG = getAnalysis<CallGraph>();
+
+  CallGraphNode *CGN = CG[BB->getParent()];
+  for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; ) {
+    --I;
+    if (CallInst *CI = dyn_cast<CallInst>(I)) {
+      if (!isa<DbgInfoIntrinsic>(I))
+        CGN->removeCallEdgeFor(CI);
+    } else if (InvokeInst *II = dyn_cast<InvokeInst>(I))
+      CGN->removeCallEdgeFor(II);
+    if (!I->use_empty())
+      I->replaceAllUsesWith(UndefValue::get(I->getType()));
+  }
+
+  // Get the list of successors of this block.
+  std::vector<BasicBlock*> Succs(succ_begin(BB), succ_end(BB));
+
+  for (unsigned i = 0, e = Succs.size(); i != e; ++i)
+    Succs[i]->removePredecessor(BB);
+
+  BB->eraseFromParent();
+}
diff --git a/lib/Transforms/IPO/RaiseAllocations.cpp b/lib/Transforms/IPO/RaiseAllocations.cpp
new file mode 100644
index 0000000..a81bbdb
--- /dev/null
+++ b/lib/Transforms/IPO/RaiseAllocations.cpp
@@ -0,0 +1,251 @@
+//===- RaiseAllocations.cpp - Convert @malloc & @free calls to insts ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the RaiseAllocations pass which convert malloc and free
+// calls to malloc and free instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "raiseallocs"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/Statistic.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumRaised, "Number of allocations raised");
+
+namespace {
+  // RaiseAllocations - Turn @malloc and @free calls into the appropriate
+  // instruction.
+  //
+  class VISIBILITY_HIDDEN RaiseAllocations : public ModulePass {
+    Function *MallocFunc;   // Functions in the module we are processing
+    Function *FreeFunc;     // Initialized by doPassInitializationVirt
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    RaiseAllocations() 
+      : ModulePass(&ID), MallocFunc(0), FreeFunc(0) {}
+
+    // doPassInitialization - For the raise allocations pass, this finds a
+    // declaration for malloc and free if they exist.
+    //
+    void doInitialization(Module &M);
+
+    // run - This method does the actual work of converting instructions over.
+    //
+    bool runOnModule(Module &M);
+  };
+}  // end anonymous namespace
+
+char RaiseAllocations::ID = 0;
+static RegisterPass<RaiseAllocations>
+X("raiseallocs", "Raise allocations from calls to instructions");
+
+// createRaiseAllocationsPass - The interface to this file...
+ModulePass *llvm::createRaiseAllocationsPass() {
+  return new RaiseAllocations();
+}
+
+
+// If the module has a symbol table, they might be referring to the malloc and
+// free functions.  If this is the case, grab the method pointers that the
+// module is using.
+//
+// Lookup @malloc and @free in the symbol table, for later use.  If they don't
+// exist, or are not external, we do not worry about converting calls to that
+// function into the appropriate instruction.
+//
+void RaiseAllocations::doInitialization(Module &M) {
+
+  // Get Malloc and free prototypes if they exist!
+  MallocFunc = M.getFunction("malloc");
+  if (MallocFunc) {
+    const FunctionType* TyWeHave = MallocFunc->getFunctionType();
+
+    // Get the expected prototype for malloc
+    const FunctionType *Malloc1Type = 
+      FunctionType::get(PointerType::getUnqual(Type::Int8Ty),
+                      std::vector<const Type*>(1, Type::Int64Ty), false);
+
+    // Chck to see if we got the expected malloc
+    if (TyWeHave != Malloc1Type) {
+      // Check to see if the prototype is wrong, giving us sbyte*(uint) * malloc
+      // This handles the common declaration of: 'void *malloc(unsigned);'
+      const FunctionType *Malloc2Type = 
+        FunctionType::get(PointerType::getUnqual(Type::Int8Ty),
+                          std::vector<const Type*>(1, Type::Int32Ty), false);
+      if (TyWeHave != Malloc2Type) {
+        // Check to see if the prototype is missing, giving us 
+        // sbyte*(...) * malloc
+        // This handles the common declaration of: 'void *malloc();'
+        const FunctionType *Malloc3Type = 
+          FunctionType::get(PointerType::getUnqual(Type::Int8Ty),
+                            std::vector<const Type*>(), true);
+        if (TyWeHave != Malloc3Type)
+          // Give up
+          MallocFunc = 0;
+      }
+    }
+  }
+
+  FreeFunc = M.getFunction("free");
+  if (FreeFunc) {
+    const FunctionType* TyWeHave = FreeFunc->getFunctionType();
+    
+    // Get the expected prototype for void free(i8*)
+    const FunctionType *Free1Type = FunctionType::get(Type::VoidTy,
+      std::vector<const Type*>(1, PointerType::getUnqual(Type::Int8Ty)), false);
+
+    if (TyWeHave != Free1Type) {
+      // Check to see if the prototype was forgotten, giving us 
+      // void (...) * free
+      // This handles the common forward declaration of: 'void free();'
+      const FunctionType* Free2Type = FunctionType::get(Type::VoidTy, 
+        std::vector<const Type*>(),true);
+
+      if (TyWeHave != Free2Type) {
+        // One last try, check to see if we can find free as 
+        // int (...)* free.  This handles the case where NOTHING was declared.
+        const FunctionType* Free3Type = FunctionType::get(Type::Int32Ty, 
+          std::vector<const Type*>(),true);
+        
+        if (TyWeHave != Free3Type) {
+          // Give up.
+          FreeFunc = 0;
+        }
+      }
+    }
+  }
+
+  // Don't mess with locally defined versions of these functions...
+  if (MallocFunc && !MallocFunc->isDeclaration()) MallocFunc = 0;
+  if (FreeFunc && !FreeFunc->isDeclaration())     FreeFunc = 0;
+}
+
+// run - Transform calls into instructions...
+//
+bool RaiseAllocations::runOnModule(Module &M) {
+  // Find the malloc/free prototypes...
+  doInitialization(M);
+
+  bool Changed = false;
+
+  // First, process all of the malloc calls...
+  if (MallocFunc) {
+    std::vector<User*> Users(MallocFunc->use_begin(), MallocFunc->use_end());
+    std::vector<Value*> EqPointers;   // Values equal to MallocFunc
+    while (!Users.empty()) {
+      User *U = Users.back();
+      Users.pop_back();
+
+      if (Instruction *I = dyn_cast<Instruction>(U)) {
+        CallSite CS = CallSite::get(I);
+        if (CS.getInstruction() && !CS.arg_empty() &&
+            (CS.getCalledFunction() == MallocFunc ||
+             std::find(EqPointers.begin(), EqPointers.end(),
+                       CS.getCalledValue()) != EqPointers.end())) {
+
+          Value *Source = *CS.arg_begin();
+
+          // If no prototype was provided for malloc, we may need to cast the
+          // source size.
+          if (Source->getType() != Type::Int32Ty)
+            Source = 
+              CastInst::CreateIntegerCast(Source, Type::Int32Ty, false/*ZExt*/,
+                                          "MallocAmtCast", I);
+
+          MallocInst *MI = new MallocInst(Type::Int8Ty, Source, "", I);
+          MI->takeName(I);
+          I->replaceAllUsesWith(MI);
+
+          // If the old instruction was an invoke, add an unconditional branch
+          // before the invoke, which will become the new terminator.
+          if (InvokeInst *II = dyn_cast<InvokeInst>(I))
+            BranchInst::Create(II->getNormalDest(), I);
+
+          // Delete the old call site
+          I->eraseFromParent();
+          Changed = true;
+          ++NumRaised;
+        }
+      } else if (GlobalValue *GV = dyn_cast<GlobalValue>(U)) {
+        Users.insert(Users.end(), GV->use_begin(), GV->use_end());
+        EqPointers.push_back(GV);
+      } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
+        if (CE->isCast()) {
+          Users.insert(Users.end(), CE->use_begin(), CE->use_end());
+          EqPointers.push_back(CE);
+        }
+      }
+    }
+  }
+
+  // Next, process all free calls...
+  if (FreeFunc) {
+    std::vector<User*> Users(FreeFunc->use_begin(), FreeFunc->use_end());
+    std::vector<Value*> EqPointers;   // Values equal to FreeFunc
+
+    while (!Users.empty()) {
+      User *U = Users.back();
+      Users.pop_back();
+
+      if (Instruction *I = dyn_cast<Instruction>(U)) {
+        if (isa<InvokeInst>(I))
+          continue;
+        CallSite CS = CallSite::get(I);
+        if (CS.getInstruction() && !CS.arg_empty() &&
+            (CS.getCalledFunction() == FreeFunc ||
+             std::find(EqPointers.begin(), EqPointers.end(),
+                       CS.getCalledValue()) != EqPointers.end())) {
+
+          // If no prototype was provided for free, we may need to cast the
+          // source pointer.  This should be really uncommon, but it's necessary
+          // just in case we are dealing with weird code like this:
+          //   free((long)ptr);
+          //
+          Value *Source = *CS.arg_begin();
+          if (!isa<PointerType>(Source->getType()))
+            Source = new IntToPtrInst(Source,           
+                                      PointerType::getUnqual(Type::Int8Ty), 
+                                      "FreePtrCast", I);
+          new FreeInst(Source, I);
+
+          // If the old instruction was an invoke, add an unconditional branch
+          // before the invoke, which will become the new terminator.
+          if (InvokeInst *II = dyn_cast<InvokeInst>(I))
+            BranchInst::Create(II->getNormalDest(), I);
+
+          // Delete the old call site
+          if (I->getType() != Type::VoidTy)
+            I->replaceAllUsesWith(UndefValue::get(I->getType()));
+          I->eraseFromParent();
+          Changed = true;
+          ++NumRaised;
+        }
+      } else if (GlobalValue *GV = dyn_cast<GlobalValue>(U)) {
+        Users.insert(Users.end(), GV->use_begin(), GV->use_end());
+        EqPointers.push_back(GV);
+      } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
+        if (CE->isCast()) {
+          Users.insert(Users.end(), CE->use_begin(), CE->use_end());
+          EqPointers.push_back(CE);
+        }
+      }
+    }
+  }
+
+  return Changed;
+}
diff --git a/lib/Transforms/IPO/StripDeadPrototypes.cpp b/lib/Transforms/IPO/StripDeadPrototypes.cpp
new file mode 100644
index 0000000..a94d78e
--- /dev/null
+++ b/lib/Transforms/IPO/StripDeadPrototypes.cpp
@@ -0,0 +1,72 @@
+//===-- StripDeadPrototypes.cpp - Remove unused function declarations ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass loops over all of the functions in the input module, looking for 
+// dead declarations and removes them. Dead declarations are declarations of
+// functions for which no implementation is available (i.e., declarations for
+// unused library functions).
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "strip-dead-prototypes"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Pass.h"
+#include "llvm/Module.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+using namespace llvm;
+
+STATISTIC(NumDeadPrototypes, "Number of dead prototypes removed");
+
+namespace {
+
+/// @brief Pass to remove unused function declarations.
+class VISIBILITY_HIDDEN StripDeadPrototypesPass : public ModulePass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+  StripDeadPrototypesPass() : ModulePass(&ID) { }
+  virtual bool runOnModule(Module &M);
+};
+
+} // end anonymous namespace
+
+char StripDeadPrototypesPass::ID = 0;
+static RegisterPass<StripDeadPrototypesPass>
+X("strip-dead-prototypes", "Strip Unused Function Prototypes");
+
+bool StripDeadPrototypesPass::runOnModule(Module &M) {
+  bool MadeChange = false;
+  
+  // Erase dead function prototypes.
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
+    Function *F = I++;
+    // Function must be a prototype and unused.
+    if (F->isDeclaration() && F->use_empty()) {
+      F->eraseFromParent();
+      ++NumDeadPrototypes;
+      MadeChange = true;
+    }
+  }
+
+  // Erase dead global var prototypes.
+  for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ) {
+    GlobalVariable *GV = I++;
+    // Global must be a prototype and unused.
+    if (GV->isDeclaration() && GV->use_empty())
+      GV->eraseFromParent();
+  }
+  
+  // Return an indication of whether we changed anything or not.
+  return MadeChange;
+}
+
+ModulePass *llvm::createStripDeadPrototypesPass() {
+  return new StripDeadPrototypesPass();
+}
diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp
new file mode 100644
index 0000000..ab8fe5f
--- /dev/null
+++ b/lib/Transforms/IPO/StripSymbols.cpp
@@ -0,0 +1,415 @@
+//===- StripSymbols.cpp - Strip symbols and debug info from a module ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The StripSymbols transformation implements code stripping. Specifically, it
+// can delete:
+// 
+//   * names for virtual registers
+//   * symbols for internal globals and functions
+//   * debug information
+//
+// Note that this transformation makes code much less readable, so it should
+// only be used in situations where the 'strip' utility would be used, such as
+// reducing code size or making it harder to reverse engineer code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/ValueSymbolTable.h"
+#include "llvm/TypeSymbolTable.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/SmallPtrSet.h"
+using namespace llvm;
+
+namespace {
+  class VISIBILITY_HIDDEN StripSymbols : public ModulePass {
+    bool OnlyDebugInfo;
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    explicit StripSymbols(bool ODI = false) 
+      : ModulePass(&ID), OnlyDebugInfo(ODI) {}
+
+    virtual bool runOnModule(Module &M);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+    }
+  };
+
+  class VISIBILITY_HIDDEN StripNonDebugSymbols : public ModulePass {
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    explicit StripNonDebugSymbols()
+      : ModulePass(&ID) {}
+
+    virtual bool runOnModule(Module &M);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+    }
+  };
+
+  class VISIBILITY_HIDDEN StripDebugDeclare : public ModulePass {
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    explicit StripDebugDeclare()
+      : ModulePass(&ID) {}
+
+    virtual bool runOnModule(Module &M);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+    }
+  };
+}
+
+char StripSymbols::ID = 0;
+static RegisterPass<StripSymbols>
+X("strip", "Strip all symbols from a module");
+
+ModulePass *llvm::createStripSymbolsPass(bool OnlyDebugInfo) {
+  return new StripSymbols(OnlyDebugInfo);
+}
+
+char StripNonDebugSymbols::ID = 0;
+static RegisterPass<StripNonDebugSymbols>
+Y("strip-nondebug", "Strip all symbols, except dbg symbols, from a module");
+
+ModulePass *llvm::createStripNonDebugSymbolsPass() {
+  return new StripNonDebugSymbols();
+}
+
+char StripDebugDeclare::ID = 0;
+static RegisterPass<StripDebugDeclare>
+Z("strip-debug-declare", "Strip all llvm.dbg.declare intrinsics");
+
+ModulePass *llvm::createStripDebugDeclarePass() {
+  return new StripDebugDeclare();
+}
+
+/// OnlyUsedBy - Return true if V is only used by Usr.
+static bool OnlyUsedBy(Value *V, Value *Usr) {
+  for(Value::use_iterator I = V->use_begin(), E = V->use_end(); I != E; ++I) {
+    User *U = *I;
+    if (U != Usr)
+      return false;
+  }
+  return true;
+}
+
+static void RemoveDeadConstant(Constant *C) {
+  assert(C->use_empty() && "Constant is not dead!");
+  SmallPtrSet<Constant *, 4> Operands;
+  for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i)
+    if (isa<DerivedType>(C->getOperand(i)->getType()) &&
+        OnlyUsedBy(C->getOperand(i), C)) 
+      Operands.insert(C->getOperand(i));
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
+    if (!GV->hasLocalLinkage()) return;   // Don't delete non static globals.
+    GV->eraseFromParent();
+  }
+  else if (!isa<Function>(C))
+    if (isa<CompositeType>(C->getType()))
+      C->destroyConstant();
+
+  // If the constant referenced anything, see if we can delete it as well.
+  for (SmallPtrSet<Constant *, 4>::iterator OI = Operands.begin(),
+         OE = Operands.end(); OI != OE; ++OI)
+    RemoveDeadConstant(*OI);
+}
+
+// Strip the symbol table of its names.
+//
+static void StripSymtab(ValueSymbolTable &ST, bool PreserveDbgInfo) {
+  for (ValueSymbolTable::iterator VI = ST.begin(), VE = ST.end(); VI != VE; ) {
+    Value *V = VI->getValue();
+    ++VI;
+    if (!isa<GlobalValue>(V) || cast<GlobalValue>(V)->hasLocalLinkage()) {
+      if (!PreserveDbgInfo || strncmp(V->getNameStart(), "llvm.dbg", 8))
+        // Set name to "", removing from symbol table!
+        V->setName("");
+    }
+  }
+}
+
+// Strip the symbol table of its names.
+static void StripTypeSymtab(TypeSymbolTable &ST, bool PreserveDbgInfo) {
+  for (TypeSymbolTable::iterator TI = ST.begin(), E = ST.end(); TI != E; ) {
+    if (PreserveDbgInfo && strncmp(TI->first.c_str(), "llvm.dbg", 8) == 0)
+      ++TI;
+    else
+      ST.remove(TI++);
+  }
+}
+
+/// Find values that are marked as llvm.used.
+void findUsedValues(Module &M,
+                    SmallPtrSet<const GlobalValue*, 8>& llvmUsedValues) {
+  if (GlobalVariable *LLVMUsed = M.getGlobalVariable("llvm.used")) {
+    llvmUsedValues.insert(LLVMUsed);
+    // Collect values that are preserved as per explicit request.
+    // llvm.used is used to list these values.
+    if (ConstantArray *Inits = 
+        dyn_cast<ConstantArray>(LLVMUsed->getInitializer())) {
+      for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i) {
+        if (GlobalValue *GV = dyn_cast<GlobalValue>(Inits->getOperand(i)))
+          llvmUsedValues.insert(GV);
+        else if (ConstantExpr *CE =
+                 dyn_cast<ConstantExpr>(Inits->getOperand(i)))
+          if (CE->getOpcode() == Instruction::BitCast)
+            if (GlobalValue *GV = dyn_cast<GlobalValue>(CE->getOperand(0)))
+              llvmUsedValues.insert(GV);
+      }
+    }
+  }
+}
+
+/// StripSymbolNames - Strip symbol names.
+bool StripSymbolNames(Module &M, bool PreserveDbgInfo) {
+
+  SmallPtrSet<const GlobalValue*, 8> llvmUsedValues;
+  findUsedValues(M, llvmUsedValues);
+
+  for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ++I) {
+    if (I->hasLocalLinkage() && llvmUsedValues.count(I) == 0)
+      if (!PreserveDbgInfo || strncmp(I->getNameStart(), "llvm.dbg", 8))
+        I->setName("");     // Internal symbols can't participate in linkage
+  }
+  
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+    if (I->hasLocalLinkage() && llvmUsedValues.count(I) == 0)
+      if (!PreserveDbgInfo || strncmp(I->getNameStart(), "llvm.dbg", 8))
+        I->setName("");     // Internal symbols can't participate in linkage
+    StripSymtab(I->getValueSymbolTable(), PreserveDbgInfo);
+  }
+  
+  // Remove all names from types.
+  StripTypeSymtab(M.getTypeSymbolTable(), PreserveDbgInfo);
+
+  return true;
+}
+
+// StripDebugInfo - Strip debug info in the module if it exists.  
+// To do this, we remove llvm.dbg.func.start, llvm.dbg.stoppoint, and 
+// llvm.dbg.region.end calls, and any globals they point to if now dead.
+bool StripDebugInfo(Module &M) {
+
+  SmallPtrSet<const GlobalValue*, 8> llvmUsedValues;
+  findUsedValues(M, llvmUsedValues);
+
+  // Delete all dbg variables.
+  for (Module::global_iterator I = M.global_begin(), E = M.global_end(); 
+       I != E; ++I) {
+    GlobalVariable *GV = dyn_cast<GlobalVariable>(I);
+    if (!GV) continue;
+    if (!GV->use_empty() && llvmUsedValues.count(I) == 0) {
+      if (strncmp(GV->getNameStart(), "llvm.dbg", 8) == 0) {
+        GV->replaceAllUsesWith(UndefValue::get(GV->getType()));
+      }
+    }
+  }
+
+  Function *FuncStart = M.getFunction("llvm.dbg.func.start");
+  Function *StopPoint = M.getFunction("llvm.dbg.stoppoint");
+  Function *RegionStart = M.getFunction("llvm.dbg.region.start");
+  Function *RegionEnd = M.getFunction("llvm.dbg.region.end");
+  Function *Declare = M.getFunction("llvm.dbg.declare");
+
+  std::vector<Constant*> DeadConstants;
+
+  // Remove all of the calls to the debugger intrinsics, and remove them from
+  // the module.
+  if (FuncStart) {
+    while (!FuncStart->use_empty()) {
+      CallInst *CI = cast<CallInst>(FuncStart->use_back());
+      Value *Arg = CI->getOperand(1);
+      assert(CI->use_empty() && "llvm.dbg intrinsic should have void result");
+      CI->eraseFromParent();
+      if (Arg->use_empty())
+        if (Constant *C = dyn_cast<Constant>(Arg)) 
+          DeadConstants.push_back(C);
+    }
+    FuncStart->eraseFromParent();
+  }
+  if (StopPoint) {
+    while (!StopPoint->use_empty()) {
+      CallInst *CI = cast<CallInst>(StopPoint->use_back());
+      Value *Arg = CI->getOperand(3);
+      assert(CI->use_empty() && "llvm.dbg intrinsic should have void result");
+      CI->eraseFromParent();
+      if (Arg->use_empty())
+        if (Constant *C = dyn_cast<Constant>(Arg)) 
+          DeadConstants.push_back(C);
+    }
+    StopPoint->eraseFromParent();
+  }
+  if (RegionStart) {
+    while (!RegionStart->use_empty()) {
+      CallInst *CI = cast<CallInst>(RegionStart->use_back());
+      Value *Arg = CI->getOperand(1);
+      assert(CI->use_empty() && "llvm.dbg intrinsic should have void result");
+      CI->eraseFromParent();
+      if (Arg->use_empty())
+        if (Constant *C = dyn_cast<Constant>(Arg)) 
+          DeadConstants.push_back(C);
+    }
+    RegionStart->eraseFromParent();
+  }
+  if (RegionEnd) {
+    while (!RegionEnd->use_empty()) {
+      CallInst *CI = cast<CallInst>(RegionEnd->use_back());
+      Value *Arg = CI->getOperand(1);
+      assert(CI->use_empty() && "llvm.dbg intrinsic should have void result");
+      CI->eraseFromParent();
+      if (Arg->use_empty())
+        if (Constant *C = dyn_cast<Constant>(Arg)) 
+          DeadConstants.push_back(C);
+    }
+    RegionEnd->eraseFromParent();
+  }
+  if (Declare) {
+    while (!Declare->use_empty()) {
+      CallInst *CI = cast<CallInst>(Declare->use_back());
+      Value *Arg1 = CI->getOperand(1);
+      Value *Arg2 = CI->getOperand(2);
+      assert(CI->use_empty() && "llvm.dbg intrinsic should have void result");
+      CI->eraseFromParent();
+      if (Arg1->use_empty()) {
+        if (Constant *C = dyn_cast<Constant>(Arg1)) 
+          DeadConstants.push_back(C);
+        else 
+          RecursivelyDeleteTriviallyDeadInstructions(Arg1);
+      }
+      if (Arg2->use_empty())
+        if (Constant *C = dyn_cast<Constant>(Arg2)) 
+          DeadConstants.push_back(C);
+    }
+    Declare->eraseFromParent();
+  }
+
+  // llvm.dbg.compile_units and llvm.dbg.subprograms are marked as linkonce
+  // but since we are removing all debug information, make them internal now.
+  // FIXME: Use private linkage maybe?
+  if (Constant *C = M.getNamedGlobal("llvm.dbg.compile_units"))
+    if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C))
+      GV->setLinkage(GlobalValue::InternalLinkage);
+
+  if (Constant *C = M.getNamedGlobal("llvm.dbg.subprograms"))
+    if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C))
+      GV->setLinkage(GlobalValue::InternalLinkage);
+ 
+  if (Constant *C = M.getNamedGlobal("llvm.dbg.global_variables"))
+    if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C))
+      GV->setLinkage(GlobalValue::InternalLinkage);
+
+  // Delete all dbg variables.
+  for (Module::global_iterator I = M.global_begin(), E = M.global_end(); 
+       I != E; ++I) {
+    GlobalVariable *GV = dyn_cast<GlobalVariable>(I);
+    if (!GV) continue;
+    if (GV->use_empty() && llvmUsedValues.count(I) == 0
+        && (!GV->hasSection() 
+            || strcmp(GV->getSection().c_str(), "llvm.metadata") == 0))
+      DeadConstants.push_back(GV);
+  }
+
+  if (DeadConstants.empty())
+    return false;
+
+  // Delete any internal globals that were only used by the debugger intrinsics.
+  while (!DeadConstants.empty()) {
+    Constant *C = DeadConstants.back();
+    DeadConstants.pop_back();
+    if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
+      if (GV->hasLocalLinkage())
+        RemoveDeadConstant(GV);
+    }
+    else
+      RemoveDeadConstant(C);
+  }
+
+  // Remove all llvm.dbg types.
+  TypeSymbolTable &ST = M.getTypeSymbolTable();
+  for (TypeSymbolTable::iterator TI = ST.begin(), TE = ST.end(); TI != TE; ) {
+    if (!strncmp(TI->first.c_str(), "llvm.dbg.", 9))
+      ST.remove(TI++);
+    else 
+      ++TI;
+  }
+  
+  return true;
+}
+
+bool StripSymbols::runOnModule(Module &M) {
+  bool Changed = false;
+  Changed |= StripDebugInfo(M);
+  if (!OnlyDebugInfo)
+    Changed |= StripSymbolNames(M, false);
+  return Changed;
+}
+
+bool StripNonDebugSymbols::runOnModule(Module &M) {
+  return StripSymbolNames(M, true);
+}
+
+bool StripDebugDeclare::runOnModule(Module &M) {
+
+  Function *Declare = M.getFunction("llvm.dbg.declare");
+  std::vector<Constant*> DeadConstants;
+
+  if (Declare) {
+    while (!Declare->use_empty()) {
+      CallInst *CI = cast<CallInst>(Declare->use_back());
+      Value *Arg1 = CI->getOperand(1);
+      Value *Arg2 = CI->getOperand(2);
+      assert(CI->use_empty() && "llvm.dbg intrinsic should have void result");
+      CI->eraseFromParent();
+      if (Arg1->use_empty()) {
+        if (Constant *C = dyn_cast<Constant>(Arg1)) 
+          DeadConstants.push_back(C);
+        else 
+          RecursivelyDeleteTriviallyDeadInstructions(Arg1);
+      }
+      if (Arg2->use_empty())
+        if (Constant *C = dyn_cast<Constant>(Arg2)) 
+          DeadConstants.push_back(C);
+    }
+    Declare->eraseFromParent();
+  }
+
+  // Delete all llvm.dbg.global_variables.
+  for (Module::global_iterator I = M.global_begin(), E = M.global_end(); 
+       I != E; ++I) {
+    GlobalVariable *GV = dyn_cast<GlobalVariable>(I);
+    if (!GV) continue;
+    if (GV->use_empty() && GV->hasName() 
+        && strncmp(GV->getNameStart(), "llvm.dbg.global_variable", 24) == 0)
+      DeadConstants.push_back(GV);
+  }
+
+  while (!DeadConstants.empty()) {
+    Constant *C = DeadConstants.back();
+    DeadConstants.pop_back();
+    if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
+      if (GV->hasLocalLinkage())
+        RemoveDeadConstant(GV);
+    }
+    else
+      RemoveDeadConstant(C);
+  }
+
+  return true;
+}
diff --git a/lib/Transforms/IPO/StructRetPromotion.cpp b/lib/Transforms/IPO/StructRetPromotion.cpp
new file mode 100644
index 0000000..9f54388
--- /dev/null
+++ b/lib/Transforms/IPO/StructRetPromotion.cpp
@@ -0,0 +1,351 @@
+//===-- StructRetPromotion.cpp - Promote sret arguments ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass finds functions that return a struct (using a pointer to the struct
+// as the first argument of the function, marked with the 'sret' attribute) and
+// replaces them with a new function that simply returns each of the elements of
+// that struct (using multiple return values).
+//
+// This pass works under a number of conditions:
+//  1. The returned struct must not contain other structs
+//  2. The returned struct must only be used to load values from
+//  3. The placeholder struct passed in is the result of an alloca
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "sretpromotion"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/CallGraphSCCPass.h"
+#include "llvm/Instructions.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+using namespace llvm;
+
+STATISTIC(NumRejectedSRETUses , "Number of sret rejected due to unexpected uses");
+STATISTIC(NumSRET , "Number of sret promoted");
+namespace {
+  /// SRETPromotion - This pass removes sret parameter and updates
+  /// function to use multiple return value.
+  ///
+  struct VISIBILITY_HIDDEN SRETPromotion : public CallGraphSCCPass {
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      CallGraphSCCPass::getAnalysisUsage(AU);
+    }
+
+    virtual bool runOnSCC(const std::vector<CallGraphNode *> &SCC);
+    static char ID; // Pass identification, replacement for typeid
+    SRETPromotion() : CallGraphSCCPass(&ID) {}
+
+  private:
+    bool PromoteReturn(CallGraphNode *CGN);
+    bool isSafeToUpdateAllCallers(Function *F);
+    Function *cloneFunctionBody(Function *F, const StructType *STy);
+    void updateCallSites(Function *F, Function *NF);
+    bool nestedStructType(const StructType *STy);
+  };
+}
+
+char SRETPromotion::ID = 0;
+static RegisterPass<SRETPromotion>
+X("sretpromotion", "Promote sret arguments to multiple ret values");
+
+Pass *llvm::createStructRetPromotionPass() {
+  return new SRETPromotion();
+}
+
+bool SRETPromotion::runOnSCC(const std::vector<CallGraphNode *> &SCC) {
+  bool Changed = false;
+
+  for (unsigned i = 0, e = SCC.size(); i != e; ++i)
+    Changed |= PromoteReturn(SCC[i]);
+
+  return Changed;
+}
+
+/// PromoteReturn - This method promotes function that uses StructRet paramater 
+/// into a function that uses mulitple return value.
+bool SRETPromotion::PromoteReturn(CallGraphNode *CGN) {
+  Function *F = CGN->getFunction();
+
+  if (!F || F->isDeclaration() || !F->hasLocalLinkage())
+    return false;
+
+  // Make sure that function returns struct.
+  if (F->arg_size() == 0 || !F->hasStructRetAttr() || F->doesNotReturn())
+    return false;
+
+  DOUT << "SretPromotion: Looking at sret function " << F->getNameStart() << "\n";
+
+  assert (F->getReturnType() == Type::VoidTy && "Invalid function return type");
+  Function::arg_iterator AI = F->arg_begin();
+  const llvm::PointerType *FArgType = dyn_cast<PointerType>(AI->getType());
+  assert (FArgType && "Invalid sret parameter type");
+  const llvm::StructType *STy = 
+    dyn_cast<StructType>(FArgType->getElementType());
+  assert (STy && "Invalid sret parameter element type");
+
+  // Check if it is ok to perform this promotion.
+  if (isSafeToUpdateAllCallers(F) == false) {
+    DOUT << "SretPromotion: Not all callers can be updated\n";
+    NumRejectedSRETUses++;
+    return false;
+  }
+
+  DOUT << "SretPromotion: sret argument will be promoted\n";
+  NumSRET++;
+  // [1] Replace use of sret parameter 
+  AllocaInst *TheAlloca = new AllocaInst (STy, NULL, "mrv", 
+                                          F->getEntryBlock().begin());
+  Value *NFirstArg = F->arg_begin();
+  NFirstArg->replaceAllUsesWith(TheAlloca);
+
+  // [2] Find and replace ret instructions
+  for (Function::iterator FI = F->begin(), FE = F->end();  FI != FE; ++FI) 
+    for(BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ) {
+      Instruction *I = BI;
+      ++BI;
+      if (isa<ReturnInst>(I)) {
+        Value *NV = new LoadInst(TheAlloca, "mrv.ld", I);
+        ReturnInst *NR = ReturnInst::Create(NV, I);
+        I->replaceAllUsesWith(NR);
+        I->eraseFromParent();
+      }
+    }
+
+  // [3] Create the new function body and insert it into the module.
+  Function *NF = cloneFunctionBody(F, STy);
+
+  // [4] Update all call sites to use new function
+  updateCallSites(F, NF);
+
+  F->eraseFromParent();
+  getAnalysis<CallGraph>().changeFunction(F, NF);
+  return true;
+}
+
+// Check if it is ok to perform this promotion.
+bool SRETPromotion::isSafeToUpdateAllCallers(Function *F) {
+
+  if (F->use_empty())
+    // No users. OK to modify signature.
+    return true;
+
+  for (Value::use_iterator FnUseI = F->use_begin(), FnUseE = F->use_end();
+       FnUseI != FnUseE; ++FnUseI) {
+    // The function is passed in as an argument to (possibly) another function,
+    // we can't change it!
+    CallSite CS = CallSite::get(*FnUseI);
+    Instruction *Call = CS.getInstruction();
+    // The function is used by something else than a call or invoke instruction,
+    // we can't change it!
+    if (!Call || !CS.isCallee(FnUseI))
+      return false;
+    CallSite::arg_iterator AI = CS.arg_begin();
+    Value *FirstArg = *AI;
+
+    if (!isa<AllocaInst>(FirstArg))
+      return false;
+
+    // Check FirstArg's users.
+    for (Value::use_iterator ArgI = FirstArg->use_begin(), 
+           ArgE = FirstArg->use_end(); ArgI != ArgE; ++ArgI) {
+
+      // If FirstArg user is a CallInst that does not correspond to current
+      // call site then this function F is not suitable for sret promotion.
+      if (CallInst *CI = dyn_cast<CallInst>(ArgI)) {
+        if (CI != Call)
+          return false;
+      }
+      // If FirstArg user is a GEP whose all users are not LoadInst then
+      // this function F is not suitable for sret promotion.
+      else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(ArgI)) {
+        // TODO : Use dom info and insert PHINodes to collect get results
+        // from multiple call sites for this GEP.
+        if (GEP->getParent() != Call->getParent())
+          return false;
+        for (Value::use_iterator GEPI = GEP->use_begin(), GEPE = GEP->use_end();
+             GEPI != GEPE; ++GEPI) 
+          if (!isa<LoadInst>(GEPI))
+            return false;
+      } 
+      // Any other FirstArg users make this function unsuitable for sret 
+      // promotion.
+      else
+        return false;
+    }
+  }
+
+  return true;
+}
+
+/// cloneFunctionBody - Create a new function based on F and
+/// insert it into module. Remove first argument. Use STy as
+/// the return type for new function.
+Function *SRETPromotion::cloneFunctionBody(Function *F, 
+                                           const StructType *STy) {
+
+  const FunctionType *FTy = F->getFunctionType();
+  std::vector<const Type*> Params;
+
+  // Attributes - Keep track of the parameter attributes for the arguments.
+  SmallVector<AttributeWithIndex, 8> AttributesVec;
+  const AttrListPtr &PAL = F->getAttributes();
+
+  // Add any return attributes.
+  if (Attributes attrs = PAL.getRetAttributes())
+    AttributesVec.push_back(AttributeWithIndex::get(0, attrs));
+
+  // Skip first argument.
+  Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
+  ++I;
+  // 0th parameter attribute is reserved for return type.
+  // 1th parameter attribute is for first 1st sret argument.
+  unsigned ParamIndex = 2; 
+  while (I != E) {
+    Params.push_back(I->getType());
+    if (Attributes Attrs = PAL.getParamAttributes(ParamIndex))
+      AttributesVec.push_back(AttributeWithIndex::get(ParamIndex - 1, Attrs));
+    ++I;
+    ++ParamIndex;
+  }
+
+  // Add any fn attributes.
+  if (Attributes attrs = PAL.getFnAttributes())
+    AttributesVec.push_back(AttributeWithIndex::get(~0, attrs));
+
+
+  FunctionType *NFTy = FunctionType::get(STy, Params, FTy->isVarArg());
+  Function *NF = Function::Create(NFTy, F->getLinkage());
+  NF->takeName(F);
+  NF->copyAttributesFrom(F);
+  NF->setAttributes(AttrListPtr::get(AttributesVec.begin(), AttributesVec.end()));
+  F->getParent()->getFunctionList().insert(F, NF);
+  NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());
+
+  // Replace arguments
+  I = F->arg_begin();
+  E = F->arg_end();
+  Function::arg_iterator NI = NF->arg_begin();
+  ++I;
+  while (I != E) {
+      I->replaceAllUsesWith(NI);
+      NI->takeName(I);
+      ++I;
+      ++NI;
+  }
+
+  return NF;
+}
+
+/// updateCallSites - Update all sites that call F to use NF.
+void SRETPromotion::updateCallSites(Function *F, Function *NF) {
+  CallGraph &CG = getAnalysis<CallGraph>();
+  SmallVector<Value*, 16> Args;
+
+  // Attributes - Keep track of the parameter attributes for the arguments.
+  SmallVector<AttributeWithIndex, 8> ArgAttrsVec;
+
+  while (!F->use_empty()) {
+    CallSite CS = CallSite::get(*F->use_begin());
+    Instruction *Call = CS.getInstruction();
+
+    const AttrListPtr &PAL = F->getAttributes();
+    // Add any return attributes.
+    if (Attributes attrs = PAL.getRetAttributes())
+      ArgAttrsVec.push_back(AttributeWithIndex::get(0, attrs));
+
+    // Copy arguments, however skip first one.
+    CallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end();
+    Value *FirstCArg = *AI;
+    ++AI;
+    // 0th parameter attribute is reserved for return type.
+    // 1th parameter attribute is for first 1st sret argument.
+    unsigned ParamIndex = 2; 
+    while (AI != AE) {
+      Args.push_back(*AI); 
+      if (Attributes Attrs = PAL.getParamAttributes(ParamIndex))
+        ArgAttrsVec.push_back(AttributeWithIndex::get(ParamIndex - 1, Attrs));
+      ++ParamIndex;
+      ++AI;
+    }
+
+    // Add any function attributes.
+    if (Attributes attrs = PAL.getFnAttributes())
+      ArgAttrsVec.push_back(AttributeWithIndex::get(~0, attrs));
+    
+    AttrListPtr NewPAL = AttrListPtr::get(ArgAttrsVec.begin(), ArgAttrsVec.end());
+    
+    // Build new call instruction.
+    Instruction *New;
+    if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+      New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+                               Args.begin(), Args.end(), "", Call);
+      cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv());
+      cast<InvokeInst>(New)->setAttributes(NewPAL);
+    } else {
+      New = CallInst::Create(NF, Args.begin(), Args.end(), "", Call);
+      cast<CallInst>(New)->setCallingConv(CS.getCallingConv());
+      cast<CallInst>(New)->setAttributes(NewPAL);
+      if (cast<CallInst>(Call)->isTailCall())
+        cast<CallInst>(New)->setTailCall();
+    }
+    Args.clear();
+    ArgAttrsVec.clear();
+    New->takeName(Call);
+
+    // Update the callgraph to know that the callsite has been transformed.
+    CG[Call->getParent()->getParent()]->replaceCallSite(Call, New);
+
+    // Update all users of sret parameter to extract value using extractvalue.
+    for (Value::use_iterator UI = FirstCArg->use_begin(), 
+           UE = FirstCArg->use_end(); UI != UE; ) {
+      User *U2 = *UI++;
+      CallInst *C2 = dyn_cast<CallInst>(U2);
+      if (C2 && (C2 == Call))
+        continue;
+      else if (GetElementPtrInst *UGEP = dyn_cast<GetElementPtrInst>(U2)) {
+        ConstantInt *Idx = dyn_cast<ConstantInt>(UGEP->getOperand(2));
+        assert (Idx && "Unexpected getelementptr index!");
+        Value *GR = ExtractValueInst::Create(New, Idx->getZExtValue(),
+                                             "evi", UGEP);
+        while(!UGEP->use_empty()) {
+          // isSafeToUpdateAllCallers has checked that all GEP uses are
+          // LoadInsts
+          LoadInst *L = cast<LoadInst>(*UGEP->use_begin());
+          L->replaceAllUsesWith(GR);
+          L->eraseFromParent();
+        }
+        UGEP->eraseFromParent();
+      }
+      else assert( 0 && "Unexpected sret parameter use");
+    }
+    Call->eraseFromParent();
+  }
+}
+
+/// nestedStructType - Return true if STy includes any
+/// other aggregate types
+bool SRETPromotion::nestedStructType(const StructType *STy) {
+  unsigned Num = STy->getNumElements();
+  for (unsigned i = 0; i < Num; i++) {
+    const Type *Ty = STy->getElementType(i);
+    if (!Ty->isSingleValueType() && Ty != Type::VoidTy)
+      return true;
+  }
+  return false;
+}
diff --git a/lib/Transforms/Instrumentation/BlockProfiling.cpp b/lib/Transforms/Instrumentation/BlockProfiling.cpp
new file mode 100644
index 0000000..2bd9809
--- /dev/null
+++ b/lib/Transforms/Instrumentation/BlockProfiling.cpp
@@ -0,0 +1,126 @@
+//===- BlockProfiling.cpp - Insert counters for block profiling -----------===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass instruments the specified program with counters for basic block or
+// function profiling.  This is the most basic form of profiling, which can tell
+// which blocks are hot, but cannot reliably detect hot paths through the CFG.
+// Block profiling counts the number of times each basic block executes, and
+// function profiling counts the number of times each function is called.
+//
+// Note that this implementation is very naive.  Control equivalent regions of
+// the CFG should not require duplicate counters, but we do put duplicate
+// counters in.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "RSProfiling.h"
+#include "ProfilingUtils.h"
+using namespace llvm;
+
+namespace {
+  class VISIBILITY_HIDDEN FunctionProfiler : public RSProfilers_std {
+  public:
+    static char ID;
+    bool runOnModule(Module &M);
+  };
+}
+
+char FunctionProfiler::ID = 0;
+
+static RegisterPass<FunctionProfiler>
+X("insert-function-profiling",
+  "Insert instrumentation for function profiling");
+static RegisterAnalysisGroup<RSProfilers> XG(X);
+
+ModulePass *llvm::createFunctionProfilerPass() {
+  return new FunctionProfiler();
+}
+
+bool FunctionProfiler::runOnModule(Module &M) {
+  Function *Main = M.getFunction("main");
+  if (Main == 0) {
+    cerr << "WARNING: cannot insert function profiling into a module"
+         << " with no main function!\n";
+    return false;  // No main, no instrumentation!
+  }
+
+  unsigned NumFunctions = 0;
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+    if (!I->isDeclaration())
+      ++NumFunctions;
+
+  const Type *ATy = ArrayType::get(Type::Int32Ty, NumFunctions);
+  GlobalVariable *Counters =
+    new GlobalVariable(ATy, false, GlobalValue::InternalLinkage,
+                       Constant::getNullValue(ATy), "FuncProfCounters", &M);
+
+  // Instrument all of the functions...
+  unsigned i = 0;
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+    if (!I->isDeclaration())
+      // Insert counter at the start of the function
+      IncrementCounterInBlock(&I->getEntryBlock(), i++, Counters);
+
+  // Add the initialization call to main.
+  InsertProfilingInitCall(Main, "llvm_start_func_profiling", Counters);
+  return true;
+}
+
+
+namespace {
+  class BlockProfiler : public RSProfilers_std {
+    bool runOnModule(Module &M);
+  public:
+    static char ID;
+  };
+}
+
+char BlockProfiler::ID = 0;
+static RegisterPass<BlockProfiler>
+Y("insert-block-profiling", "Insert instrumentation for block profiling");
+static RegisterAnalysisGroup<RSProfilers> YG(Y);
+
+ModulePass *llvm::createBlockProfilerPass() { return new BlockProfiler(); }
+
+bool BlockProfiler::runOnModule(Module &M) {
+  Function *Main = M.getFunction("main");
+  if (Main == 0) {
+    cerr << "WARNING: cannot insert block profiling into a module"
+         << " with no main function!\n";
+    return false;  // No main, no instrumentation!
+  }
+
+  unsigned NumBlocks = 0;
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+    NumBlocks += I->size();
+
+  const Type *ATy = ArrayType::get(Type::Int32Ty, NumBlocks);
+  GlobalVariable *Counters =
+    new GlobalVariable(ATy, false, GlobalValue::InternalLinkage,
+                       Constant::getNullValue(ATy), "BlockProfCounters", &M);
+
+  // Instrument all of the blocks...
+  unsigned i = 0;
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+    for (Function::iterator BB = I->begin(), E = I->end(); BB != E; ++BB)
+      // Insert counter at the start of the block
+      IncrementCounterInBlock(BB, i++, Counters);
+
+  // Add the initialization call to main.
+  InsertProfilingInitCall(Main, "llvm_start_block_profiling", Counters);
+  return true;
+}
+
diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt
new file mode 100644
index 0000000..d7c518d
--- /dev/null
+++ b/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -0,0 +1,6 @@
+add_llvm_library(LLVMInstrumentation
+  BlockProfiling.cpp
+  EdgeProfiling.cpp
+  ProfilingUtils.cpp
+  RSProfiling.cpp
+  )
diff --git a/lib/Transforms/Instrumentation/EdgeProfiling.cpp b/lib/Transforms/Instrumentation/EdgeProfiling.cpp
new file mode 100644
index 0000000..0831f3b
--- /dev/null
+++ b/lib/Transforms/Instrumentation/EdgeProfiling.cpp
@@ -0,0 +1,101 @@
+//===- EdgeProfiling.cpp - Insert counters for edge profiling -------------===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass instruments the specified program with counters for edge profiling.
+// Edge profiling can give a reasonable approximation of the hot paths through a
+// program, and is used for a wide variety of program transformations.
+//
+// Note that this implementation is very naive.  We insert a counter for *every*
+// edge in the program, instead of using control flow information to prune the
+// number of counters inserted.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ProfilingUtils.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include <set>
+using namespace llvm;
+
+namespace {
+  class VISIBILITY_HIDDEN EdgeProfiler : public ModulePass {
+    bool runOnModule(Module &M);
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    EdgeProfiler() : ModulePass(&ID) {}
+  };
+}
+
+char EdgeProfiler::ID = 0;
+static RegisterPass<EdgeProfiler>
+X("insert-edge-profiling", "Insert instrumentation for edge profiling");
+
+ModulePass *llvm::createEdgeProfilerPass() { return new EdgeProfiler(); }
+
+bool EdgeProfiler::runOnModule(Module &M) {
+  Function *Main = M.getFunction("main");
+  if (Main == 0) {
+    cerr << "WARNING: cannot insert edge profiling into a module"
+         << " with no main function!\n";
+    return false;  // No main, no instrumentation!
+  }
+
+  std::set<BasicBlock*> BlocksToInstrument;
+  unsigned NumEdges = 0;
+  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F)
+    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+      // Keep track of which blocks need to be instrumented.  We don't want to
+      // instrument blocks that are added as the result of breaking critical
+      // edges!
+      BlocksToInstrument.insert(BB);
+      NumEdges += BB->getTerminator()->getNumSuccessors();
+    }
+
+  const Type *ATy = ArrayType::get(Type::Int32Ty, NumEdges);
+  GlobalVariable *Counters =
+    new GlobalVariable(ATy, false, GlobalValue::InternalLinkage,
+                       Constant::getNullValue(ATy), "EdgeProfCounters", &M);
+
+  // Instrument all of the edges...
+  unsigned i = 0;
+  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F)
+    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+      if (BlocksToInstrument.count(BB)) {  // Don't instrument inserted blocks
+        // Okay, we have to add a counter of each outgoing edge.  If the
+        // outgoing edge is not critical don't split it, just insert the counter
+        // in the source or destination of the edge.
+        TerminatorInst *TI = BB->getTerminator();
+        for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) {
+          // If the edge is critical, split it.
+          SplitCriticalEdge(TI, s, this);
+
+          // Okay, we are guaranteed that the edge is no longer critical.  If we
+          // only have a single successor, insert the counter in this block,
+          // otherwise insert it in the successor block.
+          if (TI->getNumSuccessors() == 1) {
+            // Insert counter at the start of the block
+            IncrementCounterInBlock(BB, i++, Counters);
+          } else {
+            // Insert counter at the start of the block
+            IncrementCounterInBlock(TI->getSuccessor(s), i++, Counters);
+          }
+        }
+      }
+
+  // Add the initialization call to main.
+  InsertProfilingInitCall(Main, "llvm_start_edge_profiling", Counters);
+  return true;
+}
+
diff --git a/lib/Transforms/Instrumentation/Makefile b/lib/Transforms/Instrumentation/Makefile
new file mode 100644
index 0000000..6cbc7a9
--- /dev/null
+++ b/lib/Transforms/Instrumentation/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Transforms/Instrumentation/Makefile -------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMInstrumentation
+BUILD_ARCHIVE = 1
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Transforms/Instrumentation/ProfilingUtils.cpp b/lib/Transforms/Instrumentation/ProfilingUtils.cpp
new file mode 100644
index 0000000..48071f1
--- /dev/null
+++ b/lib/Transforms/Instrumentation/ProfilingUtils.cpp
@@ -0,0 +1,120 @@
+//===- ProfilingUtils.cpp - Helper functions shared by profilers ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a few helper functions which are used by profile
+// instrumentation code to instrument the code.  This allows the profiler pass
+// to worry about *what* to insert, and these functions take care of *how* to do
+// it.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ProfilingUtils.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+
+void llvm::InsertProfilingInitCall(Function *MainFn, const char *FnName,
+                                   GlobalValue *Array) {
+  const Type *ArgVTy = 
+    PointerType::getUnqual(PointerType::getUnqual(Type::Int8Ty));
+  const PointerType *UIntPtr = PointerType::getUnqual(Type::Int32Ty);
+  Module &M = *MainFn->getParent();
+  Constant *InitFn = M.getOrInsertFunction(FnName, Type::Int32Ty, Type::Int32Ty,
+                                           ArgVTy, UIntPtr, Type::Int32Ty,
+                                           (Type *)0);
+
+  // This could force argc and argv into programs that wouldn't otherwise have
+  // them, but instead we just pass null values in.
+  std::vector<Value*> Args(4);
+  Args[0] = Constant::getNullValue(Type::Int32Ty);
+  Args[1] = Constant::getNullValue(ArgVTy);
+
+  // Skip over any allocas in the entry block.
+  BasicBlock *Entry = MainFn->begin();
+  BasicBlock::iterator InsertPos = Entry->begin();
+  while (isa<AllocaInst>(InsertPos)) ++InsertPos;
+
+  std::vector<Constant*> GEPIndices(2, Constant::getNullValue(Type::Int32Ty));
+  unsigned NumElements = 0;
+  if (Array) {
+    Args[2] = ConstantExpr::getGetElementPtr(Array, &GEPIndices[0],
+                                             GEPIndices.size());
+    NumElements =
+      cast<ArrayType>(Array->getType()->getElementType())->getNumElements();
+  } else {
+    // If this profiling instrumentation doesn't have a constant array, just
+    // pass null.
+    Args[2] = ConstantPointerNull::get(UIntPtr);
+  }
+  Args[3] = ConstantInt::get(Type::Int32Ty, NumElements);
+
+  Instruction *InitCall = CallInst::Create(InitFn, Args.begin(), Args.end(),
+                                           "newargc", InsertPos);
+
+  // If argc or argv are not available in main, just pass null values in.
+  Function::arg_iterator AI;
+  switch (MainFn->arg_size()) {
+  default:
+  case 2:
+    AI = MainFn->arg_begin(); ++AI;
+    if (AI->getType() != ArgVTy) {
+      Instruction::CastOps opcode = CastInst::getCastOpcode(AI, false, ArgVTy, 
+                                                            false);
+      InitCall->setOperand(2, 
+          CastInst::Create(opcode, AI, ArgVTy, "argv.cast", InitCall));
+    } else {
+      InitCall->setOperand(2, AI);
+    }
+    /* FALL THROUGH */
+
+  case 1:
+    AI = MainFn->arg_begin();
+    // If the program looked at argc, have it look at the return value of the
+    // init call instead.
+    if (AI->getType() != Type::Int32Ty) {
+      Instruction::CastOps opcode;
+      if (!AI->use_empty()) {
+        opcode = CastInst::getCastOpcode(InitCall, true, AI->getType(), true);
+        AI->replaceAllUsesWith(
+          CastInst::Create(opcode, InitCall, AI->getType(), "", InsertPos));
+      }
+      opcode = CastInst::getCastOpcode(AI, true, Type::Int32Ty, true);
+      InitCall->setOperand(1, 
+          CastInst::Create(opcode, AI, Type::Int32Ty, "argc.cast", InitCall));
+    } else {
+      AI->replaceAllUsesWith(InitCall);
+      InitCall->setOperand(1, AI);
+    }
+
+  case 0: break;
+  }
+}
+
+void llvm::IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum,
+                                   GlobalValue *CounterArray) {
+  // Insert the increment after any alloca or PHI instructions...
+  BasicBlock::iterator InsertPos = BB->getFirstNonPHI();
+  while (isa<AllocaInst>(InsertPos))
+    ++InsertPos;
+
+  // Create the getelementptr constant expression
+  std::vector<Constant*> Indices(2);
+  Indices[0] = Constant::getNullValue(Type::Int32Ty);
+  Indices[1] = ConstantInt::get(Type::Int32Ty, CounterNum);
+  Constant *ElementPtr = 
+    ConstantExpr::getGetElementPtr(CounterArray, &Indices[0], Indices.size());
+
+  // Load, increment and store the value back.
+  Value *OldVal = new LoadInst(ElementPtr, "OldFuncCounter", InsertPos);
+  Value *NewVal = BinaryOperator::Create(Instruction::Add, OldVal,
+                                         ConstantInt::get(Type::Int32Ty, 1),
+                                         "NewFuncCounter", InsertPos);
+  new StoreInst(NewVal, ElementPtr, InsertPos);
+}
diff --git a/lib/Transforms/Instrumentation/ProfilingUtils.h b/lib/Transforms/Instrumentation/ProfilingUtils.h
new file mode 100644
index 0000000..94efffe
--- /dev/null
+++ b/lib/Transforms/Instrumentation/ProfilingUtils.h
@@ -0,0 +1,31 @@
+//===- ProfilingUtils.h - Helper functions shared by profilers --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a few helper functions which are used by profile
+// instrumentation code to instrument the code.  This allows the profiler pass
+// to worry about *what* to insert, and these functions take care of *how* to do
+// it.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PROFILINGUTILS_H
+#define PROFILINGUTILS_H
+
+namespace llvm {
+  class Function;
+  class GlobalValue;
+  class BasicBlock;
+
+  void InsertProfilingInitCall(Function *MainFn, const char *FnName,
+                               GlobalValue *Arr = 0);
+  void IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum,
+                               GlobalValue *CounterArray);
+}
+
+#endif
diff --git a/lib/Transforms/Instrumentation/RSProfiling.cpp b/lib/Transforms/Instrumentation/RSProfiling.cpp
new file mode 100644
index 0000000..c6cf4df
--- /dev/null
+++ b/lib/Transforms/Instrumentation/RSProfiling.cpp
@@ -0,0 +1,653 @@
+//===- RSProfiling.cpp - Various profiling using random sampling ----------===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// These passes implement a random sampling based profiling.  Different methods
+// of choosing when to sample are supported, as well as different types of
+// profiling.  This is done as two passes.  The first is a sequence of profiling
+// passes which insert profiling into the program, and remember what they 
+// inserted.
+//
+// The second stage duplicates all instructions in a function, ignoring the 
+// profiling code, then connects the two versions togeather at the entry and at
+// backedges.  At each connection point a choice is made as to whether to jump
+// to the profiled code (take a sample) or execute the unprofiled code.
+//
+// It is highly recommended that after this pass one runs mem2reg and adce
+// (instcombine load-vn gdce dse also are good to run afterwards)
+//
+// This design is intended to make the profiling passes independent of the RS
+// framework, but any profiling pass that implements the RSProfiling interface
+// is compatible with the rs framework (and thus can be sampled)
+//
+// TODO: obviously the block and function profiling are almost identical to the
+// existing ones, so they can be unified (esp since these passes are valid
+// without the rs framework).
+// TODO: Fix choice code so that frequency is not hard coded
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Pass.h"
+#include "llvm/Module.h"
+#include "llvm/Instructions.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "RSProfiling.h"
+#include <set>
+#include <map>
+#include <queue>
+using namespace llvm;
+
+namespace {
+  enum RandomMeth {
+    GBV, GBVO, HOSTCC
+  };
+}
+
+static cl::opt<RandomMeth> RandomMethod("profile-randomness",
+    cl::desc("How to randomly choose to profile:"),
+    cl::values(
+               clEnumValN(GBV, "global", "global counter"),
+               clEnumValN(GBVO, "ra_global", 
+                          "register allocated global counter"),
+               clEnumValN(HOSTCC, "rdcc", "cycle counter"),
+               clEnumValEnd));
+  
+namespace {
+  /// NullProfilerRS - The basic profiler that does nothing.  It is the default
+  /// profiler and thus terminates RSProfiler chains.  It is useful for 
+  /// measuring framework overhead
+  class VISIBILITY_HIDDEN NullProfilerRS : public RSProfilers {
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    bool isProfiling(Value* v) {
+      return false;
+    }
+    bool runOnModule(Module &M) {
+      return false;
+    }
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+    }
+  };
+}
+
+static RegisterAnalysisGroup<RSProfilers> A("Profiling passes");
+static RegisterPass<NullProfilerRS> NP("insert-null-profiling-rs",
+                                       "Measure profiling framework overhead");
+static RegisterAnalysisGroup<RSProfilers, true> NPT(NP);
+
+namespace {
+  /// Chooser - Something that chooses when to make a sample of the profiled code
+  class VISIBILITY_HIDDEN Chooser {
+  public:
+    /// ProcessChoicePoint - is called for each basic block inserted to choose 
+    /// between normal and sample code
+    virtual void ProcessChoicePoint(BasicBlock*) = 0;
+    /// PrepFunction - is called once per function before other work is done.
+    /// This gives the opertunity to insert new allocas and such.
+    virtual void PrepFunction(Function*) = 0;
+    virtual ~Chooser() {}
+  };
+
+  //Things that implement sampling policies
+  //A global value that is read-mod-stored to choose when to sample.
+  //A sample is taken when the global counter hits 0
+  class VISIBILITY_HIDDEN GlobalRandomCounter : public Chooser {
+    GlobalVariable* Counter;
+    Value* ResetValue;
+    const Type* T;
+  public:
+    GlobalRandomCounter(Module& M, const Type* t, uint64_t resetval);
+    virtual ~GlobalRandomCounter();
+    virtual void PrepFunction(Function* F);
+    virtual void ProcessChoicePoint(BasicBlock* bb);
+  };
+
+  //Same is GRC, but allow register allocation of the global counter
+  class VISIBILITY_HIDDEN GlobalRandomCounterOpt : public Chooser {
+    GlobalVariable* Counter;
+    Value* ResetValue;
+    AllocaInst* AI;
+    const Type* T;
+  public:
+    GlobalRandomCounterOpt(Module& M, const Type* t, uint64_t resetval);
+    virtual ~GlobalRandomCounterOpt();
+    virtual void PrepFunction(Function* F);
+    virtual void ProcessChoicePoint(BasicBlock* bb);
+  };
+
+  //Use the cycle counter intrinsic as a source of pseudo randomness when
+  //deciding when to sample.
+  class VISIBILITY_HIDDEN CycleCounter : public Chooser {
+    uint64_t rm;
+    Constant *F;
+  public:
+    CycleCounter(Module& m, uint64_t resetmask);
+    virtual ~CycleCounter();
+    virtual void PrepFunction(Function* F);
+    virtual void ProcessChoicePoint(BasicBlock* bb);
+  };
+
+  /// ProfilerRS - Insert the random sampling framework
+  struct VISIBILITY_HIDDEN ProfilerRS : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    ProfilerRS() : FunctionPass(&ID) {}
+
+    std::map<Value*, Value*> TransCache;
+    std::set<BasicBlock*> ChoicePoints;
+    Chooser* c;
+
+    //Translate and duplicate values for the new profile free version of stuff
+    Value* Translate(Value* v);
+    //Duplicate an entire function (with out profiling)
+    void Duplicate(Function& F, RSProfilers& LI);
+    //Called once for each backedge, handle the insertion of choice points and
+    //the interconection of the two versions of the code
+    void ProcessBackEdge(BasicBlock* src, BasicBlock* dst, Function& F);
+    bool runOnFunction(Function& F);
+    bool doInitialization(Module &M);
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+  };
+}
+
+static RegisterPass<ProfilerRS>
+X("insert-rs-profiling-framework",
+  "Insert random sampling instrumentation framework");
+
+char RSProfilers::ID = 0;
+char NullProfilerRS::ID = 0;
+char ProfilerRS::ID = 0;
+
+//Local utilities
+static void ReplacePhiPred(BasicBlock* btarget, 
+                           BasicBlock* bold, BasicBlock* bnew);
+
+static void CollapsePhi(BasicBlock* btarget, BasicBlock* bsrc);
+
+template<class T>
+static void recBackEdge(BasicBlock* bb, T& BackEdges, 
+                        std::map<BasicBlock*, int>& color,
+                        std::map<BasicBlock*, int>& depth,
+                        std::map<BasicBlock*, int>& finish,
+                        int& time);
+
+//find the back edges and where they go to
+template<class T>
+static void getBackEdges(Function& F, T& BackEdges);
+
+
+///////////////////////////////////////
+// Methods of choosing when to profile
+///////////////////////////////////////
+  
+GlobalRandomCounter::GlobalRandomCounter(Module& M, const Type* t, 
+                                         uint64_t resetval) : T(t) {
+  ConstantInt* Init = ConstantInt::get(T, resetval); 
+  ResetValue = Init;
+  Counter = new GlobalVariable(T, false, GlobalValue::InternalLinkage,
+                               Init, "RandomSteeringCounter", &M);
+}
+
+GlobalRandomCounter::~GlobalRandomCounter() {}
+
+void GlobalRandomCounter::PrepFunction(Function* F) {}
+
+void GlobalRandomCounter::ProcessChoicePoint(BasicBlock* bb) {
+  BranchInst* t = cast<BranchInst>(bb->getTerminator());
+  
+  //decrement counter
+  LoadInst* l = new LoadInst(Counter, "counter", t);
+  
+  ICmpInst* s = new ICmpInst(ICmpInst::ICMP_EQ, l, ConstantInt::get(T, 0), 
+                             "countercc", t);
+
+  Value* nv = BinaryOperator::CreateSub(l, ConstantInt::get(T, 1),
+                                        "counternew", t);
+  new StoreInst(nv, Counter, t);
+  t->setCondition(s);
+  
+  //reset counter
+  BasicBlock* oldnext = t->getSuccessor(0);
+  BasicBlock* resetblock = BasicBlock::Create("reset", oldnext->getParent(), 
+                                              oldnext);
+  TerminatorInst* t2 = BranchInst::Create(oldnext, resetblock);
+  t->setSuccessor(0, resetblock);
+  new StoreInst(ResetValue, Counter, t2);
+  ReplacePhiPred(oldnext, bb, resetblock);
+}
+
+GlobalRandomCounterOpt::GlobalRandomCounterOpt(Module& M, const Type* t, 
+                                               uint64_t resetval) 
+  : AI(0), T(t) {
+  ConstantInt* Init = ConstantInt::get(T, resetval);
+  ResetValue  = Init;
+  Counter = new GlobalVariable(T, false, GlobalValue::InternalLinkage,
+                               Init, "RandomSteeringCounter", &M);
+}
+
+GlobalRandomCounterOpt::~GlobalRandomCounterOpt() {}
+
+void GlobalRandomCounterOpt::PrepFunction(Function* F) {
+  //make a local temporary to cache the global
+  BasicBlock& bb = F->getEntryBlock();
+  BasicBlock::iterator InsertPt = bb.begin();
+  AI = new AllocaInst(T, 0, "localcounter", InsertPt);
+  LoadInst* l = new LoadInst(Counter, "counterload", InsertPt);
+  new StoreInst(l, AI, InsertPt);
+  
+  //modify all functions and return values to restore the local variable to/from
+  //the global variable
+  for(Function::iterator fib = F->begin(), fie = F->end();
+      fib != fie; ++fib)
+    for(BasicBlock::iterator bib = fib->begin(), bie = fib->end();
+        bib != bie; ++bib)
+      if (isa<CallInst>(bib)) {
+        LoadInst* l = new LoadInst(AI, "counter", bib);
+        new StoreInst(l, Counter, bib);
+        l = new LoadInst(Counter, "counter", ++bib);
+        new StoreInst(l, AI, bib--);
+      } else if (isa<InvokeInst>(bib)) {
+        LoadInst* l = new LoadInst(AI, "counter", bib);
+        new StoreInst(l, Counter, bib);
+        
+        BasicBlock* bb = cast<InvokeInst>(bib)->getNormalDest();
+        BasicBlock::iterator i = bb->getFirstNonPHI();
+        l = new LoadInst(Counter, "counter", i);
+        
+        bb = cast<InvokeInst>(bib)->getUnwindDest();
+        i = bb->getFirstNonPHI();
+        l = new LoadInst(Counter, "counter", i);
+        new StoreInst(l, AI, i);
+      } else if (isa<UnwindInst>(&*bib) || isa<ReturnInst>(&*bib)) {
+        LoadInst* l = new LoadInst(AI, "counter", bib);
+        new StoreInst(l, Counter, bib);
+      }
+}
+
+void GlobalRandomCounterOpt::ProcessChoicePoint(BasicBlock* bb) {
+  BranchInst* t = cast<BranchInst>(bb->getTerminator());
+  
+  //decrement counter
+  LoadInst* l = new LoadInst(AI, "counter", t);
+  
+  ICmpInst* s = new ICmpInst(ICmpInst::ICMP_EQ, l, ConstantInt::get(T, 0), 
+                             "countercc", t);
+
+  Value* nv = BinaryOperator::CreateSub(l, ConstantInt::get(T, 1),
+                                        "counternew", t);
+  new StoreInst(nv, AI, t);
+  t->setCondition(s);
+  
+  //reset counter
+  BasicBlock* oldnext = t->getSuccessor(0);
+  BasicBlock* resetblock = BasicBlock::Create("reset", oldnext->getParent(), 
+                                              oldnext);
+  TerminatorInst* t2 = BranchInst::Create(oldnext, resetblock);
+  t->setSuccessor(0, resetblock);
+  new StoreInst(ResetValue, AI, t2);
+  ReplacePhiPred(oldnext, bb, resetblock);
+}
+
+
+CycleCounter::CycleCounter(Module& m, uint64_t resetmask) : rm(resetmask) {
+  F = Intrinsic::getDeclaration(&m, Intrinsic::readcyclecounter);
+}
+
+CycleCounter::~CycleCounter() {}
+
+void CycleCounter::PrepFunction(Function* F) {}
+
+void CycleCounter::ProcessChoicePoint(BasicBlock* bb) {
+  BranchInst* t = cast<BranchInst>(bb->getTerminator());
+  
+  CallInst* c = CallInst::Create(F, "rdcc", t);
+  BinaryOperator* b = 
+    BinaryOperator::CreateAnd(c, ConstantInt::get(Type::Int64Ty, rm),
+                              "mrdcc", t);
+  
+  ICmpInst *s = new ICmpInst(ICmpInst::ICMP_EQ, b,
+                             ConstantInt::get(Type::Int64Ty, 0), 
+                             "mrdccc", t);
+
+  t->setCondition(s);
+}
+
+///////////////////////////////////////
+// Profiling:
+///////////////////////////////////////
+bool RSProfilers_std::isProfiling(Value* v) {
+  if (profcode.find(v) != profcode.end())
+    return true;
+  //else
+  RSProfilers& LI = getAnalysis<RSProfilers>();
+  return LI.isProfiling(v);
+}
+
+void RSProfilers_std::IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum,
+                                          GlobalValue *CounterArray) {
+  // Insert the increment after any alloca or PHI instructions...
+  BasicBlock::iterator InsertPos = BB->getFirstNonPHI();
+  while (isa<AllocaInst>(InsertPos))
+    ++InsertPos;
+  
+  // Create the getelementptr constant expression
+  std::vector<Constant*> Indices(2);
+  Indices[0] = Constant::getNullValue(Type::Int32Ty);
+  Indices[1] = ConstantInt::get(Type::Int32Ty, CounterNum);
+  Constant *ElementPtr = ConstantExpr::getGetElementPtr(CounterArray,
+                                                        &Indices[0], 2);
+  
+  // Load, increment and store the value back.
+  Value *OldVal = new LoadInst(ElementPtr, "OldCounter", InsertPos);
+  profcode.insert(OldVal);
+  Value *NewVal = BinaryOperator::CreateAdd(OldVal,
+                                            ConstantInt::get(Type::Int32Ty, 1),
+                                            "NewCounter", InsertPos);
+  profcode.insert(NewVal);
+  profcode.insert(new StoreInst(NewVal, ElementPtr, InsertPos));
+}
+
+void RSProfilers_std::getAnalysisUsage(AnalysisUsage &AU) const {
+  //grab any outstanding profiler, or get the null one
+  AU.addRequired<RSProfilers>();
+}
+
+///////////////////////////////////////
+// RS Framework
+///////////////////////////////////////
+
+Value* ProfilerRS::Translate(Value* v) {
+  if(TransCache[v])
+    return TransCache[v];
+  
+  if (BasicBlock* bb = dyn_cast<BasicBlock>(v)) {
+    if (bb == &bb->getParent()->getEntryBlock())
+      TransCache[bb] = bb; //don't translate entry block
+    else
+      TransCache[bb] = BasicBlock::Create("dup_" + bb->getName(),
+                                          bb->getParent(), NULL);
+    return TransCache[bb];
+  } else if (Instruction* i = dyn_cast<Instruction>(v)) {
+    //we have already translated this
+    //do not translate entry block allocas
+    if(&i->getParent()->getParent()->getEntryBlock() == i->getParent()) {
+      TransCache[i] = i;
+      return i;
+    } else {
+      //translate this
+      Instruction* i2 = i->clone();
+      if (i->hasName())
+        i2->setName("dup_" + i->getName());
+      TransCache[i] = i2;
+      //NumNewInst++;
+      for (unsigned x = 0; x < i2->getNumOperands(); ++x)
+        i2->setOperand(x, Translate(i2->getOperand(x)));
+      return i2;
+    }
+  } else if (isa<Function>(v) || isa<Constant>(v) || isa<Argument>(v)) {
+    TransCache[v] = v;
+    return v;
+  }
+  assert(0 && "Value not handled");
+  return 0;
+}
+
+void ProfilerRS::Duplicate(Function& F, RSProfilers& LI)
+{
+  //perform a breadth first search, building up a duplicate of the code
+  std::queue<BasicBlock*> worklist;
+  std::set<BasicBlock*> seen;
+  
+  //This loop ensures proper BB order, to help performance
+  for (Function::iterator fib = F.begin(), fie = F.end(); fib != fie; ++fib)
+    worklist.push(fib);
+  while (!worklist.empty()) {
+    Translate(worklist.front());
+    worklist.pop();
+  }
+  
+  //remember than reg2mem created a new entry block we don't want to duplicate
+  worklist.push(F.getEntryBlock().getTerminator()->getSuccessor(0));
+  seen.insert(&F.getEntryBlock());
+  
+  while (!worklist.empty()) {
+    BasicBlock* bb = worklist.front();
+    worklist.pop();
+    if(seen.find(bb) == seen.end()) {
+      BasicBlock* bbtarget = cast<BasicBlock>(Translate(bb));
+      BasicBlock::InstListType& instlist = bbtarget->getInstList();
+      for (BasicBlock::iterator iib = bb->begin(), iie = bb->end(); 
+           iib != iie; ++iib) {
+        //NumOldInst++;
+        if (!LI.isProfiling(&*iib)) {
+          Instruction* i = cast<Instruction>(Translate(iib));
+          instlist.insert(bbtarget->end(), i);
+        }
+      }
+      //updated search state;
+      seen.insert(bb);
+      TerminatorInst* ti = bb->getTerminator();
+      for (unsigned x = 0; x < ti->getNumSuccessors(); ++x) {
+        BasicBlock* bbs = ti->getSuccessor(x);
+        if (seen.find(bbs) == seen.end()) {
+          worklist.push(bbs);
+        }
+      }
+    }
+  }
+}
+
+void ProfilerRS::ProcessBackEdge(BasicBlock* src, BasicBlock* dst, Function& F) {
+  //given a backedge from B -> A, and translations A' and B',
+  //a: insert C and C'
+  //b: add branches in C to A and A' and in C' to A and A'
+  //c: mod terminators@B, replace A with C
+  //d: mod terminators@B', replace A' with C'
+  //e: mod phis@A for pred B to be pred C
+  //       if multiple entries, simplify to one
+  //f: mod phis@A' for pred B' to be pred C'
+  //       if multiple entries, simplify to one
+  //g: for all phis@A with pred C using x
+  //       add in edge from C' using x'
+  //       add in edge from C using x in A'
+  
+  //a:
+  Function::iterator BBN = src; ++BBN;
+  BasicBlock* bbC = BasicBlock::Create("choice", &F, BBN);
+  //ChoicePoints.insert(bbC);
+  BBN = cast<BasicBlock>(Translate(src));
+  BasicBlock* bbCp = BasicBlock::Create("choice", &F, ++BBN);
+  ChoicePoints.insert(bbCp);
+  
+  //b:
+  BranchInst::Create(cast<BasicBlock>(Translate(dst)), bbC);
+  BranchInst::Create(dst, cast<BasicBlock>(Translate(dst)), 
+                     ConstantInt::get(Type::Int1Ty, true), bbCp);
+  //c:
+  {
+    TerminatorInst* iB = src->getTerminator();
+    for (unsigned x = 0; x < iB->getNumSuccessors(); ++x)
+      if (iB->getSuccessor(x) == dst)
+        iB->setSuccessor(x, bbC);
+  }
+  //d:
+  {
+    TerminatorInst* iBp = cast<TerminatorInst>(Translate(src->getTerminator()));
+    for (unsigned x = 0; x < iBp->getNumSuccessors(); ++x)
+      if (iBp->getSuccessor(x) == cast<BasicBlock>(Translate(dst)))
+        iBp->setSuccessor(x, bbCp);
+  }
+  //e:
+  ReplacePhiPred(dst, src, bbC);
+  //src could be a switch, in which case we are replacing several edges with one
+  //thus collapse those edges int the Phi
+  CollapsePhi(dst, bbC);
+  //f:
+  ReplacePhiPred(cast<BasicBlock>(Translate(dst)),
+                 cast<BasicBlock>(Translate(src)),bbCp);
+  CollapsePhi(cast<BasicBlock>(Translate(dst)), bbCp);
+  //g:
+  for(BasicBlock::iterator ib = dst->begin(), ie = dst->end(); ib != ie;
+      ++ib)
+    if (PHINode* phi = dyn_cast<PHINode>(&*ib)) {
+      for(unsigned x = 0; x < phi->getNumIncomingValues(); ++x)
+        if(bbC == phi->getIncomingBlock(x)) {
+          phi->addIncoming(Translate(phi->getIncomingValue(x)), bbCp);
+          cast<PHINode>(Translate(phi))->addIncoming(phi->getIncomingValue(x), 
+                                                     bbC);
+        }
+      phi->removeIncomingValue(bbC);
+    }
+}
+
+bool ProfilerRS::runOnFunction(Function& F) {
+  if (!F.isDeclaration()) {
+    std::set<std::pair<BasicBlock*, BasicBlock*> > BackEdges;
+    RSProfilers& LI = getAnalysis<RSProfilers>();
+    
+    getBackEdges(F, BackEdges);
+    Duplicate(F, LI);
+    //assume that stuff worked.  now connect the duplicated basic blocks 
+    //with the originals in such a way as to preserve ssa.  yuk!
+    for (std::set<std::pair<BasicBlock*, BasicBlock*> >::iterator 
+           ib = BackEdges.begin(), ie = BackEdges.end(); ib != ie; ++ib)
+      ProcessBackEdge(ib->first, ib->second, F);
+    
+    //oh, and add the edge from the reg2mem created entry node to the 
+    //duplicated second node
+    TerminatorInst* T = F.getEntryBlock().getTerminator();
+    ReplaceInstWithInst(T, BranchInst::Create(T->getSuccessor(0),
+                                              cast<BasicBlock>(
+                                                Translate(T->getSuccessor(0))),
+                                              ConstantInt::get(Type::Int1Ty,
+                                                               true)));
+    
+    //do whatever is needed now that the function is duplicated
+    c->PrepFunction(&F);
+    
+    //add entry node to choice points
+    ChoicePoints.insert(&F.getEntryBlock());
+    
+    for (std::set<BasicBlock*>::iterator 
+           ii = ChoicePoints.begin(), ie = ChoicePoints.end(); ii != ie; ++ii)
+      c->ProcessChoicePoint(*ii);
+    
+    ChoicePoints.clear();
+    TransCache.clear();
+    
+    return true;
+  }
+  return false;
+}
+
+bool ProfilerRS::doInitialization(Module &M) {
+  switch (RandomMethod) {
+  case GBV:
+    c = new GlobalRandomCounter(M, Type::Int32Ty, (1 << 14) - 1);
+    break;
+  case GBVO:
+    c = new GlobalRandomCounterOpt(M, Type::Int32Ty, (1 << 14) - 1);
+    break;
+  case HOSTCC:
+    c = new CycleCounter(M, (1 << 14) - 1);
+    break;
+  };
+  return true;
+}
+
+void ProfilerRS::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<RSProfilers>();
+  AU.addRequiredID(DemoteRegisterToMemoryID);
+}
+
+///////////////////////////////////////
+// Utilities:
+///////////////////////////////////////
+static void ReplacePhiPred(BasicBlock* btarget, 
+                           BasicBlock* bold, BasicBlock* bnew) {
+  for(BasicBlock::iterator ib = btarget->begin(), ie = btarget->end();
+      ib != ie; ++ib)
+    if (PHINode* phi = dyn_cast<PHINode>(&*ib)) {
+      for(unsigned x = 0; x < phi->getNumIncomingValues(); ++x)
+        if(bold == phi->getIncomingBlock(x))
+          phi->setIncomingBlock(x, bnew);
+    }
+}
+
+static void CollapsePhi(BasicBlock* btarget, BasicBlock* bsrc) {
+  for(BasicBlock::iterator ib = btarget->begin(), ie = btarget->end();
+      ib != ie; ++ib)
+    if (PHINode* phi = dyn_cast<PHINode>(&*ib)) {
+      std::map<BasicBlock*, Value*> counter;
+      for(unsigned i = 0; i < phi->getNumIncomingValues(); ) {
+        if (counter[phi->getIncomingBlock(i)]) {
+          assert(phi->getIncomingValue(i) == counter[phi->getIncomingBlock(i)]);
+          phi->removeIncomingValue(i, false);
+        } else {
+          counter[phi->getIncomingBlock(i)] = phi->getIncomingValue(i);
+          ++i;
+        }
+      }
+    } 
+}
+
+template<class T>
+static void recBackEdge(BasicBlock* bb, T& BackEdges, 
+                        std::map<BasicBlock*, int>& color,
+                        std::map<BasicBlock*, int>& depth,
+                        std::map<BasicBlock*, int>& finish,
+                        int& time)
+{
+  color[bb] = 1;
+  ++time;
+  depth[bb] = time;
+  TerminatorInst* t= bb->getTerminator();
+  for(unsigned i = 0; i < t->getNumSuccessors(); ++i) {
+    BasicBlock* bbnew = t->getSuccessor(i);
+    if (color[bbnew] == 0)
+      recBackEdge(bbnew, BackEdges, color, depth, finish, time);
+    else if (color[bbnew] == 1) {
+      BackEdges.insert(std::make_pair(bb, bbnew));
+      //NumBackEdges++;
+    }
+  }
+  color[bb] = 2;
+  ++time;
+  finish[bb] = time;
+}
+
+
+
+//find the back edges and where they go to
+template<class T>
+static void getBackEdges(Function& F, T& BackEdges) {
+  std::map<BasicBlock*, int> color;
+  std::map<BasicBlock*, int> depth;
+  std::map<BasicBlock*, int> finish;
+  int time = 0;
+  recBackEdge(&F.getEntryBlock(), BackEdges, color, depth, finish, time);
+  DOUT << F.getName() << " " << BackEdges.size() << "\n";
+}
+
+
+//Creation functions
+ModulePass* llvm::createNullProfilerRSPass() {
+  return new NullProfilerRS();
+}
+
+FunctionPass* llvm::createRSProfilingPass() {
+  return new ProfilerRS();
+}
diff --git a/lib/Transforms/Instrumentation/RSProfiling.h b/lib/Transforms/Instrumentation/RSProfiling.h
new file mode 100644
index 0000000..8bbe7c7
--- /dev/null
+++ b/lib/Transforms/Instrumentation/RSProfiling.h
@@ -0,0 +1,31 @@
+//===- RSProfiling.h - Various profiling using random sampling ----------===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// See notes in RSProfiling.cpp
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/RSProfiling.h"
+#include <set>
+
+namespace llvm {
+  /// RSProfilers_std - a simple support class for profilers that handles most
+  /// of the work of chaining and tracking inserted code.
+  struct RSProfilers_std : public RSProfilers {
+    static char ID;
+    std::set<Value*> profcode;
+    // Lookup up values in profcode
+    virtual bool isProfiling(Value* v);
+    // handles required chaining
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+    // places counter updates in basic blocks and recordes added instructions in
+    // profcode
+    void IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum,
+                                 GlobalValue *CounterArray);
+  };
+}
diff --git a/lib/Transforms/Makefile b/lib/Transforms/Makefile
new file mode 100644
index 0000000..5fe1eeb
--- /dev/null
+++ b/lib/Transforms/Makefile
@@ -0,0 +1,20 @@
+##===- lib/Transforms/Makefile -----------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../..
+PARALLEL_DIRS = Utils Instrumentation Scalar IPO Hello
+
+include $(LEVEL)/Makefile.config
+
+# No support for plugins on windows targets
+ifeq ($(OS), $(filter $(OS), Cygwin MingW))
+  PARALLEL_DIRS := $(filter-out Hello, $(PARALLEL_DIRS))
+endif
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp
new file mode 100644
index 0000000..9c55f66
--- /dev/null
+++ b/lib/Transforms/Scalar/ADCE.cpp
@@ -0,0 +1,98 @@
+//===- DCE.cpp - Code to perform dead code elimination --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Aggressive Dead Code Elimination pass.  This pass
+// optimistically assumes that all instructions are dead until proven otherwise,
+// allowing it to eliminate dead computations that other DCE passes do not 
+// catch, particularly involving loop computations.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "adce"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+
+using namespace llvm;
+
+STATISTIC(NumRemoved, "Number of instructions removed");
+
+namespace {
+  struct VISIBILITY_HIDDEN ADCE : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    ADCE() : FunctionPass(&ID) {}
+    
+    virtual bool runOnFunction(Function& F);
+    
+    virtual void getAnalysisUsage(AnalysisUsage& AU) const {
+      AU.setPreservesCFG();
+    }
+    
+  };
+}
+
+char ADCE::ID = 0;
+static RegisterPass<ADCE> X("adce", "Aggressive Dead Code Elimination");
+
+bool ADCE::runOnFunction(Function& F) {
+  SmallPtrSet<Instruction*, 128> alive;
+  SmallVector<Instruction*, 128> worklist;
+  
+  // Collect the set of "root" instructions that are known live.
+  for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
+    if (isa<TerminatorInst>(I.getInstructionIterator()) ||
+        isa<DbgInfoIntrinsic>(I.getInstructionIterator()) ||
+        I->mayHaveSideEffects()) {
+      alive.insert(I.getInstructionIterator());
+      worklist.push_back(I.getInstructionIterator());
+    }
+  
+  // Propagate liveness backwards to operands.
+  while (!worklist.empty()) {
+    Instruction* curr = worklist.back();
+    worklist.pop_back();
+    
+    for (Instruction::op_iterator OI = curr->op_begin(), OE = curr->op_end();
+         OI != OE; ++OI)
+      if (Instruction* Inst = dyn_cast<Instruction>(OI))
+        if (alive.insert(Inst))
+          worklist.push_back(Inst);
+  }
+  
+  // The inverse of the live set is the dead set.  These are those instructions
+  // which have no side effects and do not influence the control flow or return
+  // value of the function, and may therefore be deleted safely.
+  // NOTE: We reuse the worklist vector here for memory efficiency.
+  for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
+    if (!alive.count(I.getInstructionIterator())) {
+      worklist.push_back(I.getInstructionIterator());
+      I->dropAllReferences();
+    }
+  
+  for (SmallVector<Instruction*, 1024>::iterator I = worklist.begin(),
+       E = worklist.end(); I != E; ++I) {
+    NumRemoved++;
+    (*I)->eraseFromParent();
+  }
+
+  return !worklist.empty();
+}
+
+FunctionPass *llvm::createAggressiveDCEPass() {
+  return new ADCE();
+}
diff --git a/lib/Transforms/Scalar/BasicBlockPlacement.cpp b/lib/Transforms/Scalar/BasicBlockPlacement.cpp
new file mode 100644
index 0000000..fb9b880
--- /dev/null
+++ b/lib/Transforms/Scalar/BasicBlockPlacement.cpp
@@ -0,0 +1,148 @@
+//===-- BasicBlockPlacement.cpp - Basic Block Code Layout optimization ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a very simple profile guided basic block placement
+// algorithm.  The idea is to put frequently executed blocks together at the
+// start of the function, and hopefully increase the number of fall-through
+// conditional branches.  If there is no profile information for a particular
+// function, this pass basically orders blocks in depth-first order
+//
+// The algorithm implemented here is basically "Algo1" from "Profile Guided Code
+// Positioning" by Pettis and Hansen, except that it uses basic block counts
+// instead of edge counts.  This should be improved in many ways, but is very
+// simple for now.
+//
+// Basically we "place" the entry block, then loop over all successors in a DFO,
+// placing the most frequently executed successor until we run out of blocks.  I
+// told you this was _extremely_ simplistic. :) This is also much slower than it
+// could be.  When it becomes important, this pass will be rewritten to use a
+// better algorithm, and then we can worry about efficiency.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "block-placement"
+#include "llvm/Analysis/ProfileInfo.h"
+#include "llvm/Function.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Transforms/Scalar.h"
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumMoved, "Number of basic blocks moved");
+
+namespace {
+  struct VISIBILITY_HIDDEN BlockPlacement : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    BlockPlacement() : FunctionPass(&ID) {}
+
+    virtual bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<ProfileInfo>();
+      //AU.addPreserved<ProfileInfo>();  // Does this work?
+    }
+  private:
+    /// PI - The profile information that is guiding us.
+    ///
+    ProfileInfo *PI;
+
+    /// NumMovedBlocks - Every time we move a block, increment this counter.
+    ///
+    unsigned NumMovedBlocks;
+
+    /// PlacedBlocks - Every time we place a block, remember it so we don't get
+    /// into infinite loops.
+    std::set<BasicBlock*> PlacedBlocks;
+
+    /// InsertPos - This an iterator to the next place we want to insert a
+    /// block.
+    Function::iterator InsertPos;
+
+    /// PlaceBlocks - Recursively place the specified blocks and any unplaced
+    /// successors.
+    void PlaceBlocks(BasicBlock *BB);
+  };
+}
+
+char BlockPlacement::ID = 0;
+static RegisterPass<BlockPlacement>
+X("block-placement", "Profile Guided Basic Block Placement");
+
+FunctionPass *llvm::createBlockPlacementPass() { return new BlockPlacement(); }
+
+bool BlockPlacement::runOnFunction(Function &F) {
+  PI = &getAnalysis<ProfileInfo>();
+
+  NumMovedBlocks = 0;
+  InsertPos = F.begin();
+
+  // Recursively place all blocks.
+  PlaceBlocks(F.begin());
+
+  PlacedBlocks.clear();
+  NumMoved += NumMovedBlocks;
+  return NumMovedBlocks != 0;
+}
+
+
+/// PlaceBlocks - Recursively place the specified blocks and any unplaced
+/// successors.
+void BlockPlacement::PlaceBlocks(BasicBlock *BB) {
+  assert(!PlacedBlocks.count(BB) && "Already placed this block!");
+  PlacedBlocks.insert(BB);
+
+  // Place the specified block.
+  if (&*InsertPos != BB) {
+    // Use splice to move the block into the right place.  This avoids having to
+    // remove the block from the function then readd it, which causes a bunch of
+    // symbol table traffic that is entirely pointless.
+    Function::BasicBlockListType &Blocks = BB->getParent()->getBasicBlockList();
+    Blocks.splice(InsertPos, Blocks, BB);
+
+    ++NumMovedBlocks;
+  } else {
+    // This block is already in the right place, we don't have to do anything.
+    ++InsertPos;
+  }
+
+  // Keep placing successors until we run out of ones to place.  Note that this
+  // loop is very inefficient (N^2) for blocks with many successors, like switch
+  // statements.  FIXME!
+  while (1) {
+    // Okay, now place any unplaced successors.
+    succ_iterator SI = succ_begin(BB), E = succ_end(BB);
+
+    // Scan for the first unplaced successor.
+    for (; SI != E && PlacedBlocks.count(*SI); ++SI)
+      /*empty*/;
+    if (SI == E) return;  // No more successors to place.
+
+    unsigned MaxExecutionCount = PI->getExecutionCount(*SI);
+    BasicBlock *MaxSuccessor = *SI;
+
+    // Scan for more frequently executed successors
+    for (; SI != E; ++SI)
+      if (!PlacedBlocks.count(*SI)) {
+        unsigned Count = PI->getExecutionCount(*SI);
+        if (Count > MaxExecutionCount ||
+            // Prefer to not disturb the code.
+            (Count == MaxExecutionCount && *SI == &*InsertPos)) {
+          MaxExecutionCount = Count;
+          MaxSuccessor = *SI;
+        }
+      }
+
+    // Now that we picked the maximally executed successor, place it.
+    PlaceBlocks(MaxSuccessor);
+  }
+}
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
new file mode 100644
index 0000000..7a7c48b
--- /dev/null
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -0,0 +1,33 @@
+add_llvm_library(LLVMScalarOpts
+  ADCE.cpp
+  BasicBlockPlacement.cpp
+  CodeGenPrepare.cpp
+  CondPropagate.cpp
+  ConstantProp.cpp
+  DCE.cpp
+  DeadStoreElimination.cpp
+  GVN.cpp
+  GVNPRE.cpp
+  IndVarSimplify.cpp
+  InstructionCombining.cpp
+  JumpThreading.cpp
+  LICM.cpp
+  LoopDeletion.cpp
+  LoopIndexSplit.cpp
+  LoopRotation.cpp
+  LoopStrengthReduce.cpp
+  LoopUnroll.cpp
+  LoopUnswitch.cpp
+  MemCpyOptimizer.cpp
+  PredicateSimplifier.cpp
+  Reassociate.cpp
+  Reg2Mem.cpp
+  SCCP.cpp
+  Scalar.cpp
+  ScalarReplAggregates.cpp
+  SimplifyCFGPass.cpp
+  SimplifyHalfPowrLibCalls.cpp
+  SimplifyLibCalls.cpp
+  TailDuplication.cpp
+  TailRecursionElimination.cpp
+  )
diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp
new file mode 100644
index 0000000..342b1e5
--- /dev/null
+++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp
@@ -0,0 +1,873 @@
+//===- CodeGenPrepare.cpp - Prepare a function for code generation --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass munges the code in the input function to better prepare it for
+// SelectionDAG-based code generation. This works around limitations in it's
+// basic-block-at-a-time approach. It should eventually be removed.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "codegenprepare"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/AddrModeMatcher.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/PatternMatch.h"
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+static cl::opt<bool> FactorCommonPreds("split-critical-paths-tweak",
+                                       cl::init(false), cl::Hidden);
+
+namespace {
+  class VISIBILITY_HIDDEN CodeGenPrepare : public FunctionPass {
+    /// TLI - Keep a pointer of a TargetLowering to consult for determining
+    /// transformation profitability.
+    const TargetLowering *TLI;
+
+    /// BackEdges - Keep a set of all the loop back edges.
+    ///
+    SmallSet<std::pair<const BasicBlock*, const BasicBlock*>, 8> BackEdges;
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    explicit CodeGenPrepare(const TargetLowering *tli = 0)
+      : FunctionPass(&ID), TLI(tli) {}
+    bool runOnFunction(Function &F);
+
+  private:
+    bool EliminateMostlyEmptyBlocks(Function &F);
+    bool CanMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
+    void EliminateMostlyEmptyBlock(BasicBlock *BB);
+    bool OptimizeBlock(BasicBlock &BB);
+    bool OptimizeMemoryInst(Instruction *I, Value *Addr, const Type *AccessTy,
+                            DenseMap<Value*,Value*> &SunkAddrs);
+    bool OptimizeInlineAsmInst(Instruction *I, CallSite CS,
+                               DenseMap<Value*,Value*> &SunkAddrs);
+    bool OptimizeExtUses(Instruction *I);
+    void findLoopBackEdges(const Function &F);
+  };
+}
+
+char CodeGenPrepare::ID = 0;
+static RegisterPass<CodeGenPrepare> X("codegenprepare",
+                                      "Optimize for code generation");
+
+FunctionPass *llvm::createCodeGenPreparePass(const TargetLowering *TLI) {
+  return new CodeGenPrepare(TLI);
+}
+
+/// findLoopBackEdges - Do a DFS walk to find loop back edges.
+///
+void CodeGenPrepare::findLoopBackEdges(const Function &F) {
+  SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> Edges;
+  FindFunctionBackedges(F, Edges);
+  
+  BackEdges.insert(Edges.begin(), Edges.end());
+}
+
+
+bool CodeGenPrepare::runOnFunction(Function &F) {
+  bool EverMadeChange = false;
+
+  // First pass, eliminate blocks that contain only PHI nodes and an
+  // unconditional branch.
+  EverMadeChange |= EliminateMostlyEmptyBlocks(F);
+
+  // Now find loop back edges.
+  findLoopBackEdges(F);
+
+  bool MadeChange = true;
+  while (MadeChange) {
+    MadeChange = false;
+    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+      MadeChange |= OptimizeBlock(*BB);
+    EverMadeChange |= MadeChange;
+  }
+  return EverMadeChange;
+}
+
+/// EliminateMostlyEmptyBlocks - eliminate blocks that contain only PHI nodes,
+/// debug info directives, and an unconditional branch.  Passes before isel
+/// (e.g. LSR/loopsimplify) often split edges in ways that are non-optimal for
+/// isel.  Start by eliminating these blocks so we can split them the way we
+/// want them.
+bool CodeGenPrepare::EliminateMostlyEmptyBlocks(Function &F) {
+  bool MadeChange = false;
+  // Note that this intentionally skips the entry block.
+  for (Function::iterator I = ++F.begin(), E = F.end(); I != E; ) {
+    BasicBlock *BB = I++;
+
+    // If this block doesn't end with an uncond branch, ignore it.
+    BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
+    if (!BI || !BI->isUnconditional())
+      continue;
+
+    // If the instruction before the branch (skipping debug info) isn't a phi
+    // node, then other stuff is happening here.
+    BasicBlock::iterator BBI = BI;
+    if (BBI != BB->begin()) {
+      --BBI;
+      while (isa<DbgInfoIntrinsic>(BBI)) {
+        if (BBI == BB->begin())
+          break;
+        --BBI;
+      }
+      if (!isa<DbgInfoIntrinsic>(BBI) && !isa<PHINode>(BBI))
+        continue;
+    }
+
+    // Do not break infinite loops.
+    BasicBlock *DestBB = BI->getSuccessor(0);
+    if (DestBB == BB)
+      continue;
+
+    if (!CanMergeBlocks(BB, DestBB))
+      continue;
+
+    EliminateMostlyEmptyBlock(BB);
+    MadeChange = true;
+  }
+  return MadeChange;
+}
+
+/// CanMergeBlocks - Return true if we can merge BB into DestBB if there is a
+/// single uncond branch between them, and BB contains no other non-phi
+/// instructions.
+bool CodeGenPrepare::CanMergeBlocks(const BasicBlock *BB,
+                                    const BasicBlock *DestBB) const {
+  // We only want to eliminate blocks whose phi nodes are used by phi nodes in
+  // the successor.  If there are more complex condition (e.g. preheaders),
+  // don't mess around with them.
+  BasicBlock::const_iterator BBI = BB->begin();
+  while (const PHINode *PN = dyn_cast<PHINode>(BBI++)) {
+    for (Value::use_const_iterator UI = PN->use_begin(), E = PN->use_end();
+         UI != E; ++UI) {
+      const Instruction *User = cast<Instruction>(*UI);
+      if (User->getParent() != DestBB || !isa<PHINode>(User))
+        return false;
+      // If User is inside DestBB block and it is a PHINode then check
+      // incoming value. If incoming value is not from BB then this is
+      // a complex condition (e.g. preheaders) we want to avoid here.
+      if (User->getParent() == DestBB) {
+        if (const PHINode *UPN = dyn_cast<PHINode>(User))
+          for (unsigned I = 0, E = UPN->getNumIncomingValues(); I != E; ++I) {
+            Instruction *Insn = dyn_cast<Instruction>(UPN->getIncomingValue(I));
+            if (Insn && Insn->getParent() == BB &&
+                Insn->getParent() != UPN->getIncomingBlock(I))
+              return false;
+          }
+      }
+    }
+  }
+
+  // If BB and DestBB contain any common predecessors, then the phi nodes in BB
+  // and DestBB may have conflicting incoming values for the block.  If so, we
+  // can't merge the block.
+  const PHINode *DestBBPN = dyn_cast<PHINode>(DestBB->begin());
+  if (!DestBBPN) return true;  // no conflict.
+
+  // Collect the preds of BB.
+  SmallPtrSet<const BasicBlock*, 16> BBPreds;
+  if (const PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
+    // It is faster to get preds from a PHI than with pred_iterator.
+    for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
+      BBPreds.insert(BBPN->getIncomingBlock(i));
+  } else {
+    BBPreds.insert(pred_begin(BB), pred_end(BB));
+  }
+
+  // Walk the preds of DestBB.
+  for (unsigned i = 0, e = DestBBPN->getNumIncomingValues(); i != e; ++i) {
+    BasicBlock *Pred = DestBBPN->getIncomingBlock(i);
+    if (BBPreds.count(Pred)) {   // Common predecessor?
+      BBI = DestBB->begin();
+      while (const PHINode *PN = dyn_cast<PHINode>(BBI++)) {
+        const Value *V1 = PN->getIncomingValueForBlock(Pred);
+        const Value *V2 = PN->getIncomingValueForBlock(BB);
+
+        // If V2 is a phi node in BB, look up what the mapped value will be.
+        if (const PHINode *V2PN = dyn_cast<PHINode>(V2))
+          if (V2PN->getParent() == BB)
+            V2 = V2PN->getIncomingValueForBlock(Pred);
+
+        // If there is a conflict, bail out.
+        if (V1 != V2) return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+
+/// EliminateMostlyEmptyBlock - Eliminate a basic block that have only phi's and
+/// an unconditional branch in it.
+void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) {
+  BranchInst *BI = cast<BranchInst>(BB->getTerminator());
+  BasicBlock *DestBB = BI->getSuccessor(0);
+
+  DOUT << "MERGING MOSTLY EMPTY BLOCKS - BEFORE:\n" << *BB << *DestBB;
+
+  // If the destination block has a single pred, then this is a trivial edge,
+  // just collapse it.
+  if (BasicBlock *SinglePred = DestBB->getSinglePredecessor()) {
+    if (SinglePred != DestBB) {
+      // Remember if SinglePred was the entry block of the function.  If so, we
+      // will need to move BB back to the entry position.
+      bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
+      MergeBasicBlockIntoOnlyPred(DestBB);
+
+      if (isEntry && BB != &BB->getParent()->getEntryBlock())
+        BB->moveBefore(&BB->getParent()->getEntryBlock());
+      
+      DOUT << "AFTER:\n" << *DestBB << "\n\n\n";
+      return;
+    }
+  }
+
+  // Otherwise, we have multiple predecessors of BB.  Update the PHIs in DestBB
+  // to handle the new incoming edges it is about to have.
+  PHINode *PN;
+  for (BasicBlock::iterator BBI = DestBB->begin();
+       (PN = dyn_cast<PHINode>(BBI)); ++BBI) {
+    // Remove the incoming value for BB, and remember it.
+    Value *InVal = PN->removeIncomingValue(BB, false);
+
+    // Two options: either the InVal is a phi node defined in BB or it is some
+    // value that dominates BB.
+    PHINode *InValPhi = dyn_cast<PHINode>(InVal);
+    if (InValPhi && InValPhi->getParent() == BB) {
+      // Add all of the input values of the input PHI as inputs of this phi.
+      for (unsigned i = 0, e = InValPhi->getNumIncomingValues(); i != e; ++i)
+        PN->addIncoming(InValPhi->getIncomingValue(i),
+                        InValPhi->getIncomingBlock(i));
+    } else {
+      // Otherwise, add one instance of the dominating value for each edge that
+      // we will be adding.
+      if (PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
+        for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
+          PN->addIncoming(InVal, BBPN->getIncomingBlock(i));
+      } else {
+        for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+          PN->addIncoming(InVal, *PI);
+      }
+    }
+  }
+
+  // The PHIs are now updated, change everything that refers to BB to use
+  // DestBB and remove BB.
+  BB->replaceAllUsesWith(DestBB);
+  BB->eraseFromParent();
+
+  DOUT << "AFTER:\n" << *DestBB << "\n\n\n";
+}
+
+
+/// SplitEdgeNicely - Split the critical edge from TI to its specified
+/// successor if it will improve codegen.  We only do this if the successor has
+/// phi nodes (otherwise critical edges are ok).  If there is already another
+/// predecessor of the succ that is empty (and thus has no phi nodes), use it
+/// instead of introducing a new block.
+static void SplitEdgeNicely(TerminatorInst *TI, unsigned SuccNum,
+                     SmallSet<std::pair<const BasicBlock*,
+                                        const BasicBlock*>, 8> &BackEdges,
+                             Pass *P) {
+  BasicBlock *TIBB = TI->getParent();
+  BasicBlock *Dest = TI->getSuccessor(SuccNum);
+  assert(isa<PHINode>(Dest->begin()) &&
+         "This should only be called if Dest has a PHI!");
+
+  // Do not split edges to EH landing pads.
+  if (InvokeInst *Invoke = dyn_cast<InvokeInst>(TI)) {
+    if (Invoke->getSuccessor(1) == Dest)
+      return;
+  }
+
+  // As a hack, never split backedges of loops.  Even though the copy for any
+  // PHIs inserted on the backedge would be dead for exits from the loop, we
+  // assume that the cost of *splitting* the backedge would be too high.
+  if (BackEdges.count(std::make_pair(TIBB, Dest)))
+    return;
+
+  if (!FactorCommonPreds) {
+    /// TIPHIValues - This array is lazily computed to determine the values of
+    /// PHIs in Dest that TI would provide.
+    SmallVector<Value*, 32> TIPHIValues;
+
+    // Check to see if Dest has any blocks that can be used as a split edge for
+    // this terminator.
+    for (pred_iterator PI = pred_begin(Dest), E = pred_end(Dest); PI != E; ++PI) {
+      BasicBlock *Pred = *PI;
+      // To be usable, the pred has to end with an uncond branch to the dest.
+      BranchInst *PredBr = dyn_cast<BranchInst>(Pred->getTerminator());
+      if (!PredBr || !PredBr->isUnconditional())
+        continue;
+      // Must be empty other than the branch and debug info.
+      BasicBlock::iterator I = Pred->begin();
+      while (isa<DbgInfoIntrinsic>(I))
+        I++;
+      if (dyn_cast<Instruction>(I) != PredBr)
+        continue;
+      // Cannot be the entry block; its label does not get emitted.
+      if (Pred == &(Dest->getParent()->getEntryBlock()))
+        continue;
+
+      // Finally, since we know that Dest has phi nodes in it, we have to make
+      // sure that jumping to Pred will have the same effect as going to Dest in
+      // terms of PHI values.
+      PHINode *PN;
+      unsigned PHINo = 0;
+      bool FoundMatch = true;
+      for (BasicBlock::iterator I = Dest->begin();
+           (PN = dyn_cast<PHINode>(I)); ++I, ++PHINo) {
+        if (PHINo == TIPHIValues.size())
+          TIPHIValues.push_back(PN->getIncomingValueForBlock(TIBB));
+
+        // If the PHI entry doesn't work, we can't use this pred.
+        if (TIPHIValues[PHINo] != PN->getIncomingValueForBlock(Pred)) {
+          FoundMatch = false;
+          break;
+        }
+      }
+
+      // If we found a workable predecessor, change TI to branch to Succ.
+      if (FoundMatch) {
+        Dest->removePredecessor(TIBB);
+        TI->setSuccessor(SuccNum, Pred);
+        return;
+      }
+    }
+
+    SplitCriticalEdge(TI, SuccNum, P, true);
+    return;
+  }
+
+  PHINode *PN;
+  SmallVector<Value*, 8> TIPHIValues;
+  for (BasicBlock::iterator I = Dest->begin();
+       (PN = dyn_cast<PHINode>(I)); ++I)
+    TIPHIValues.push_back(PN->getIncomingValueForBlock(TIBB));
+
+  SmallVector<BasicBlock*, 8> IdenticalPreds;
+  for (pred_iterator PI = pred_begin(Dest), E = pred_end(Dest); PI != E; ++PI) {
+    BasicBlock *Pred = *PI;
+    if (BackEdges.count(std::make_pair(Pred, Dest)))
+      continue;
+    if (PI == TIBB)
+      IdenticalPreds.push_back(Pred);
+    else {
+      bool Identical = true;
+      unsigned PHINo = 0;
+      for (BasicBlock::iterator I = Dest->begin();
+           (PN = dyn_cast<PHINode>(I)); ++I, ++PHINo)
+        if (TIPHIValues[PHINo] != PN->getIncomingValueForBlock(Pred)) {
+          Identical = false;
+          break;
+        }
+      if (Identical)
+        IdenticalPreds.push_back(Pred);
+    }
+  }
+
+  assert(!IdenticalPreds.empty());
+  SplitBlockPredecessors(Dest, &IdenticalPreds[0], IdenticalPreds.size(),
+                         ".critedge", P);
+}
+
+
+/// OptimizeNoopCopyExpression - If the specified cast instruction is a noop
+/// copy (e.g. it's casting from one pointer type to another, int->uint, or
+/// int->sbyte on PPC), sink it into user blocks to reduce the number of virtual
+/// registers that must be created and coalesced.
+///
+/// Return true if any changes are made.
+///
+static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){
+  // If this is a noop copy,
+  MVT SrcVT = TLI.getValueType(CI->getOperand(0)->getType());
+  MVT DstVT = TLI.getValueType(CI->getType());
+
+  // This is an fp<->int conversion?
+  if (SrcVT.isInteger() != DstVT.isInteger())
+    return false;
+
+  // If this is an extension, it will be a zero or sign extension, which
+  // isn't a noop.
+  if (SrcVT.bitsLT(DstVT)) return false;
+
+  // If these values will be promoted, find out what they will be promoted
+  // to.  This helps us consider truncates on PPC as noop copies when they
+  // are.
+  if (TLI.getTypeAction(SrcVT) == TargetLowering::Promote)
+    SrcVT = TLI.getTypeToTransformTo(SrcVT);
+  if (TLI.getTypeAction(DstVT) == TargetLowering::Promote)
+    DstVT = TLI.getTypeToTransformTo(DstVT);
+
+  // If, after promotion, these are the same types, this is a noop copy.
+  if (SrcVT != DstVT)
+    return false;
+
+  BasicBlock *DefBB = CI->getParent();
+
+  /// InsertedCasts - Only insert a cast in each block once.
+  DenseMap<BasicBlock*, CastInst*> InsertedCasts;
+
+  bool MadeChange = false;
+  for (Value::use_iterator UI = CI->use_begin(), E = CI->use_end();
+       UI != E; ) {
+    Use &TheUse = UI.getUse();
+    Instruction *User = cast<Instruction>(*UI);
+
+    // Figure out which BB this cast is used in.  For PHI's this is the
+    // appropriate predecessor block.
+    BasicBlock *UserBB = User->getParent();
+    if (PHINode *PN = dyn_cast<PHINode>(User)) {
+      UserBB = PN->getIncomingBlock(UI);
+    }
+
+    // Preincrement use iterator so we don't invalidate it.
+    ++UI;
+
+    // If this user is in the same block as the cast, don't change the cast.
+    if (UserBB == DefBB) continue;
+
+    // If we have already inserted a cast into this block, use it.
+    CastInst *&InsertedCast = InsertedCasts[UserBB];
+
+    if (!InsertedCast) {
+      BasicBlock::iterator InsertPt = UserBB->getFirstNonPHI();
+
+      InsertedCast =
+        CastInst::Create(CI->getOpcode(), CI->getOperand(0), CI->getType(), "",
+                         InsertPt);
+      MadeChange = true;
+    }
+
+    // Replace a use of the cast with a use of the new cast.
+    TheUse = InsertedCast;
+  }
+
+  // If we removed all uses, nuke the cast.
+  if (CI->use_empty()) {
+    CI->eraseFromParent();
+    MadeChange = true;
+  }
+
+  return MadeChange;
+}
+
+/// OptimizeCmpExpression - sink the given CmpInst into user blocks to reduce
+/// the number of virtual registers that must be created and coalesced.  This is
+/// a clear win except on targets with multiple condition code registers
+///  (PowerPC), where it might lose; some adjustment may be wanted there.
+///
+/// Return true if any changes are made.
+static bool OptimizeCmpExpression(CmpInst *CI) {
+  BasicBlock *DefBB = CI->getParent();
+
+  /// InsertedCmp - Only insert a cmp in each block once.
+  DenseMap<BasicBlock*, CmpInst*> InsertedCmps;
+
+  bool MadeChange = false;
+  for (Value::use_iterator UI = CI->use_begin(), E = CI->use_end();
+       UI != E; ) {
+    Use &TheUse = UI.getUse();
+    Instruction *User = cast<Instruction>(*UI);
+
+    // Preincrement use iterator so we don't invalidate it.
+    ++UI;
+
+    // Don't bother for PHI nodes.
+    if (isa<PHINode>(User))
+      continue;
+
+    // Figure out which BB this cmp is used in.
+    BasicBlock *UserBB = User->getParent();
+
+    // If this user is in the same block as the cmp, don't change the cmp.
+    if (UserBB == DefBB) continue;
+
+    // If we have already inserted a cmp into this block, use it.
+    CmpInst *&InsertedCmp = InsertedCmps[UserBB];
+
+    if (!InsertedCmp) {
+      BasicBlock::iterator InsertPt = UserBB->getFirstNonPHI();
+
+      InsertedCmp =
+        CmpInst::Create(CI->getOpcode(), CI->getPredicate(), CI->getOperand(0),
+                        CI->getOperand(1), "", InsertPt);
+      MadeChange = true;
+    }
+
+    // Replace a use of the cmp with a use of the new cmp.
+    TheUse = InsertedCmp;
+  }
+
+  // If we removed all uses, nuke the cmp.
+  if (CI->use_empty())
+    CI->eraseFromParent();
+
+  return MadeChange;
+}
+
+//===----------------------------------------------------------------------===//
+// Memory Optimization
+//===----------------------------------------------------------------------===//
+
+/// IsNonLocalValue - Return true if the specified values are defined in a
+/// different basic block than BB.
+static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    return I->getParent() != BB;
+  return false;
+}
+
+/// OptimizeMemoryInst - Load and Store Instructions have often have
+/// addressing modes that can do significant amounts of computation.  As such,
+/// instruction selection will try to get the load or store to do as much
+/// computation as possible for the program.  The problem is that isel can only
+/// see within a single block.  As such, we sink as much legal addressing mode
+/// stuff into the block as possible.
+///
+/// This method is used to optimize both load/store and inline asms with memory
+/// operands.
+bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
+                                        const Type *AccessTy,
+                                        DenseMap<Value*,Value*> &SunkAddrs) {
+  // Figure out what addressing mode will be built up for this operation.
+  SmallVector<Instruction*, 16> AddrModeInsts;
+  ExtAddrMode AddrMode = AddressingModeMatcher::Match(Addr, AccessTy,MemoryInst,
+                                                      AddrModeInsts, *TLI);
+
+  // Check to see if any of the instructions supersumed by this addr mode are
+  // non-local to I's BB.
+  bool AnyNonLocal = false;
+  for (unsigned i = 0, e = AddrModeInsts.size(); i != e; ++i) {
+    if (IsNonLocalValue(AddrModeInsts[i], MemoryInst->getParent())) {
+      AnyNonLocal = true;
+      break;
+    }
+  }
+
+  // If all the instructions matched are already in this BB, don't do anything.
+  if (!AnyNonLocal) {
+    DEBUG(cerr << "CGP: Found      local addrmode: " << AddrMode << "\n");
+    return false;
+  }
+
+  // Insert this computation right after this user.  Since our caller is
+  // scanning from the top of the BB to the bottom, reuse of the expr are
+  // guaranteed to happen later.
+  BasicBlock::iterator InsertPt = MemoryInst;
+
+  // Now that we determined the addressing expression we want to use and know
+  // that we have to sink it into this block.  Check to see if we have already
+  // done this for some other load/store instr in this block.  If so, reuse the
+  // computation.
+  Value *&SunkAddr = SunkAddrs[Addr];
+  if (SunkAddr) {
+    DEBUG(cerr << "CGP: Reusing nonlocal addrmode: " << AddrMode << " for "
+               << *MemoryInst);
+    if (SunkAddr->getType() != Addr->getType())
+      SunkAddr = new BitCastInst(SunkAddr, Addr->getType(), "tmp", InsertPt);
+  } else {
+    DEBUG(cerr << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
+               << *MemoryInst);
+    const Type *IntPtrTy = TLI->getTargetData()->getIntPtrType();
+
+    Value *Result = 0;
+    // Start with the scale value.
+    if (AddrMode.Scale) {
+      Value *V = AddrMode.ScaledReg;
+      if (V->getType() == IntPtrTy) {
+        // done.
+      } else if (isa<PointerType>(V->getType())) {
+        V = new PtrToIntInst(V, IntPtrTy, "sunkaddr", InsertPt);
+      } else if (cast<IntegerType>(IntPtrTy)->getBitWidth() <
+                 cast<IntegerType>(V->getType())->getBitWidth()) {
+        V = new TruncInst(V, IntPtrTy, "sunkaddr", InsertPt);
+      } else {
+        V = new SExtInst(V, IntPtrTy, "sunkaddr", InsertPt);
+      }
+      if (AddrMode.Scale != 1)
+        V = BinaryOperator::CreateMul(V, ConstantInt::get(IntPtrTy,
+                                                          AddrMode.Scale),
+                                      "sunkaddr", InsertPt);
+      Result = V;
+    }
+
+    // Add in the base register.
+    if (AddrMode.BaseReg) {
+      Value *V = AddrMode.BaseReg;
+      if (V->getType() != IntPtrTy)
+        V = new PtrToIntInst(V, IntPtrTy, "sunkaddr", InsertPt);
+      if (Result)
+        Result = BinaryOperator::CreateAdd(Result, V, "sunkaddr", InsertPt);
+      else
+        Result = V;
+    }
+
+    // Add in the BaseGV if present.
+    if (AddrMode.BaseGV) {
+      Value *V = new PtrToIntInst(AddrMode.BaseGV, IntPtrTy, "sunkaddr",
+                                  InsertPt);
+      if (Result)
+        Result = BinaryOperator::CreateAdd(Result, V, "sunkaddr", InsertPt);
+      else
+        Result = V;
+    }
+
+    // Add in the Base Offset if present.
+    if (AddrMode.BaseOffs) {
+      Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
+      if (Result)
+        Result = BinaryOperator::CreateAdd(Result, V, "sunkaddr", InsertPt);
+      else
+        Result = V;
+    }
+
+    if (Result == 0)
+      SunkAddr = Constant::getNullValue(Addr->getType());
+    else
+      SunkAddr = new IntToPtrInst(Result, Addr->getType(), "sunkaddr",InsertPt);
+  }
+
+  MemoryInst->replaceUsesOfWith(Addr, SunkAddr);
+
+  if (Addr->use_empty())
+    RecursivelyDeleteTriviallyDeadInstructions(Addr);
+  return true;
+}
+
+/// OptimizeInlineAsmInst - If there are any memory operands, use
+/// OptimizeMemoryInst to sink their address computing into the block when
+/// possible / profitable.
+bool CodeGenPrepare::OptimizeInlineAsmInst(Instruction *I, CallSite CS,
+                                           DenseMap<Value*,Value*> &SunkAddrs) {
+  bool MadeChange = false;
+  InlineAsm *IA = cast<InlineAsm>(CS.getCalledValue());
+
+  // Do a prepass over the constraints, canonicalizing them, and building up the
+  // ConstraintOperands list.
+  std::vector<InlineAsm::ConstraintInfo>
+    ConstraintInfos = IA->ParseConstraints();
+
+  /// ConstraintOperands - Information about all of the constraints.
+  std::vector<TargetLowering::AsmOperandInfo> ConstraintOperands;
+  unsigned ArgNo = 0;   // ArgNo - The argument of the CallInst.
+  for (unsigned i = 0, e = ConstraintInfos.size(); i != e; ++i) {
+    ConstraintOperands.
+      push_back(TargetLowering::AsmOperandInfo(ConstraintInfos[i]));
+    TargetLowering::AsmOperandInfo &OpInfo = ConstraintOperands.back();
+
+    // Compute the value type for each operand.
+    switch (OpInfo.Type) {
+    case InlineAsm::isOutput:
+      if (OpInfo.isIndirect)
+        OpInfo.CallOperandVal = CS.getArgument(ArgNo++);
+      break;
+    case InlineAsm::isInput:
+      OpInfo.CallOperandVal = CS.getArgument(ArgNo++);
+      break;
+    case InlineAsm::isClobber:
+      // Nothing to do.
+      break;
+    }
+
+    // Compute the constraint code and ConstraintType to use.
+    TLI->ComputeConstraintToUse(OpInfo, SDValue(),
+                             OpInfo.ConstraintType == TargetLowering::C_Memory);
+
+    if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
+        OpInfo.isIndirect) {
+      Value *OpVal = OpInfo.CallOperandVal;
+      MadeChange |= OptimizeMemoryInst(I, OpVal, OpVal->getType(), SunkAddrs);
+    }
+  }
+
+  return MadeChange;
+}
+
+bool CodeGenPrepare::OptimizeExtUses(Instruction *I) {
+  BasicBlock *DefBB = I->getParent();
+
+  // If both result of the {s|z}xt and its source are live out, rewrite all
+  // other uses of the source with result of extension.
+  Value *Src = I->getOperand(0);
+  if (Src->hasOneUse())
+    return false;
+
+  // Only do this xform if truncating is free.
+  if (TLI && !TLI->isTruncateFree(I->getType(), Src->getType()))
+    return false;
+
+  // Only safe to perform the optimization if the source is also defined in
+  // this block.
+  if (!isa<Instruction>(Src) || DefBB != cast<Instruction>(Src)->getParent())
+    return false;
+
+  bool DefIsLiveOut = false;
+  for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+       UI != E; ++UI) {
+    Instruction *User = cast<Instruction>(*UI);
+
+    // Figure out which BB this ext is used in.
+    BasicBlock *UserBB = User->getParent();
+    if (UserBB == DefBB) continue;
+    DefIsLiveOut = true;
+    break;
+  }
+  if (!DefIsLiveOut)
+    return false;
+
+  // Make sure non of the uses are PHI nodes.
+  for (Value::use_iterator UI = Src->use_begin(), E = Src->use_end();
+       UI != E; ++UI) {
+    Instruction *User = cast<Instruction>(*UI);
+    BasicBlock *UserBB = User->getParent();
+    if (UserBB == DefBB) continue;
+    // Be conservative. We don't want this xform to end up introducing
+    // reloads just before load / store instructions.
+    if (isa<PHINode>(User) || isa<LoadInst>(User) || isa<StoreInst>(User))
+      return false;
+  }
+
+  // InsertedTruncs - Only insert one trunc in each block once.
+  DenseMap<BasicBlock*, Instruction*> InsertedTruncs;
+
+  bool MadeChange = false;
+  for (Value::use_iterator UI = Src->use_begin(), E = Src->use_end();
+       UI != E; ++UI) {
+    Use &TheUse = UI.getUse();
+    Instruction *User = cast<Instruction>(*UI);
+
+    // Figure out which BB this ext is used in.
+    BasicBlock *UserBB = User->getParent();
+    if (UserBB == DefBB) continue;
+
+    // Both src and def are live in this block. Rewrite the use.
+    Instruction *&InsertedTrunc = InsertedTruncs[UserBB];
+
+    if (!InsertedTrunc) {
+      BasicBlock::iterator InsertPt = UserBB->getFirstNonPHI();
+
+      InsertedTrunc = new TruncInst(I, Src->getType(), "", InsertPt);
+    }
+
+    // Replace a use of the {s|z}ext source with a use of the result.
+    TheUse = InsertedTrunc;
+
+    MadeChange = true;
+  }
+
+  return MadeChange;
+}
+
+// In this pass we look for GEP and cast instructions that are used
+// across basic blocks and rewrite them to improve basic-block-at-a-time
+// selection.
+bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) {
+  bool MadeChange = false;
+
+  // Split all critical edges where the dest block has a PHI.
+  TerminatorInst *BBTI = BB.getTerminator();
+  if (BBTI->getNumSuccessors() > 1) {
+    for (unsigned i = 0, e = BBTI->getNumSuccessors(); i != e; ++i) {
+      BasicBlock *SuccBB = BBTI->getSuccessor(i);
+      if (isa<PHINode>(SuccBB->begin()) && isCriticalEdge(BBTI, i, true))
+        SplitEdgeNicely(BBTI, i, BackEdges, this);
+    }
+  }
+
+  // Keep track of non-local addresses that have been sunk into this block.
+  // This allows us to avoid inserting duplicate code for blocks with multiple
+  // load/stores of the same address.
+  DenseMap<Value*, Value*> SunkAddrs;
+
+  for (BasicBlock::iterator BBI = BB.begin(), E = BB.end(); BBI != E; ) {
+    Instruction *I = BBI++;
+
+    if (CastInst *CI = dyn_cast<CastInst>(I)) {
+      // If the source of the cast is a constant, then this should have
+      // already been constant folded.  The only reason NOT to constant fold
+      // it is if something (e.g. LSR) was careful to place the constant
+      // evaluation in a block other than then one that uses it (e.g. to hoist
+      // the address of globals out of a loop).  If this is the case, we don't
+      // want to forward-subst the cast.
+      if (isa<Constant>(CI->getOperand(0)))
+        continue;
+
+      bool Change = false;
+      if (TLI) {
+        Change = OptimizeNoopCopyExpression(CI, *TLI);
+        MadeChange |= Change;
+      }
+
+      if (!Change && (isa<ZExtInst>(I) || isa<SExtInst>(I)))
+        MadeChange |= OptimizeExtUses(I);
+    } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) {
+      MadeChange |= OptimizeCmpExpression(CI);
+    } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+      if (TLI)
+        MadeChange |= OptimizeMemoryInst(I, I->getOperand(0), LI->getType(),
+                                         SunkAddrs);
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+      if (TLI)
+        MadeChange |= OptimizeMemoryInst(I, SI->getOperand(1),
+                                         SI->getOperand(0)->getType(),
+                                         SunkAddrs);
+    } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
+      if (GEPI->hasAllZeroIndices()) {
+        /// The GEP operand must be a pointer, so must its result -> BitCast
+        Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(),
+                                          GEPI->getName(), GEPI);
+        GEPI->replaceAllUsesWith(NC);
+        GEPI->eraseFromParent();
+        MadeChange = true;
+        BBI = NC;
+      }
+    } else if (CallInst *CI = dyn_cast<CallInst>(I)) {
+      // If we found an inline asm expession, and if the target knows how to
+      // lower it to normal LLVM code, do so now.
+      if (TLI && isa<InlineAsm>(CI->getCalledValue()))
+        if (const TargetAsmInfo *TAI =
+            TLI->getTargetMachine().getTargetAsmInfo()) {
+          if (TAI->ExpandInlineAsm(CI)) {
+            BBI = BB.begin();
+            // Avoid processing instructions out of order, which could cause
+            // reuse before a value is defined.
+            SunkAddrs.clear();
+          } else
+            // Sink address computing for memory operands into the block.
+            MadeChange |= OptimizeInlineAsmInst(I, &(*CI), SunkAddrs);
+        }
+    }
+  }
+
+  return MadeChange;
+}
diff --git a/lib/Transforms/Scalar/CondPropagate.cpp b/lib/Transforms/Scalar/CondPropagate.cpp
new file mode 100644
index 0000000..c85d031
--- /dev/null
+++ b/lib/Transforms/Scalar/CondPropagate.cpp
@@ -0,0 +1,295 @@
+//===-- CondPropagate.cpp - Propagate Conditional Expressions -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass propagates information about conditional expressions through the
+// program, allowing it to eliminate conditional branches in some cases.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "condprop"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Type.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Streams.h"
+using namespace llvm;
+
+STATISTIC(NumBrThread, "Number of CFG edges threaded through branches");
+STATISTIC(NumSwThread, "Number of CFG edges threaded through switches");
+
+namespace {
+  struct VISIBILITY_HIDDEN CondProp : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    CondProp() : FunctionPass(&ID) {}
+
+    virtual bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequiredID(BreakCriticalEdgesID);
+      //AU.addRequired<DominanceFrontier>();
+    }
+
+  private:
+    bool MadeChange;
+    SmallVector<BasicBlock *, 4> DeadBlocks;
+    void SimplifyBlock(BasicBlock *BB);
+    void SimplifyPredecessors(BranchInst *BI);
+    void SimplifyPredecessors(SwitchInst *SI);
+    void RevectorBlockTo(BasicBlock *FromBB, BasicBlock *ToBB);
+    bool RevectorBlockTo(BasicBlock *FromBB, Value *Cond, BranchInst *BI);
+  };
+}
+  
+char CondProp::ID = 0;
+static RegisterPass<CondProp> X("condprop", "Conditional Propagation");
+
+FunctionPass *llvm::createCondPropagationPass() {
+  return new CondProp();
+}
+
+bool CondProp::runOnFunction(Function &F) {
+  bool EverMadeChange = false;
+  DeadBlocks.clear();
+
+  // While we are simplifying blocks, keep iterating.
+  do {
+    MadeChange = false;
+    for (Function::iterator BB = F.begin(), E = F.end(); BB != E;)
+      SimplifyBlock(BB++);
+    EverMadeChange = EverMadeChange || MadeChange;
+  } while (MadeChange);
+
+  if (EverMadeChange) {
+    while (!DeadBlocks.empty()) {
+      BasicBlock *BB = DeadBlocks.back(); DeadBlocks.pop_back();
+      DeleteDeadBlock(BB);
+    }
+  }
+  return EverMadeChange;
+}
+
+void CondProp::SimplifyBlock(BasicBlock *BB) {
+  if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
+    // If this is a conditional branch based on a phi node that is defined in
+    // this block, see if we can simplify predecessors of this block.
+    if (BI->isConditional() && isa<PHINode>(BI->getCondition()) &&
+        cast<PHINode>(BI->getCondition())->getParent() == BB)
+      SimplifyPredecessors(BI);
+
+  } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
+    if (isa<PHINode>(SI->getCondition()) &&
+        cast<PHINode>(SI->getCondition())->getParent() == BB)
+      SimplifyPredecessors(SI);
+  }
+
+  // If possible, simplify the terminator of this block.
+  if (ConstantFoldTerminator(BB))
+    MadeChange = true;
+
+  // If this block ends with an unconditional branch and the only successor has
+  // only this block as a predecessor, merge the two blocks together.
+  if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()))
+    if (BI->isUnconditional() && BI->getSuccessor(0)->getSinglePredecessor() &&
+        BB != BI->getSuccessor(0)) {
+      BasicBlock *Succ = BI->getSuccessor(0);
+      
+      // If Succ has any PHI nodes, they are all single-entry PHI's.  Eliminate
+      // them.
+      FoldSingleEntryPHINodes(Succ);
+      
+      // Remove BI.
+      BI->eraseFromParent();
+
+      // Move over all of the instructions.
+      BB->getInstList().splice(BB->end(), Succ->getInstList());
+
+      // Any phi nodes that had entries for Succ now have entries from BB.
+      Succ->replaceAllUsesWith(BB);
+
+      // Succ is now dead, but we cannot delete it without potentially
+      // invalidating iterators elsewhere.  Just insert an unreachable
+      // instruction in it and delete this block later on.
+      new UnreachableInst(Succ);
+      DeadBlocks.push_back(Succ);
+      MadeChange = true;
+    }
+}
+
+// SimplifyPredecessors(branches) - We know that BI is a conditional branch
+// based on a PHI node defined in this block.  If the phi node contains constant
+// operands, then the blocks corresponding to those operands can be modified to
+// jump directly to the destination instead of going through this block.
+void CondProp::SimplifyPredecessors(BranchInst *BI) {
+  // TODO: We currently only handle the most trival case, where the PHI node has
+  // one use (the branch), and is the only instruction besides the branch and dbg
+  // intrinsics in the block.
+  PHINode *PN = cast<PHINode>(BI->getCondition());
+
+  if (PN->getNumIncomingValues() == 1) {
+    // Eliminate single-entry PHI nodes.
+    FoldSingleEntryPHINodes(PN->getParent());
+    return;
+  }
+  
+  
+  if (!PN->hasOneUse()) return;
+
+  BasicBlock *BB = BI->getParent();
+  if (&*BB->begin() != PN)
+    return;
+  BasicBlock::iterator BBI = BB->begin();
+  BasicBlock::iterator BBE = BB->end();
+  while (BBI != BBE && isa<DbgInfoIntrinsic>(++BBI)) /* empty */;
+  if (&*BBI != BI)
+    return;
+
+  // Ok, we have this really simple case, walk the PHI operands, looking for
+  // constants.  Walk from the end to remove operands from the end when
+  // possible, and to avoid invalidating "i".
+  for (unsigned i = PN->getNumIncomingValues(); i != 0; --i) {
+    Value *InVal = PN->getIncomingValue(i-1);
+    if (!RevectorBlockTo(PN->getIncomingBlock(i-1), InVal, BI))
+      continue;
+
+    ++NumBrThread;
+
+    // If there were two predecessors before this simplification, or if the
+    // PHI node contained all the same value except for the one we just
+    // substituted, the PHI node may be deleted.  Don't iterate through it the
+    // last time.
+    if (BI->getCondition() != PN) return;
+  }
+}
+
+// SimplifyPredecessors(switch) - We know that SI is switch based on a PHI node
+// defined in this block.  If the phi node contains constant operands, then the
+// blocks corresponding to those operands can be modified to jump directly to
+// the destination instead of going through this block.
+void CondProp::SimplifyPredecessors(SwitchInst *SI) {
+  // TODO: We currently only handle the most trival case, where the PHI node has
+  // one use (the branch), and is the only instruction besides the branch and 
+  // dbg intrinsics in the block.
+  PHINode *PN = cast<PHINode>(SI->getCondition());
+  if (!PN->hasOneUse()) return;
+
+  BasicBlock *BB = SI->getParent();
+  if (&*BB->begin() != PN)
+    return;
+  BasicBlock::iterator BBI = BB->begin();
+  BasicBlock::iterator BBE = BB->end();
+  while (BBI != BBE && isa<DbgInfoIntrinsic>(++BBI)) /* empty */;
+  if (&*BBI != SI)
+    return;
+
+  bool RemovedPreds = false;
+
+  // Ok, we have this really simple case, walk the PHI operands, looking for
+  // constants.  Walk from the end to remove operands from the end when
+  // possible, and to avoid invalidating "i".
+  for (unsigned i = PN->getNumIncomingValues(); i != 0; --i)
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(PN->getIncomingValue(i-1))) {
+      // If we have a constant, forward the edge from its current to its
+      // ultimate destination.
+      unsigned DestCase = SI->findCaseValue(CI);
+      RevectorBlockTo(PN->getIncomingBlock(i-1),
+                      SI->getSuccessor(DestCase));
+      ++NumSwThread;
+      RemovedPreds = true;
+
+      // If there were two predecessors before this simplification, or if the
+      // PHI node contained all the same value except for the one we just
+      // substituted, the PHI node may be deleted.  Don't iterate through it the
+      // last time.
+      if (SI->getCondition() != PN) return;
+    }
+}
+
+
+// RevectorBlockTo - Revector the unconditional branch at the end of FromBB to
+// the ToBB block, which is one of the successors of its current successor.
+void CondProp::RevectorBlockTo(BasicBlock *FromBB, BasicBlock *ToBB) {
+  BranchInst *FromBr = cast<BranchInst>(FromBB->getTerminator());
+  assert(FromBr->isUnconditional() && "FromBB should end with uncond br!");
+
+  // Get the old block we are threading through.
+  BasicBlock *OldSucc = FromBr->getSuccessor(0);
+
+  // OldSucc had multiple successors. If ToBB has multiple predecessors, then 
+  // the edge between them would be critical, which we already took care of.
+  // If ToBB has single operand PHI node then take care of it here.
+  FoldSingleEntryPHINodes(ToBB);
+
+  // Update PHI nodes in OldSucc to know that FromBB no longer branches to it.
+  OldSucc->removePredecessor(FromBB);
+
+  // Change FromBr to branch to the new destination.
+  FromBr->setSuccessor(0, ToBB);
+
+  MadeChange = true;
+}
+
+bool CondProp::RevectorBlockTo(BasicBlock *FromBB, Value *Cond, BranchInst *BI){
+  BranchInst *FromBr = cast<BranchInst>(FromBB->getTerminator());
+  if (!FromBr->isUnconditional())
+    return false;
+
+  // Get the old block we are threading through.
+  BasicBlock *OldSucc = FromBr->getSuccessor(0);
+
+  // If the condition is a constant, simply revector the unconditional branch at
+  // the end of FromBB to one of the successors of its current successor.
+  if (ConstantInt *CB = dyn_cast<ConstantInt>(Cond)) {
+    BasicBlock *ToBB = BI->getSuccessor(CB->isZero());
+
+    // OldSucc had multiple successors. If ToBB has multiple predecessors, then 
+    // the edge between them would be critical, which we already took care of.
+    // If ToBB has single operand PHI node then take care of it here.
+    FoldSingleEntryPHINodes(ToBB);
+
+    // Update PHI nodes in OldSucc to know that FromBB no longer branches to it.
+    OldSucc->removePredecessor(FromBB);
+
+    // Change FromBr to branch to the new destination.
+    FromBr->setSuccessor(0, ToBB);
+  } else {
+    BasicBlock *Succ0 = BI->getSuccessor(0);
+    // Do not perform transform if the new destination has PHI nodes. The
+    // transform will add new preds to the PHI's.
+    if (isa<PHINode>(Succ0->begin()))
+      return false;
+
+    BasicBlock *Succ1 = BI->getSuccessor(1);
+    if (isa<PHINode>(Succ1->begin()))
+      return false;
+
+    // Insert the new conditional branch.
+    BranchInst::Create(Succ0, Succ1, Cond, FromBr);
+
+    FoldSingleEntryPHINodes(Succ0);
+    FoldSingleEntryPHINodes(Succ1);
+
+    // Update PHI nodes in OldSucc to know that FromBB no longer branches to it.
+    OldSucc->removePredecessor(FromBB);
+
+    // Delete the old branch.
+    FromBr->eraseFromParent();
+  }
+
+  MadeChange = true;
+  return true;
+}
diff --git a/lib/Transforms/Scalar/ConstantProp.cpp b/lib/Transforms/Scalar/ConstantProp.cpp
new file mode 100644
index 0000000..b933488
--- /dev/null
+++ b/lib/Transforms/Scalar/ConstantProp.cpp
@@ -0,0 +1,90 @@
+//===- ConstantProp.cpp - Code to perform Simple Constant Propagation -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements constant propagation and merging:
+//
+// Specifically, this:
+//   * Converts instructions like "add int 1, 2" into 3
+//
+// Notice that:
+//   * This pass has a habit of making definitions be dead.  It is a good idea
+//     to run a DIE pass sometime after running this pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "constprop"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Constant.h"
+#include "llvm/Instruction.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumInstKilled, "Number of instructions killed");
+
+namespace {
+  struct VISIBILITY_HIDDEN ConstantPropagation : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    ConstantPropagation() : FunctionPass(&ID) {}
+
+    bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+    }
+  };
+}
+
+char ConstantPropagation::ID = 0;
+static RegisterPass<ConstantPropagation>
+X("constprop", "Simple constant propagation");
+
+FunctionPass *llvm::createConstantPropagationPass() {
+  return new ConstantPropagation();
+}
+
+
+bool ConstantPropagation::runOnFunction(Function &F) {
+  // Initialize the worklist to all of the instructions ready to process...
+  std::set<Instruction*> WorkList;
+  for(inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
+      WorkList.insert(&*i);
+  }
+  bool Changed = false;
+
+  while (!WorkList.empty()) {
+    Instruction *I = *WorkList.begin();
+    WorkList.erase(WorkList.begin());    // Get an element from the worklist...
+
+    if (!I->use_empty())                 // Don't muck with dead instructions...
+      if (Constant *C = ConstantFoldInstruction(I)) {
+        // Add all of the users of this instruction to the worklist, they might
+        // be constant propagatable now...
+        for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
+             UI != UE; ++UI)
+          WorkList.insert(cast<Instruction>(*UI));
+
+        // Replace all of the uses of a variable with uses of the constant.
+        I->replaceAllUsesWith(C);
+
+        // Remove the dead instruction.
+        WorkList.erase(I);
+        I->eraseFromParent();
+
+        // We made a change to the function...
+        Changed = true;
+        ++NumInstKilled;
+      }
+  }
+  return Changed;
+}
diff --git a/lib/Transforms/Scalar/DCE.cpp b/lib/Transforms/Scalar/DCE.cpp
new file mode 100644
index 0000000..8bb504c
--- /dev/null
+++ b/lib/Transforms/Scalar/DCE.cpp
@@ -0,0 +1,133 @@
+//===- DCE.cpp - Code to perform dead code elimination --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements dead inst elimination and dead code elimination.
+//
+// Dead Inst Elimination performs a single pass over the function removing
+// instructions that are obviously dead.  Dead Code Elimination is similar, but
+// it rechecks instructions that were used by removed instructions to see if
+// they are newly dead.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "dce"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Instruction.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include <set>
+using namespace llvm;
+
+STATISTIC(DIEEliminated, "Number of insts removed by DIE pass");
+STATISTIC(DCEEliminated, "Number of insts removed");
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  // DeadInstElimination pass implementation
+  //
+  struct VISIBILITY_HIDDEN DeadInstElimination : public BasicBlockPass {
+    static char ID; // Pass identification, replacement for typeid
+    DeadInstElimination() : BasicBlockPass(&ID) {}
+    virtual bool runOnBasicBlock(BasicBlock &BB) {
+      bool Changed = false;
+      for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) {
+        Instruction *Inst = DI++;
+        if (isInstructionTriviallyDead(Inst)) {
+          Inst->eraseFromParent();
+          Changed = true;
+          ++DIEEliminated;
+        }
+      }
+      return Changed;
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+    }
+  };
+}
+
+char DeadInstElimination::ID = 0;
+static RegisterPass<DeadInstElimination>
+X("die", "Dead Instruction Elimination");
+
+Pass *llvm::createDeadInstEliminationPass() {
+  return new DeadInstElimination();
+}
+
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  // DeadCodeElimination pass implementation
+  //
+  struct DCE : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    DCE() : FunctionPass(&ID) {}
+
+    virtual bool runOnFunction(Function &F);
+
+     virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+    }
+ };
+}
+
+char DCE::ID = 0;
+static RegisterPass<DCE> Y("dce", "Dead Code Elimination");
+
+bool DCE::runOnFunction(Function &F) {
+  // Start out with all of the instructions in the worklist...
+  std::vector<Instruction*> WorkList;
+  for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i)
+    WorkList.push_back(&*i);
+
+  // Loop over the worklist finding instructions that are dead.  If they are
+  // dead make them drop all of their uses, making other instructions
+  // potentially dead, and work until the worklist is empty.
+  //
+  bool MadeChange = false;
+  while (!WorkList.empty()) {
+    Instruction *I = WorkList.back();
+    WorkList.pop_back();
+
+    if (isInstructionTriviallyDead(I)) {       // If the instruction is dead.
+      // Loop over all of the values that the instruction uses, if there are
+      // instructions being used, add them to the worklist, because they might
+      // go dead after this one is removed.
+      //
+      for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
+        if (Instruction *Used = dyn_cast<Instruction>(*OI))
+          WorkList.push_back(Used);
+
+      // Remove the instruction.
+      I->eraseFromParent();
+
+      // Remove the instruction from the worklist if it still exists in it.
+      for (std::vector<Instruction*>::iterator WI = WorkList.begin();
+           WI != WorkList.end(); ) {
+        if (*WI == I)
+          WI = WorkList.erase(WI);
+        else
+          ++WI;
+      }
+
+      MadeChange = true;
+      ++DCEEliminated;
+    }
+  }
+  return MadeChange;
+}
+
+FunctionPass *llvm::createDeadCodeEliminationPass() {
+  return new DCE();
+}
+
diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp
new file mode 100644
index 0000000..b923c92
--- /dev/null
+++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -0,0 +1,461 @@
+//===- DeadStoreElimination.cpp - Fast Dead Store Elimination -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a trivial dead store elimination that only considers
+// basic-block local redundant stores.
+//
+// FIXME: This should eventually be extended to be a post-dominator tree
+// traversal.  Doing so would be pretty trivial.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "dse"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/Compiler.h"
+using namespace llvm;
+
+STATISTIC(NumFastStores, "Number of stores deleted");
+STATISTIC(NumFastOther , "Number of other instrs removed");
+
+namespace {
+  struct VISIBILITY_HIDDEN DSE : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    DSE() : FunctionPass(&ID) {}
+
+    virtual bool runOnFunction(Function &F) {
+      bool Changed = false;
+      for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
+        Changed |= runOnBasicBlock(*I);
+      return Changed;
+    }
+    
+    bool runOnBasicBlock(BasicBlock &BB);
+    bool handleFreeWithNonTrivialDependency(FreeInst *F, MemDepResult Dep);
+    bool handleEndBlock(BasicBlock &BB);
+    bool RemoveUndeadPointers(Value* Ptr, uint64_t killPointerSize,
+                              BasicBlock::iterator& BBI,
+                              SmallPtrSet<Value*, 64>& deadPointers);
+    void DeleteDeadInstruction(Instruction *I,
+                               SmallPtrSet<Value*, 64> *deadPointers = 0);
+    
+
+    // getAnalysisUsage - We require post dominance frontiers (aka Control
+    // Dependence Graph)
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<DominatorTree>();
+      AU.addRequired<TargetData>();
+      AU.addRequired<AliasAnalysis>();
+      AU.addRequired<MemoryDependenceAnalysis>();
+      AU.addPreserved<DominatorTree>();
+      AU.addPreserved<AliasAnalysis>();
+      AU.addPreserved<MemoryDependenceAnalysis>();
+    }
+  };
+}
+
+char DSE::ID = 0;
+static RegisterPass<DSE> X("dse", "Dead Store Elimination");
+
+FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); }
+
+bool DSE::runOnBasicBlock(BasicBlock &BB) {
+  MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
+  TargetData &TD = getAnalysis<TargetData>();  
+
+  bool MadeChange = false;
+  
+  // Do a top-down walk on the BB
+  for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) {
+    Instruction *Inst = BBI++;
+    
+    // If we find a store or a free, get it's memory dependence.
+    if (!isa<StoreInst>(Inst) && !isa<FreeInst>(Inst))
+      continue;
+    
+    // Don't molest volatile stores or do queries that will return "clobber".
+    if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+      if (SI->isVolatile())
+        continue;
+
+    MemDepResult InstDep = MD.getDependency(Inst);
+    
+    // Ignore non-local stores.
+    // FIXME: cross-block DSE would be fun. :)
+    if (InstDep.isNonLocal()) continue;
+  
+    // Handle frees whose dependencies are non-trivial.
+    if (FreeInst *FI = dyn_cast<FreeInst>(Inst)) {
+      MadeChange |= handleFreeWithNonTrivialDependency(FI, InstDep);
+      continue;
+    }
+    
+    StoreInst *SI = cast<StoreInst>(Inst);
+    
+    // If not a definite must-alias dependency, ignore it.
+    if (!InstDep.isDef())
+      continue;
+    
+    // If this is a store-store dependence, then the previous store is dead so
+    // long as this store is at least as big as it.
+    if (StoreInst *DepStore = dyn_cast<StoreInst>(InstDep.getInst()))
+      if (TD.getTypeStoreSize(DepStore->getOperand(0)->getType()) <=
+          TD.getTypeStoreSize(SI->getOperand(0)->getType())) {
+        // Delete the store and now-dead instructions that feed it.
+        DeleteDeadInstruction(DepStore);
+        NumFastStores++;
+        MadeChange = true;
+        
+        if (BBI != BB.begin())
+          --BBI;
+        continue;
+      }
+    
+    // If we're storing the same value back to a pointer that we just
+    // loaded from, then the store can be removed.
+    if (LoadInst *DepLoad = dyn_cast<LoadInst>(InstDep.getInst())) {
+      if (SI->getPointerOperand() == DepLoad->getPointerOperand() &&
+          SI->getOperand(0) == DepLoad) {
+        DeleteDeadInstruction(SI);
+        if (BBI != BB.begin())
+          --BBI;
+        NumFastStores++;
+        MadeChange = true;
+        continue;
+      }
+    }
+  }
+  
+  // If this block ends in a return, unwind, or unreachable, all allocas are
+  // dead at its end, which means stores to them are also dead.
+  if (BB.getTerminator()->getNumSuccessors() == 0)
+    MadeChange |= handleEndBlock(BB);
+  
+  return MadeChange;
+}
+
+/// handleFreeWithNonTrivialDependency - Handle frees of entire structures whose
+/// dependency is a store to a field of that structure.
+bool DSE::handleFreeWithNonTrivialDependency(FreeInst *F, MemDepResult Dep) {
+  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+  
+  StoreInst *Dependency = dyn_cast_or_null<StoreInst>(Dep.getInst());
+  if (!Dependency || Dependency->isVolatile())
+    return false;
+  
+  Value *DepPointer = Dependency->getPointerOperand()->getUnderlyingObject();
+
+  // Check for aliasing.
+  if (AA.alias(F->getPointerOperand(), 1, DepPointer, 1) !=
+         AliasAnalysis::MustAlias)
+    return false;
+  
+  // DCE instructions only used to calculate that store
+  DeleteDeadInstruction(Dependency);
+  NumFastStores++;
+  return true;
+}
+
+/// handleEndBlock - Remove dead stores to stack-allocated locations in the
+/// function end block.  Ex:
+/// %A = alloca i32
+/// ...
+/// store i32 1, i32* %A
+/// ret void
+bool DSE::handleEndBlock(BasicBlock &BB) {
+  TargetData &TD = getAnalysis<TargetData>();
+  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+  
+  bool MadeChange = false;
+  
+  // Pointers alloca'd in this function are dead in the end block
+  SmallPtrSet<Value*, 64> deadPointers;
+  
+  // Find all of the alloca'd pointers in the entry block.
+  BasicBlock *Entry = BB.getParent()->begin();
+  for (BasicBlock::iterator I = Entry->begin(), E = Entry->end(); I != E; ++I)
+    if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
+      deadPointers.insert(AI);
+  
+  // Treat byval arguments the same, stores to them are dead at the end of the
+  // function.
+  for (Function::arg_iterator AI = BB.getParent()->arg_begin(),
+       AE = BB.getParent()->arg_end(); AI != AE; ++AI)
+    if (AI->hasByValAttr())
+      deadPointers.insert(AI);
+  
+  // Scan the basic block backwards
+  for (BasicBlock::iterator BBI = BB.end(); BBI != BB.begin(); ){
+    --BBI;
+    
+    // If we find a store whose pointer is dead.
+    if (StoreInst* S = dyn_cast<StoreInst>(BBI)) {
+      if (!S->isVolatile()) {
+        // See through pointer-to-pointer bitcasts
+        Value* pointerOperand = S->getPointerOperand()->getUnderlyingObject();
+
+        // Alloca'd pointers or byval arguments (which are functionally like
+        // alloca's) are valid candidates for removal.
+        if (deadPointers.count(pointerOperand)) {
+          // DCE instructions only used to calculate that store.
+          BBI++;
+          DeleteDeadInstruction(S, &deadPointers);
+          NumFastStores++;
+          MadeChange = true;
+        }
+      }
+      
+      continue;
+    }
+    
+    // We can also remove memcpy's to local variables at the end of a function.
+    if (MemCpyInst *M = dyn_cast<MemCpyInst>(BBI)) {
+      Value *dest = M->getDest()->getUnderlyingObject();
+
+      if (deadPointers.count(dest)) {
+        BBI++;
+        DeleteDeadInstruction(M, &deadPointers);
+        NumFastOther++;
+        MadeChange = true;
+        continue;
+      }
+      
+      // Because a memcpy is also a load, we can't skip it if we didn't remove
+      // it.
+    }
+    
+    Value* killPointer = 0;
+    uint64_t killPointerSize = ~0UL;
+    
+    // If we encounter a use of the pointer, it is no longer considered dead
+    if (LoadInst *L = dyn_cast<LoadInst>(BBI)) {
+      // However, if this load is unused and not volatile, we can go ahead and
+      // remove it, and not have to worry about it making our pointer undead!
+      if (L->use_empty() && !L->isVolatile()) {
+        BBI++;
+        DeleteDeadInstruction(L, &deadPointers);
+        NumFastOther++;
+        MadeChange = true;
+        continue;
+      }
+      
+      killPointer = L->getPointerOperand();
+    } else if (VAArgInst* V = dyn_cast<VAArgInst>(BBI)) {
+      killPointer = V->getOperand(0);
+    } else if (isa<MemCpyInst>(BBI) &&
+               isa<ConstantInt>(cast<MemCpyInst>(BBI)->getLength())) {
+      killPointer = cast<MemCpyInst>(BBI)->getSource();
+      killPointerSize = cast<ConstantInt>(
+                            cast<MemCpyInst>(BBI)->getLength())->getZExtValue();
+    } else if (AllocaInst* A = dyn_cast<AllocaInst>(BBI)) {
+      deadPointers.erase(A);
+      
+      // Dead alloca's can be DCE'd when we reach them
+      if (A->use_empty()) {
+        BBI++;
+        DeleteDeadInstruction(A, &deadPointers);
+        NumFastOther++;
+        MadeChange = true;
+      }
+      
+      continue;
+    } else if (CallSite::get(BBI).getInstruction() != 0) {
+      // If this call does not access memory, it can't
+      // be undeadifying any of our pointers.
+      CallSite CS = CallSite::get(BBI);
+      if (AA.doesNotAccessMemory(CS))
+        continue;
+      
+      unsigned modRef = 0;
+      unsigned other = 0;
+      
+      // Remove any pointers made undead by the call from the dead set
+      std::vector<Value*> dead;
+      for (SmallPtrSet<Value*, 64>::iterator I = deadPointers.begin(),
+           E = deadPointers.end(); I != E; ++I) {
+        // HACK: if we detect that our AA is imprecise, it's not
+        // worth it to scan the rest of the deadPointers set.  Just
+        // assume that the AA will return ModRef for everything, and
+        // go ahead and bail.
+        if (modRef >= 16 && other == 0) {
+          deadPointers.clear();
+          return MadeChange;
+        }
+
+        // Get size information for the alloca
+        unsigned pointerSize = ~0U;
+        if (AllocaInst* A = dyn_cast<AllocaInst>(*I)) {
+          if (ConstantInt* C = dyn_cast<ConstantInt>(A->getArraySize()))
+            pointerSize = C->getZExtValue() *
+                          TD.getTypeAllocSize(A->getAllocatedType());
+        } else {
+          const PointerType* PT = cast<PointerType>(
+                                                 cast<Argument>(*I)->getType());
+          pointerSize = TD.getTypeAllocSize(PT->getElementType());
+        }
+
+        // See if the call site touches it
+        AliasAnalysis::ModRefResult A = AA.getModRefInfo(CS, *I, pointerSize);
+        
+        if (A == AliasAnalysis::ModRef)
+          modRef++;
+        else
+          other++;
+        
+        if (A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref)
+          dead.push_back(*I);
+      }
+
+      for (std::vector<Value*>::iterator I = dead.begin(), E = dead.end();
+           I != E; ++I)
+        deadPointers.erase(*I);
+      
+      continue;
+    } else if (isInstructionTriviallyDead(BBI)) {
+      // For any non-memory-affecting non-terminators, DCE them as we reach them
+      Instruction *Inst = BBI;
+      BBI++;
+      DeleteDeadInstruction(Inst, &deadPointers);
+      NumFastOther++;
+      MadeChange = true;
+      continue;
+    }
+    
+    if (!killPointer)
+      continue;
+
+    killPointer = killPointer->getUnderlyingObject();
+
+    // Deal with undead pointers
+    MadeChange |= RemoveUndeadPointers(killPointer, killPointerSize, BBI,
+                                       deadPointers);
+  }
+  
+  return MadeChange;
+}
+
+/// RemoveUndeadPointers - check for uses of a pointer that make it
+/// undead when scanning for dead stores to alloca's.
+bool DSE::RemoveUndeadPointers(Value* killPointer, uint64_t killPointerSize,
+                               BasicBlock::iterator &BBI,
+                               SmallPtrSet<Value*, 64>& deadPointers) {
+  TargetData &TD = getAnalysis<TargetData>();
+  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+                                  
+  // If the kill pointer can be easily reduced to an alloca,
+  // don't bother doing extraneous AA queries.
+  if (deadPointers.count(killPointer)) {
+    deadPointers.erase(killPointer);
+    return false;
+  }
+  
+  // A global can't be in the dead pointer set.
+  if (isa<GlobalValue>(killPointer))
+    return false;
+  
+  bool MadeChange = false;
+  
+  SmallVector<Value*, 16> undead;
+    
+  for (SmallPtrSet<Value*, 64>::iterator I = deadPointers.begin(),
+      E = deadPointers.end(); I != E; ++I) {
+    // Get size information for the alloca.
+    unsigned pointerSize = ~0U;
+    if (AllocaInst* A = dyn_cast<AllocaInst>(*I)) {
+      if (ConstantInt* C = dyn_cast<ConstantInt>(A->getArraySize()))
+        pointerSize = C->getZExtValue() *
+                      TD.getTypeAllocSize(A->getAllocatedType());
+    } else {
+      const PointerType* PT = cast<PointerType>(cast<Argument>(*I)->getType());
+      pointerSize = TD.getTypeAllocSize(PT->getElementType());
+    }
+
+    // See if this pointer could alias it
+    AliasAnalysis::AliasResult A = AA.alias(*I, pointerSize,
+                                            killPointer, killPointerSize);
+
+    // If it must-alias and a store, we can delete it
+    if (isa<StoreInst>(BBI) && A == AliasAnalysis::MustAlias) {
+      StoreInst* S = cast<StoreInst>(BBI);
+
+      // Remove it!
+      BBI++;
+      DeleteDeadInstruction(S, &deadPointers);
+      NumFastStores++;
+      MadeChange = true;
+
+      continue;
+
+      // Otherwise, it is undead
+    } else if (A != AliasAnalysis::NoAlias)
+      undead.push_back(*I);
+  }
+
+  for (SmallVector<Value*, 16>::iterator I = undead.begin(), E = undead.end();
+       I != E; ++I)
+      deadPointers.erase(*I);
+  
+  return MadeChange;
+}
+
+/// DeleteDeadInstruction - Delete this instruction.  Before we do, go through
+/// and zero out all the operands of this instruction.  If any of them become
+/// dead, delete them and the computation tree that feeds them.
+///
+/// If ValueSet is non-null, remove any deleted instructions from it as well.
+///
+void DSE::DeleteDeadInstruction(Instruction *I,
+                                SmallPtrSet<Value*, 64> *ValueSet) {
+  SmallVector<Instruction*, 32> NowDeadInsts;
+  
+  NowDeadInsts.push_back(I);
+  --NumFastOther;
+
+  // Before we touch this instruction, remove it from memdep!
+  MemoryDependenceAnalysis &MDA = getAnalysis<MemoryDependenceAnalysis>();
+  while (!NowDeadInsts.empty()) {
+    Instruction *DeadInst = NowDeadInsts.back();
+    NowDeadInsts.pop_back();
+    
+    ++NumFastOther;
+    
+    // This instruction is dead, zap it, in stages.  Start by removing it from
+    // MemDep, which needs to know the operands and needs it to be in the
+    // function.
+    MDA.removeInstruction(DeadInst);
+    
+    for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) {
+      Value *Op = DeadInst->getOperand(op);
+      DeadInst->setOperand(op, 0);
+      
+      // If this operand just became dead, add it to the NowDeadInsts list.
+      if (!Op->use_empty()) continue;
+      
+      if (Instruction *OpI = dyn_cast<Instruction>(Op))
+        if (isInstructionTriviallyDead(OpI))
+          NowDeadInsts.push_back(OpI);
+    }
+    
+    DeadInst->eraseFromParent();
+    
+    if (ValueSet) ValueSet->erase(DeadInst);
+  }
+}
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
new file mode 100644
index 0000000..733dfa9
--- /dev/null
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -0,0 +1,1738 @@
+//===- GVN.cpp - Eliminate redundant values and loads ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs global value numbering to eliminate fully redundant
+// instructions.  It also performs simple dead load elimination.
+//
+// Note that this pass does the value numbering itself; it does not use the
+// ValueNumbering analysis passes.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "gvn"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Value.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <cstdio>
+using namespace llvm;
+
+STATISTIC(NumGVNInstr,  "Number of instructions deleted");
+STATISTIC(NumGVNLoad,   "Number of loads deleted");
+STATISTIC(NumGVNPRE,    "Number of instructions PRE'd");
+STATISTIC(NumGVNBlocks, "Number of blocks merged");
+STATISTIC(NumPRELoad,   "Number of loads PRE'd");
+
+static cl::opt<bool> EnablePRE("enable-pre",
+                               cl::init(true), cl::Hidden);
+cl::opt<bool> EnableLoadPRE("enable-load-pre", cl::init(true));
+
+//===----------------------------------------------------------------------===//
+//                         ValueTable Class
+//===----------------------------------------------------------------------===//
+
+/// This class holds the mapping between values and value numbers.  It is used
+/// as an efficient mechanism to determine the expression-wise equivalence of
+/// two values.
+namespace {
+  struct VISIBILITY_HIDDEN Expression {
+    enum ExpressionOpcode { ADD, SUB, MUL, UDIV, SDIV, FDIV, UREM, SREM, 
+                            FREM, SHL, LSHR, ASHR, AND, OR, XOR, ICMPEQ, 
+                            ICMPNE, ICMPUGT, ICMPUGE, ICMPULT, ICMPULE, 
+                            ICMPSGT, ICMPSGE, ICMPSLT, ICMPSLE, FCMPOEQ, 
+                            FCMPOGT, FCMPOGE, FCMPOLT, FCMPOLE, FCMPONE, 
+                            FCMPORD, FCMPUNO, FCMPUEQ, FCMPUGT, FCMPUGE, 
+                            FCMPULT, FCMPULE, FCMPUNE, EXTRACT, INSERT,
+                            SHUFFLE, SELECT, TRUNC, ZEXT, SEXT, FPTOUI,
+                            FPTOSI, UITOFP, SITOFP, FPTRUNC, FPEXT, 
+                            PTRTOINT, INTTOPTR, BITCAST, GEP, CALL, CONSTANT,
+                            EMPTY, TOMBSTONE };
+
+    ExpressionOpcode opcode;
+    const Type* type;
+    uint32_t firstVN;
+    uint32_t secondVN;
+    uint32_t thirdVN;
+    SmallVector<uint32_t, 4> varargs;
+    Value* function;
+  
+    Expression() { }
+    Expression(ExpressionOpcode o) : opcode(o) { }
+  
+    bool operator==(const Expression &other) const {
+      if (opcode != other.opcode)
+        return false;
+      else if (opcode == EMPTY || opcode == TOMBSTONE)
+        return true;
+      else if (type != other.type)
+        return false;
+      else if (function != other.function)
+        return false;
+      else if (firstVN != other.firstVN)
+        return false;
+      else if (secondVN != other.secondVN)
+        return false;
+      else if (thirdVN != other.thirdVN)
+        return false;
+      else {
+        if (varargs.size() != other.varargs.size())
+          return false;
+      
+        for (size_t i = 0; i < varargs.size(); ++i)
+          if (varargs[i] != other.varargs[i])
+            return false;
+    
+        return true;
+      }
+    }
+  
+    bool operator!=(const Expression &other) const {
+      return !(*this == other);
+    }
+  };
+  
+  class VISIBILITY_HIDDEN ValueTable {
+    private:
+      DenseMap<Value*, uint32_t> valueNumbering;
+      DenseMap<Expression, uint32_t> expressionNumbering;
+      AliasAnalysis* AA;
+      MemoryDependenceAnalysis* MD;
+      DominatorTree* DT;
+  
+      uint32_t nextValueNumber;
+    
+      Expression::ExpressionOpcode getOpcode(BinaryOperator* BO);
+      Expression::ExpressionOpcode getOpcode(CmpInst* C);
+      Expression::ExpressionOpcode getOpcode(CastInst* C);
+      Expression create_expression(BinaryOperator* BO);
+      Expression create_expression(CmpInst* C);
+      Expression create_expression(ShuffleVectorInst* V);
+      Expression create_expression(ExtractElementInst* C);
+      Expression create_expression(InsertElementInst* V);
+      Expression create_expression(SelectInst* V);
+      Expression create_expression(CastInst* C);
+      Expression create_expression(GetElementPtrInst* G);
+      Expression create_expression(CallInst* C);
+      Expression create_expression(Constant* C);
+    public:
+      ValueTable() : nextValueNumber(1) { }
+      uint32_t lookup_or_add(Value* V);
+      uint32_t lookup(Value* V) const;
+      void add(Value* V, uint32_t num);
+      void clear();
+      void erase(Value* v);
+      unsigned size();
+      void setAliasAnalysis(AliasAnalysis* A) { AA = A; }
+      AliasAnalysis *getAliasAnalysis() const { return AA; }
+      void setMemDep(MemoryDependenceAnalysis* M) { MD = M; }
+      void setDomTree(DominatorTree* D) { DT = D; }
+      uint32_t getNextUnusedValueNumber() { return nextValueNumber; }
+      void verifyRemoved(const Value *) const;
+  };
+}
+
+namespace llvm {
+template <> struct DenseMapInfo<Expression> {
+  static inline Expression getEmptyKey() {
+    return Expression(Expression::EMPTY);
+  }
+  
+  static inline Expression getTombstoneKey() {
+    return Expression(Expression::TOMBSTONE);
+  }
+  
+  static unsigned getHashValue(const Expression e) {
+    unsigned hash = e.opcode;
+    
+    hash = e.firstVN + hash * 37;
+    hash = e.secondVN + hash * 37;
+    hash = e.thirdVN + hash * 37;
+    
+    hash = ((unsigned)((uintptr_t)e.type >> 4) ^
+            (unsigned)((uintptr_t)e.type >> 9)) +
+           hash * 37;
+    
+    for (SmallVector<uint32_t, 4>::const_iterator I = e.varargs.begin(),
+         E = e.varargs.end(); I != E; ++I)
+      hash = *I + hash * 37;
+    
+    hash = ((unsigned)((uintptr_t)e.function >> 4) ^
+            (unsigned)((uintptr_t)e.function >> 9)) +
+           hash * 37;
+    
+    return hash;
+  }
+  static bool isEqual(const Expression &LHS, const Expression &RHS) {
+    return LHS == RHS;
+  }
+  static bool isPod() { return true; }
+};
+}
+
+//===----------------------------------------------------------------------===//
+//                     ValueTable Internal Functions
+//===----------------------------------------------------------------------===//
+Expression::ExpressionOpcode ValueTable::getOpcode(BinaryOperator* BO) {
+  switch(BO->getOpcode()) {
+  default: // THIS SHOULD NEVER HAPPEN
+    assert(0 && "Binary operator with unknown opcode?");
+  case Instruction::Add:  return Expression::ADD;
+  case Instruction::Sub:  return Expression::SUB;
+  case Instruction::Mul:  return Expression::MUL;
+  case Instruction::UDiv: return Expression::UDIV;
+  case Instruction::SDiv: return Expression::SDIV;
+  case Instruction::FDiv: return Expression::FDIV;
+  case Instruction::URem: return Expression::UREM;
+  case Instruction::SRem: return Expression::SREM;
+  case Instruction::FRem: return Expression::FREM;
+  case Instruction::Shl:  return Expression::SHL;
+  case Instruction::LShr: return Expression::LSHR;
+  case Instruction::AShr: return Expression::ASHR;
+  case Instruction::And:  return Expression::AND;
+  case Instruction::Or:   return Expression::OR;
+  case Instruction::Xor:  return Expression::XOR;
+  }
+}
+
+Expression::ExpressionOpcode ValueTable::getOpcode(CmpInst* C) {
+  if (isa<ICmpInst>(C) || isa<VICmpInst>(C)) {
+    switch (C->getPredicate()) {
+    default:  // THIS SHOULD NEVER HAPPEN
+      assert(0 && "Comparison with unknown predicate?");
+    case ICmpInst::ICMP_EQ:  return Expression::ICMPEQ;
+    case ICmpInst::ICMP_NE:  return Expression::ICMPNE;
+    case ICmpInst::ICMP_UGT: return Expression::ICMPUGT;
+    case ICmpInst::ICMP_UGE: return Expression::ICMPUGE;
+    case ICmpInst::ICMP_ULT: return Expression::ICMPULT;
+    case ICmpInst::ICMP_ULE: return Expression::ICMPULE;
+    case ICmpInst::ICMP_SGT: return Expression::ICMPSGT;
+    case ICmpInst::ICMP_SGE: return Expression::ICMPSGE;
+    case ICmpInst::ICMP_SLT: return Expression::ICMPSLT;
+    case ICmpInst::ICMP_SLE: return Expression::ICMPSLE;
+    }
+  }
+  assert((isa<FCmpInst>(C) || isa<VFCmpInst>(C)) && "Unknown compare");
+  switch (C->getPredicate()) {
+  default: // THIS SHOULD NEVER HAPPEN
+    assert(0 && "Comparison with unknown predicate?");
+  case FCmpInst::FCMP_OEQ: return Expression::FCMPOEQ;
+  case FCmpInst::FCMP_OGT: return Expression::FCMPOGT;
+  case FCmpInst::FCMP_OGE: return Expression::FCMPOGE;
+  case FCmpInst::FCMP_OLT: return Expression::FCMPOLT;
+  case FCmpInst::FCMP_OLE: return Expression::FCMPOLE;
+  case FCmpInst::FCMP_ONE: return Expression::FCMPONE;
+  case FCmpInst::FCMP_ORD: return Expression::FCMPORD;
+  case FCmpInst::FCMP_UNO: return Expression::FCMPUNO;
+  case FCmpInst::FCMP_UEQ: return Expression::FCMPUEQ;
+  case FCmpInst::FCMP_UGT: return Expression::FCMPUGT;
+  case FCmpInst::FCMP_UGE: return Expression::FCMPUGE;
+  case FCmpInst::FCMP_ULT: return Expression::FCMPULT;
+  case FCmpInst::FCMP_ULE: return Expression::FCMPULE;
+  case FCmpInst::FCMP_UNE: return Expression::FCMPUNE;
+  }
+}
+
+Expression::ExpressionOpcode ValueTable::getOpcode(CastInst* C) {
+  switch(C->getOpcode()) {
+  default: // THIS SHOULD NEVER HAPPEN
+    assert(0 && "Cast operator with unknown opcode?");
+  case Instruction::Trunc:    return Expression::TRUNC;
+  case Instruction::ZExt:     return Expression::ZEXT;
+  case Instruction::SExt:     return Expression::SEXT;
+  case Instruction::FPToUI:   return Expression::FPTOUI;
+  case Instruction::FPToSI:   return Expression::FPTOSI;
+  case Instruction::UIToFP:   return Expression::UITOFP;
+  case Instruction::SIToFP:   return Expression::SITOFP;
+  case Instruction::FPTrunc:  return Expression::FPTRUNC;
+  case Instruction::FPExt:    return Expression::FPEXT;
+  case Instruction::PtrToInt: return Expression::PTRTOINT;
+  case Instruction::IntToPtr: return Expression::INTTOPTR;
+  case Instruction::BitCast:  return Expression::BITCAST;
+  }
+}
+
+Expression ValueTable::create_expression(CallInst* C) {
+  Expression e;
+  
+  e.type = C->getType();
+  e.firstVN = 0;
+  e.secondVN = 0;
+  e.thirdVN = 0;
+  e.function = C->getCalledFunction();
+  e.opcode = Expression::CALL;
+  
+  for (CallInst::op_iterator I = C->op_begin()+1, E = C->op_end();
+       I != E; ++I)
+    e.varargs.push_back(lookup_or_add(*I));
+  
+  return e;
+}
+
+Expression ValueTable::create_expression(BinaryOperator* BO) {
+  Expression e;
+    
+  e.firstVN = lookup_or_add(BO->getOperand(0));
+  e.secondVN = lookup_or_add(BO->getOperand(1));
+  e.thirdVN = 0;
+  e.function = 0;
+  e.type = BO->getType();
+  e.opcode = getOpcode(BO);
+  
+  return e;
+}
+
+Expression ValueTable::create_expression(CmpInst* C) {
+  Expression e;
+    
+  e.firstVN = lookup_or_add(C->getOperand(0));
+  e.secondVN = lookup_or_add(C->getOperand(1));
+  e.thirdVN = 0;
+  e.function = 0;
+  e.type = C->getType();
+  e.opcode = getOpcode(C);
+  
+  return e;
+}
+
+Expression ValueTable::create_expression(CastInst* C) {
+  Expression e;
+    
+  e.firstVN = lookup_or_add(C->getOperand(0));
+  e.secondVN = 0;
+  e.thirdVN = 0;
+  e.function = 0;
+  e.type = C->getType();
+  e.opcode = getOpcode(C);
+  
+  return e;
+}
+
+Expression ValueTable::create_expression(ShuffleVectorInst* S) {
+  Expression e;
+    
+  e.firstVN = lookup_or_add(S->getOperand(0));
+  e.secondVN = lookup_or_add(S->getOperand(1));
+  e.thirdVN = lookup_or_add(S->getOperand(2));
+  e.function = 0;
+  e.type = S->getType();
+  e.opcode = Expression::SHUFFLE;
+  
+  return e;
+}
+
+Expression ValueTable::create_expression(ExtractElementInst* E) {
+  Expression e;
+    
+  e.firstVN = lookup_or_add(E->getOperand(0));
+  e.secondVN = lookup_or_add(E->getOperand(1));
+  e.thirdVN = 0;
+  e.function = 0;
+  e.type = E->getType();
+  e.opcode = Expression::EXTRACT;
+  
+  return e;
+}
+
+Expression ValueTable::create_expression(InsertElementInst* I) {
+  Expression e;
+    
+  e.firstVN = lookup_or_add(I->getOperand(0));
+  e.secondVN = lookup_or_add(I->getOperand(1));
+  e.thirdVN = lookup_or_add(I->getOperand(2));
+  e.function = 0;
+  e.type = I->getType();
+  e.opcode = Expression::INSERT;
+  
+  return e;
+}
+
+Expression ValueTable::create_expression(SelectInst* I) {
+  Expression e;
+    
+  e.firstVN = lookup_or_add(I->getCondition());
+  e.secondVN = lookup_or_add(I->getTrueValue());
+  e.thirdVN = lookup_or_add(I->getFalseValue());
+  e.function = 0;
+  e.type = I->getType();
+  e.opcode = Expression::SELECT;
+  
+  return e;
+}
+
+Expression ValueTable::create_expression(GetElementPtrInst* G) {
+  Expression e;
+  
+  e.firstVN = lookup_or_add(G->getPointerOperand());
+  e.secondVN = 0;
+  e.thirdVN = 0;
+  e.function = 0;
+  e.type = G->getType();
+  e.opcode = Expression::GEP;
+  
+  for (GetElementPtrInst::op_iterator I = G->idx_begin(), E = G->idx_end();
+       I != E; ++I)
+    e.varargs.push_back(lookup_or_add(*I));
+  
+  return e;
+}
+
+//===----------------------------------------------------------------------===//
+//                     ValueTable External Functions
+//===----------------------------------------------------------------------===//
+
+/// add - Insert a value into the table with a specified value number.
+void ValueTable::add(Value* V, uint32_t num) {
+  valueNumbering.insert(std::make_pair(V, num));
+}
+
+/// lookup_or_add - Returns the value number for the specified value, assigning
+/// it a new number if it did not have one before.
+uint32_t ValueTable::lookup_or_add(Value* V) {
+  DenseMap<Value*, uint32_t>::iterator VI = valueNumbering.find(V);
+  if (VI != valueNumbering.end())
+    return VI->second;
+  
+  if (CallInst* C = dyn_cast<CallInst>(V)) {
+    if (AA->doesNotAccessMemory(C)) {
+      Expression e = create_expression(C);
+    
+      DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+      if (EI != expressionNumbering.end()) {
+        valueNumbering.insert(std::make_pair(V, EI->second));
+        return EI->second;
+      } else {
+        expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+        valueNumbering.insert(std::make_pair(V, nextValueNumber));
+      
+        return nextValueNumber++;
+      }
+    } else if (AA->onlyReadsMemory(C)) {
+      Expression e = create_expression(C);
+      
+      if (expressionNumbering.find(e) == expressionNumbering.end()) {
+        expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+        valueNumbering.insert(std::make_pair(V, nextValueNumber));
+        return nextValueNumber++;
+      }
+      
+      MemDepResult local_dep = MD->getDependency(C);
+      
+      if (!local_dep.isDef() && !local_dep.isNonLocal()) {
+        valueNumbering.insert(std::make_pair(V, nextValueNumber));
+        return nextValueNumber++;
+      }
+
+      if (local_dep.isDef()) {
+        CallInst* local_cdep = cast<CallInst>(local_dep.getInst());
+        
+        if (local_cdep->getNumOperands() != C->getNumOperands()) {
+          valueNumbering.insert(std::make_pair(V, nextValueNumber));
+          return nextValueNumber++;
+        }
+          
+        for (unsigned i = 1; i < C->getNumOperands(); ++i) {
+          uint32_t c_vn = lookup_or_add(C->getOperand(i));
+          uint32_t cd_vn = lookup_or_add(local_cdep->getOperand(i));
+          if (c_vn != cd_vn) {
+            valueNumbering.insert(std::make_pair(V, nextValueNumber));
+            return nextValueNumber++;
+          }
+        }
+      
+        uint32_t v = lookup_or_add(local_cdep);
+        valueNumbering.insert(std::make_pair(V, v));
+        return v;
+      }
+
+      // Non-local case.
+      const MemoryDependenceAnalysis::NonLocalDepInfo &deps = 
+        MD->getNonLocalCallDependency(CallSite(C));
+      // FIXME: call/call dependencies for readonly calls should return def, not
+      // clobber!  Move the checking logic to MemDep!
+      CallInst* cdep = 0;
+      
+      // Check to see if we have a single dominating call instruction that is
+      // identical to C.
+      for (unsigned i = 0, e = deps.size(); i != e; ++i) {
+        const MemoryDependenceAnalysis::NonLocalDepEntry *I = &deps[i];
+        // Ignore non-local dependencies.
+        if (I->second.isNonLocal())
+          continue;
+
+        // We don't handle non-depedencies.  If we already have a call, reject
+        // instruction dependencies.
+        if (I->second.isClobber() || cdep != 0) {
+          cdep = 0;
+          break;
+        }
+        
+        CallInst *NonLocalDepCall = dyn_cast<CallInst>(I->second.getInst());
+        // FIXME: All duplicated with non-local case.
+        if (NonLocalDepCall && DT->properlyDominates(I->first, C->getParent())){
+          cdep = NonLocalDepCall;
+          continue;
+        }
+        
+        cdep = 0;
+        break;
+      }
+      
+      if (!cdep) {
+        valueNumbering.insert(std::make_pair(V, nextValueNumber));
+        return nextValueNumber++;
+      }
+      
+      if (cdep->getNumOperands() != C->getNumOperands()) {
+        valueNumbering.insert(std::make_pair(V, nextValueNumber));
+        return nextValueNumber++;
+      }
+      for (unsigned i = 1; i < C->getNumOperands(); ++i) {
+        uint32_t c_vn = lookup_or_add(C->getOperand(i));
+        uint32_t cd_vn = lookup_or_add(cdep->getOperand(i));
+        if (c_vn != cd_vn) {
+          valueNumbering.insert(std::make_pair(V, nextValueNumber));
+          return nextValueNumber++;
+        }
+      }
+      
+      uint32_t v = lookup_or_add(cdep);
+      valueNumbering.insert(std::make_pair(V, v));
+      return v;
+      
+    } else {
+      valueNumbering.insert(std::make_pair(V, nextValueNumber));
+      return nextValueNumber++;
+    }
+  } else if (BinaryOperator* BO = dyn_cast<BinaryOperator>(V)) {
+    Expression e = create_expression(BO);
+    
+    DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+    if (EI != expressionNumbering.end()) {
+      valueNumbering.insert(std::make_pair(V, EI->second));
+      return EI->second;
+    } else {
+      expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+      valueNumbering.insert(std::make_pair(V, nextValueNumber));
+      
+      return nextValueNumber++;
+    }
+  } else if (CmpInst* C = dyn_cast<CmpInst>(V)) {
+    Expression e = create_expression(C);
+    
+    DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+    if (EI != expressionNumbering.end()) {
+      valueNumbering.insert(std::make_pair(V, EI->second));
+      return EI->second;
+    } else {
+      expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+      valueNumbering.insert(std::make_pair(V, nextValueNumber));
+      
+      return nextValueNumber++;
+    }
+  } else if (ShuffleVectorInst* U = dyn_cast<ShuffleVectorInst>(V)) {
+    Expression e = create_expression(U);
+    
+    DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+    if (EI != expressionNumbering.end()) {
+      valueNumbering.insert(std::make_pair(V, EI->second));
+      return EI->second;
+    } else {
+      expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+      valueNumbering.insert(std::make_pair(V, nextValueNumber));
+      
+      return nextValueNumber++;
+    }
+  } else if (ExtractElementInst* U = dyn_cast<ExtractElementInst>(V)) {
+    Expression e = create_expression(U);
+    
+    DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+    if (EI != expressionNumbering.end()) {
+      valueNumbering.insert(std::make_pair(V, EI->second));
+      return EI->second;
+    } else {
+      expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+      valueNumbering.insert(std::make_pair(V, nextValueNumber));
+      
+      return nextValueNumber++;
+    }
+  } else if (InsertElementInst* U = dyn_cast<InsertElementInst>(V)) {
+    Expression e = create_expression(U);
+    
+    DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+    if (EI != expressionNumbering.end()) {
+      valueNumbering.insert(std::make_pair(V, EI->second));
+      return EI->second;
+    } else {
+      expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+      valueNumbering.insert(std::make_pair(V, nextValueNumber));
+      
+      return nextValueNumber++;
+    }
+  } else if (SelectInst* U = dyn_cast<SelectInst>(V)) {
+    Expression e = create_expression(U);
+    
+    DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+    if (EI != expressionNumbering.end()) {
+      valueNumbering.insert(std::make_pair(V, EI->second));
+      return EI->second;
+    } else {
+      expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+      valueNumbering.insert(std::make_pair(V, nextValueNumber));
+      
+      return nextValueNumber++;
+    }
+  } else if (CastInst* U = dyn_cast<CastInst>(V)) {
+    Expression e = create_expression(U);
+    
+    DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+    if (EI != expressionNumbering.end()) {
+      valueNumbering.insert(std::make_pair(V, EI->second));
+      return EI->second;
+    } else {
+      expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+      valueNumbering.insert(std::make_pair(V, nextValueNumber));
+      
+      return nextValueNumber++;
+    }
+  } else if (GetElementPtrInst* U = dyn_cast<GetElementPtrInst>(V)) {
+    Expression e = create_expression(U);
+    
+    DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+    if (EI != expressionNumbering.end()) {
+      valueNumbering.insert(std::make_pair(V, EI->second));
+      return EI->second;
+    } else {
+      expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+      valueNumbering.insert(std::make_pair(V, nextValueNumber));
+      
+      return nextValueNumber++;
+    }
+  } else {
+    valueNumbering.insert(std::make_pair(V, nextValueNumber));
+    return nextValueNumber++;
+  }
+}
+
+/// lookup - Returns the value number of the specified value. Fails if
+/// the value has not yet been numbered.
+uint32_t ValueTable::lookup(Value* V) const {
+  DenseMap<Value*, uint32_t>::iterator VI = valueNumbering.find(V);
+  assert(VI != valueNumbering.end() && "Value not numbered?");
+  return VI->second;
+}
+
+/// clear - Remove all entries from the ValueTable
+void ValueTable::clear() {
+  valueNumbering.clear();
+  expressionNumbering.clear();
+  nextValueNumber = 1;
+}
+
+/// erase - Remove a value from the value numbering
+void ValueTable::erase(Value* V) {
+  valueNumbering.erase(V);
+}
+
+/// verifyRemoved - Verify that the value is removed from all internal data
+/// structures.
+void ValueTable::verifyRemoved(const Value *V) const {
+  for (DenseMap<Value*, uint32_t>::iterator
+         I = valueNumbering.begin(), E = valueNumbering.end(); I != E; ++I) {
+    assert(I->first != V && "Inst still occurs in value numbering map!");
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                                GVN Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+  struct VISIBILITY_HIDDEN ValueNumberScope {
+    ValueNumberScope* parent;
+    DenseMap<uint32_t, Value*> table;
+    
+    ValueNumberScope(ValueNumberScope* p) : parent(p) { }
+  };
+}
+
+namespace {
+
+  class VISIBILITY_HIDDEN GVN : public FunctionPass {
+    bool runOnFunction(Function &F);
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    GVN() : FunctionPass(&ID) { }
+
+  private:
+    MemoryDependenceAnalysis *MD;
+    DominatorTree *DT;
+
+    ValueTable VN;
+    DenseMap<BasicBlock*, ValueNumberScope*> localAvail;
+    
+    typedef DenseMap<Value*, SmallPtrSet<Instruction*, 4> > PhiMapType;
+    PhiMapType phiMap;
+    
+    
+    // This transformation requires dominator postdominator info
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<DominatorTree>();
+      AU.addRequired<MemoryDependenceAnalysis>();
+      AU.addRequired<AliasAnalysis>();
+      
+      AU.addPreserved<DominatorTree>();
+      AU.addPreserved<AliasAnalysis>();
+    }
+  
+    // Helper fuctions
+    // FIXME: eliminate or document these better
+    bool processLoad(LoadInst* L,
+                     SmallVectorImpl<Instruction*> &toErase);
+    bool processInstruction(Instruction* I,
+                            SmallVectorImpl<Instruction*> &toErase);
+    bool processNonLocalLoad(LoadInst* L,
+                             SmallVectorImpl<Instruction*> &toErase);
+    bool processBlock(BasicBlock* BB);
+    Value *GetValueForBlock(BasicBlock *BB, Instruction* orig,
+                            DenseMap<BasicBlock*, Value*> &Phis,
+                            bool top_level = false);
+    void dump(DenseMap<uint32_t, Value*>& d);
+    bool iterateOnFunction(Function &F);
+    Value* CollapsePhi(PHINode* p);
+    bool isSafeReplacement(PHINode* p, Instruction* inst);
+    bool performPRE(Function& F);
+    Value* lookupNumber(BasicBlock* BB, uint32_t num);
+    bool mergeBlockIntoPredecessor(BasicBlock* BB);
+    Value* AttemptRedundancyElimination(Instruction* orig, unsigned valno);
+    void cleanupGlobalSets();
+    void verifyRemoved(const Instruction *I) const;
+  };
+  
+  char GVN::ID = 0;
+}
+
+// createGVNPass - The public interface to this file...
+FunctionPass *llvm::createGVNPass() { return new GVN(); }
+
+static RegisterPass<GVN> X("gvn",
+                           "Global Value Numbering");
+
+void GVN::dump(DenseMap<uint32_t, Value*>& d) {
+  printf("{\n");
+  for (DenseMap<uint32_t, Value*>::iterator I = d.begin(),
+       E = d.end(); I != E; ++I) {
+      printf("%d\n", I->first);
+      I->second->dump();
+  }
+  printf("}\n");
+}
+
+Value* GVN::CollapsePhi(PHINode* p) {
+  Value* constVal = p->hasConstantValue();
+  if (!constVal) return 0;
+  
+  Instruction* inst = dyn_cast<Instruction>(constVal);
+  if (!inst)
+    return constVal;
+    
+  if (DT->dominates(inst, p))
+    if (isSafeReplacement(p, inst))
+      return inst;
+  return 0;
+}
+
+bool GVN::isSafeReplacement(PHINode* p, Instruction* inst) {
+  if (!isa<PHINode>(inst))
+    return true;
+  
+  for (Instruction::use_iterator UI = p->use_begin(), E = p->use_end();
+       UI != E; ++UI)
+    if (PHINode* use_phi = dyn_cast<PHINode>(UI))
+      if (use_phi->getParent() == inst->getParent())
+        return false;
+  
+  return true;
+}
+
+/// GetValueForBlock - Get the value to use within the specified basic block.
+/// available values are in Phis.
+Value *GVN::GetValueForBlock(BasicBlock *BB, Instruction* orig,
+                             DenseMap<BasicBlock*, Value*> &Phis,
+                             bool top_level) { 
+                                 
+  // If we have already computed this value, return the previously computed val.
+  DenseMap<BasicBlock*, Value*>::iterator V = Phis.find(BB);
+  if (V != Phis.end() && !top_level) return V->second;
+  
+  // If the block is unreachable, just return undef, since this path
+  // can't actually occur at runtime.
+  if (!DT->isReachableFromEntry(BB))
+    return Phis[BB] = UndefValue::get(orig->getType());
+  
+  if (BasicBlock *Pred = BB->getSinglePredecessor()) {
+    Value *ret = GetValueForBlock(Pred, orig, Phis);
+    Phis[BB] = ret;
+    return ret;
+  }
+
+  // Get the number of predecessors of this block so we can reserve space later.
+  // If there is already a PHI in it, use the #preds from it, otherwise count.
+  // Getting it from the PHI is constant time.
+  unsigned NumPreds;
+  if (PHINode *ExistingPN = dyn_cast<PHINode>(BB->begin()))
+    NumPreds = ExistingPN->getNumIncomingValues();
+  else
+    NumPreds = std::distance(pred_begin(BB), pred_end(BB));
+  
+  // Otherwise, the idom is the loop, so we need to insert a PHI node.  Do so
+  // now, then get values to fill in the incoming values for the PHI.
+  PHINode *PN = PHINode::Create(orig->getType(), orig->getName()+".rle",
+                                BB->begin());
+  PN->reserveOperandSpace(NumPreds);
+  
+  Phis.insert(std::make_pair(BB, PN));
+  
+  // Fill in the incoming values for the block.
+  for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+    Value* val = GetValueForBlock(*PI, orig, Phis);
+    PN->addIncoming(val, *PI);
+  }
+  
+  VN.getAliasAnalysis()->copyValue(orig, PN);
+  
+  // Attempt to collapse PHI nodes that are trivially redundant
+  Value* v = CollapsePhi(PN);
+  if (!v) {
+    // Cache our phi construction results
+    if (LoadInst* L = dyn_cast<LoadInst>(orig))
+      phiMap[L->getPointerOperand()].insert(PN);
+    else
+      phiMap[orig].insert(PN);
+    
+    return PN;
+  }
+    
+  PN->replaceAllUsesWith(v);
+  if (isa<PointerType>(v->getType()))
+    MD->invalidateCachedPointerInfo(v);
+
+  for (DenseMap<BasicBlock*, Value*>::iterator I = Phis.begin(),
+       E = Phis.end(); I != E; ++I)
+    if (I->second == PN)
+      I->second = v;
+
+  DEBUG(cerr << "GVN removed: " << *PN);
+  MD->removeInstruction(PN);
+  PN->eraseFromParent();
+  DEBUG(verifyRemoved(PN));
+
+  Phis[BB] = v;
+  return v;
+}
+
+/// IsValueFullyAvailableInBlock - Return true if we can prove that the value
+/// we're analyzing is fully available in the specified block.  As we go, keep
+/// track of which blocks we know are fully alive in FullyAvailableBlocks.  This
+/// map is actually a tri-state map with the following values:
+///   0) we know the block *is not* fully available.
+///   1) we know the block *is* fully available.
+///   2) we do not know whether the block is fully available or not, but we are
+///      currently speculating that it will be.
+///   3) we are speculating for this block and have used that to speculate for
+///      other blocks.
+static bool IsValueFullyAvailableInBlock(BasicBlock *BB, 
+                            DenseMap<BasicBlock*, char> &FullyAvailableBlocks) {
+  // Optimistically assume that the block is fully available and check to see
+  // if we already know about this block in one lookup.
+  std::pair<DenseMap<BasicBlock*, char>::iterator, char> IV = 
+    FullyAvailableBlocks.insert(std::make_pair(BB, 2));
+
+  // If the entry already existed for this block, return the precomputed value.
+  if (!IV.second) {
+    // If this is a speculative "available" value, mark it as being used for
+    // speculation of other blocks.
+    if (IV.first->second == 2)
+      IV.first->second = 3;
+    return IV.first->second != 0;
+  }
+  
+  // Otherwise, see if it is fully available in all predecessors.
+  pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
+  
+  // If this block has no predecessors, it isn't live-in here.
+  if (PI == PE)
+    goto SpeculationFailure;
+  
+  for (; PI != PE; ++PI)
+    // If the value isn't fully available in one of our predecessors, then it
+    // isn't fully available in this block either.  Undo our previous
+    // optimistic assumption and bail out.
+    if (!IsValueFullyAvailableInBlock(*PI, FullyAvailableBlocks))
+      goto SpeculationFailure;
+  
+  return true;
+  
+// SpeculationFailure - If we get here, we found out that this is not, after
+// all, a fully-available block.  We have a problem if we speculated on this and
+// used the speculation to mark other blocks as available.
+SpeculationFailure:
+  char &BBVal = FullyAvailableBlocks[BB];
+  
+  // If we didn't speculate on this, just return with it set to false.
+  if (BBVal == 2) {
+    BBVal = 0;
+    return false;
+  }
+
+  // If we did speculate on this value, we could have blocks set to 1 that are
+  // incorrect.  Walk the (transitive) successors of this block and mark them as
+  // 0 if set to one.
+  SmallVector<BasicBlock*, 32> BBWorklist;
+  BBWorklist.push_back(BB);
+  
+  while (!BBWorklist.empty()) {
+    BasicBlock *Entry = BBWorklist.pop_back_val();
+    // Note that this sets blocks to 0 (unavailable) if they happen to not
+    // already be in FullyAvailableBlocks.  This is safe.
+    char &EntryVal = FullyAvailableBlocks[Entry];
+    if (EntryVal == 0) continue;  // Already unavailable.
+
+    // Mark as unavailable.
+    EntryVal = 0;
+    
+    for (succ_iterator I = succ_begin(Entry), E = succ_end(Entry); I != E; ++I)
+      BBWorklist.push_back(*I);
+  }
+  
+  return false;
+}
+
+/// processNonLocalLoad - Attempt to eliminate a load whose dependencies are
+/// non-local by performing PHI construction.
+bool GVN::processNonLocalLoad(LoadInst *LI,
+                              SmallVectorImpl<Instruction*> &toErase) {
+  // Find the non-local dependencies of the load.
+  SmallVector<MemoryDependenceAnalysis::NonLocalDepEntry, 64> Deps; 
+  MD->getNonLocalPointerDependency(LI->getOperand(0), true, LI->getParent(),
+                                   Deps);
+  //DEBUG(cerr << "INVESTIGATING NONLOCAL LOAD: " << Deps.size() << *LI);
+  
+  // If we had to process more than one hundred blocks to find the
+  // dependencies, this load isn't worth worrying about.  Optimizing
+  // it will be too expensive.
+  if (Deps.size() > 100)
+    return false;
+
+  // If we had a phi translation failure, we'll have a single entry which is a
+  // clobber in the current block.  Reject this early.
+  if (Deps.size() == 1 && Deps[0].second.isClobber())
+    return false;
+  
+  // Filter out useless results (non-locals, etc).  Keep track of the blocks
+  // where we have a value available in repl, also keep track of whether we see
+  // dependencies that produce an unknown value for the load (such as a call
+  // that could potentially clobber the load).
+  SmallVector<std::pair<BasicBlock*, Value*>, 16> ValuesPerBlock;
+  SmallVector<BasicBlock*, 16> UnavailableBlocks;
+  
+  for (unsigned i = 0, e = Deps.size(); i != e; ++i) {
+    BasicBlock *DepBB = Deps[i].first;
+    MemDepResult DepInfo = Deps[i].second;
+    
+    if (DepInfo.isClobber()) {
+      UnavailableBlocks.push_back(DepBB);
+      continue;
+    }
+    
+    Instruction *DepInst = DepInfo.getInst();
+    
+    // Loading the allocation -> undef.
+    if (isa<AllocationInst>(DepInst)) {
+      ValuesPerBlock.push_back(std::make_pair(DepBB, 
+                                              UndefValue::get(LI->getType())));
+      continue;
+    }
+  
+    if (StoreInst* S = dyn_cast<StoreInst>(DepInst)) {
+      // Reject loads and stores that are to the same address but are of 
+      // different types.
+      // NOTE: 403.gcc does have this case (e.g. in readonly_fields_p) because
+      // of bitfield access, it would be interesting to optimize for it at some
+      // point.
+      if (S->getOperand(0)->getType() != LI->getType()) {
+        UnavailableBlocks.push_back(DepBB);
+        continue;
+      }
+      
+      ValuesPerBlock.push_back(std::make_pair(DepBB, S->getOperand(0)));
+      
+    } else if (LoadInst* LD = dyn_cast<LoadInst>(DepInst)) {
+      if (LD->getType() != LI->getType()) {
+        UnavailableBlocks.push_back(DepBB);
+        continue;
+      }
+      ValuesPerBlock.push_back(std::make_pair(DepBB, LD));
+    } else {
+      UnavailableBlocks.push_back(DepBB);
+      continue;
+    }
+  }
+  
+  // If we have no predecessors that produce a known value for this load, exit
+  // early.
+  if (ValuesPerBlock.empty()) return false;
+  
+  // If all of the instructions we depend on produce a known value for this
+  // load, then it is fully redundant and we can use PHI insertion to compute
+  // its value.  Insert PHIs and remove the fully redundant value now.
+  if (UnavailableBlocks.empty()) {
+    // Use cached PHI construction information from previous runs
+    SmallPtrSet<Instruction*, 4> &p = phiMap[LI->getPointerOperand()];
+    // FIXME: What does phiMap do? Are we positive it isn't getting invalidated?
+    for (SmallPtrSet<Instruction*, 4>::iterator I = p.begin(), E = p.end();
+         I != E; ++I) {
+      if ((*I)->getParent() == LI->getParent()) {
+        DEBUG(cerr << "GVN REMOVING NONLOCAL LOAD #1: " << *LI);
+        LI->replaceAllUsesWith(*I);
+        if (isa<PointerType>((*I)->getType()))
+          MD->invalidateCachedPointerInfo(*I);
+        toErase.push_back(LI);
+        NumGVNLoad++;
+        return true;
+      }
+      
+      ValuesPerBlock.push_back(std::make_pair((*I)->getParent(), *I));
+    }
+    
+    DEBUG(cerr << "GVN REMOVING NONLOCAL LOAD: " << *LI);
+    
+    DenseMap<BasicBlock*, Value*> BlockReplValues;
+    BlockReplValues.insert(ValuesPerBlock.begin(), ValuesPerBlock.end());
+    // Perform PHI construction.
+    Value* v = GetValueForBlock(LI->getParent(), LI, BlockReplValues, true);
+    LI->replaceAllUsesWith(v);
+    
+    if (isa<PHINode>(v))
+      v->takeName(LI);
+    if (isa<PointerType>(v->getType()))
+      MD->invalidateCachedPointerInfo(v);
+    toErase.push_back(LI);
+    NumGVNLoad++;
+    return true;
+  }
+  
+  if (!EnablePRE || !EnableLoadPRE)
+    return false;
+
+  // Okay, we have *some* definitions of the value.  This means that the value
+  // is available in some of our (transitive) predecessors.  Lets think about
+  // doing PRE of this load.  This will involve inserting a new load into the
+  // predecessor when it's not available.  We could do this in general, but
+  // prefer to not increase code size.  As such, we only do this when we know
+  // that we only have to insert *one* load (which means we're basically moving
+  // the load, not inserting a new one).
+  
+  SmallPtrSet<BasicBlock *, 4> Blockers;
+  for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i)
+    Blockers.insert(UnavailableBlocks[i]);
+
+  // Lets find first basic block with more than one predecessor.  Walk backwards
+  // through predecessors if needed.
+  BasicBlock *LoadBB = LI->getParent();
+  BasicBlock *TmpBB = LoadBB;
+
+  bool isSinglePred = false;
+  while (TmpBB->getSinglePredecessor()) {
+    isSinglePred = true;
+    TmpBB = TmpBB->getSinglePredecessor();
+    if (!TmpBB) // If haven't found any, bail now.
+      return false;
+    if (TmpBB == LoadBB) // Infinite (unreachable) loop.
+      return false;
+    if (Blockers.count(TmpBB))
+      return false;
+  }
+  
+  assert(TmpBB);
+  LoadBB = TmpBB;
+  
+  // If we have a repl set with LI itself in it, this means we have a loop where
+  // at least one of the values is LI.  Since this means that we won't be able
+  // to eliminate LI even if we insert uses in the other predecessors, we will
+  // end up increasing code size.  Reject this by scanning for LI.
+  for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i)
+    if (ValuesPerBlock[i].second == LI)
+      return false;
+  
+  if (isSinglePred) {
+    bool isHot = false;
+    for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i)
+      if (Instruction *I = dyn_cast<Instruction>(ValuesPerBlock[i].second))
+	// "Hot" Instruction is in some loop (because it dominates its dep. 
+	// instruction).
+	if (DT->dominates(LI, I)) { 
+	  isHot = true;
+	  break;
+	}
+
+    // We are interested only in "hot" instructions. We don't want to do any
+    // mis-optimizations here.
+    if (!isHot)
+      return false;
+  }
+
+  // Okay, we have some hope :).  Check to see if the loaded value is fully
+  // available in all but one predecessor.
+  // FIXME: If we could restructure the CFG, we could make a common pred with
+  // all the preds that don't have an available LI and insert a new load into
+  // that one block.
+  BasicBlock *UnavailablePred = 0;
+
+  DenseMap<BasicBlock*, char> FullyAvailableBlocks;
+  for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i)
+    FullyAvailableBlocks[ValuesPerBlock[i].first] = true;
+  for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i)
+    FullyAvailableBlocks[UnavailableBlocks[i]] = false;
+
+  for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB);
+       PI != E; ++PI) {
+    if (IsValueFullyAvailableInBlock(*PI, FullyAvailableBlocks))
+      continue;
+    
+    // If this load is not available in multiple predecessors, reject it.
+    if (UnavailablePred && UnavailablePred != *PI)
+      return false;
+    UnavailablePred = *PI;
+  }
+  
+  assert(UnavailablePred != 0 &&
+         "Fully available value should be eliminated above!");
+  
+  // If the loaded pointer is PHI node defined in this block, do PHI translation
+  // to get its value in the predecessor.
+  Value *LoadPtr = LI->getOperand(0)->DoPHITranslation(LoadBB, UnavailablePred);
+  
+  // Make sure the value is live in the predecessor.  If it was defined by a
+  // non-PHI instruction in this block, we don't know how to recompute it above.
+  if (Instruction *LPInst = dyn_cast<Instruction>(LoadPtr))
+    if (!DT->dominates(LPInst->getParent(), UnavailablePred)) {
+      DEBUG(cerr << "COULDN'T PRE LOAD BECAUSE PTR IS UNAVAILABLE IN PRED: "
+                 << *LPInst << *LI << "\n");
+      return false;
+    }
+  
+  // We don't currently handle critical edges :(
+  if (UnavailablePred->getTerminator()->getNumSuccessors() != 1) {
+    DEBUG(cerr << "COULD NOT PRE LOAD BECAUSE OF CRITICAL EDGE '"
+                << UnavailablePred->getName() << "': " << *LI);
+    return false;
+  }
+  
+  // Okay, we can eliminate this load by inserting a reload in the predecessor
+  // and using PHI construction to get the value in the other predecessors, do
+  // it.
+  DEBUG(cerr << "GVN REMOVING PRE LOAD: " << *LI);
+  
+  Value *NewLoad = new LoadInst(LoadPtr, LI->getName()+".pre", false,
+                                LI->getAlignment(),
+                                UnavailablePred->getTerminator());
+  
+  SmallPtrSet<Instruction*, 4> &p = phiMap[LI->getPointerOperand()];
+  for (SmallPtrSet<Instruction*, 4>::iterator I = p.begin(), E = p.end();
+       I != E; ++I)
+    ValuesPerBlock.push_back(std::make_pair((*I)->getParent(), *I));
+  
+  DenseMap<BasicBlock*, Value*> BlockReplValues;
+  BlockReplValues.insert(ValuesPerBlock.begin(), ValuesPerBlock.end());
+  BlockReplValues[UnavailablePred] = NewLoad;
+  
+  // Perform PHI construction.
+  Value* v = GetValueForBlock(LI->getParent(), LI, BlockReplValues, true);
+  LI->replaceAllUsesWith(v);
+  if (isa<PHINode>(v))
+    v->takeName(LI);
+  if (isa<PointerType>(v->getType()))
+    MD->invalidateCachedPointerInfo(v);
+  toErase.push_back(LI);
+  NumPRELoad++;
+  return true;
+}
+
+/// processLoad - Attempt to eliminate a load, first by eliminating it
+/// locally, and then attempting non-local elimination if that fails.
+bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) {
+  if (L->isVolatile())
+    return false;
+  
+  Value* pointer = L->getPointerOperand();
+
+  // ... to a pointer that has been loaded from before...
+  MemDepResult dep = MD->getDependency(L);
+  
+  // If the value isn't available, don't do anything!
+  if (dep.isClobber()) {
+    DEBUG(
+      // fast print dep, using operator<< on instruction would be too slow
+      DOUT << "GVN: load ";
+      WriteAsOperand(*DOUT.stream(), L);
+      Instruction *I = dep.getInst();
+      DOUT << " is clobbered by " << *I;
+    );
+    return false;
+  }
+
+  // If it is defined in another block, try harder.
+  if (dep.isNonLocal())
+    return processNonLocalLoad(L, toErase);
+
+  Instruction *DepInst = dep.getInst();
+  if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) {
+    // Only forward substitute stores to loads of the same type.
+    // FIXME: Could do better!
+    if (DepSI->getPointerOperand()->getType() != pointer->getType())
+      return false;
+    
+    // Remove it!
+    L->replaceAllUsesWith(DepSI->getOperand(0));
+    if (isa<PointerType>(DepSI->getOperand(0)->getType()))
+      MD->invalidateCachedPointerInfo(DepSI->getOperand(0));
+    toErase.push_back(L);
+    NumGVNLoad++;
+    return true;
+  }
+
+  if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInst)) {
+    // Only forward substitute stores to loads of the same type.
+    // FIXME: Could do better! load i32 -> load i8 -> truncate on little endian.
+    if (DepLI->getType() != L->getType())
+      return false;
+    
+    // Remove it!
+    L->replaceAllUsesWith(DepLI);
+    if (isa<PointerType>(DepLI->getType()))
+      MD->invalidateCachedPointerInfo(DepLI);
+    toErase.push_back(L);
+    NumGVNLoad++;
+    return true;
+  }
+  
+  // If this load really doesn't depend on anything, then we must be loading an
+  // undef value.  This can happen when loading for a fresh allocation with no
+  // intervening stores, for example.
+  if (isa<AllocationInst>(DepInst)) {
+    L->replaceAllUsesWith(UndefValue::get(L->getType()));
+    toErase.push_back(L);
+    NumGVNLoad++;
+    return true;
+  }
+
+  return false;
+}
+
+Value* GVN::lookupNumber(BasicBlock* BB, uint32_t num) {
+  DenseMap<BasicBlock*, ValueNumberScope*>::iterator I = localAvail.find(BB);
+  if (I == localAvail.end())
+    return 0;
+  
+  ValueNumberScope* locals = I->second;
+  
+  while (locals) {
+    DenseMap<uint32_t, Value*>::iterator I = locals->table.find(num);
+    if (I != locals->table.end())
+      return I->second;
+    else
+      locals = locals->parent;
+  }
+  
+  return 0;
+}
+
+/// AttemptRedundancyElimination - If the "fast path" of redundancy elimination
+/// by inheritance from the dominator fails, see if we can perform phi 
+/// construction to eliminate the redundancy.
+Value* GVN::AttemptRedundancyElimination(Instruction* orig, unsigned valno) {
+  BasicBlock* BaseBlock = orig->getParent();
+  
+  SmallPtrSet<BasicBlock*, 4> Visited;
+  SmallVector<BasicBlock*, 8> Stack;
+  Stack.push_back(BaseBlock);
+  
+  DenseMap<BasicBlock*, Value*> Results;
+  
+  // Walk backwards through our predecessors, looking for instances of the
+  // value number we're looking for.  Instances are recorded in the Results
+  // map, which is then used to perform phi construction.
+  while (!Stack.empty()) {
+    BasicBlock* Current = Stack.back();
+    Stack.pop_back();
+    
+    // If we've walked all the way to a proper dominator, then give up. Cases
+    // where the instance is in the dominator will have been caught by the fast
+    // path, and any cases that require phi construction further than this are
+    // probably not worth it anyways.  Note that this is a SIGNIFICANT compile
+    // time improvement.
+    if (DT->properlyDominates(Current, orig->getParent())) return 0;
+    
+    DenseMap<BasicBlock*, ValueNumberScope*>::iterator LA =
+                                                       localAvail.find(Current);
+    if (LA == localAvail.end()) return 0;
+    DenseMap<uint32_t, Value*>::iterator V = LA->second->table.find(valno);
+    
+    if (V != LA->second->table.end()) {
+      // Found an instance, record it.
+      Results.insert(std::make_pair(Current, V->second));
+      continue;
+    }
+    
+    // If we reach the beginning of the function, then give up.
+    if (pred_begin(Current) == pred_end(Current))
+      return 0;
+    
+    for (pred_iterator PI = pred_begin(Current), PE = pred_end(Current);
+         PI != PE; ++PI)
+      if (Visited.insert(*PI))
+        Stack.push_back(*PI);
+  }
+  
+  // If we didn't find instances, give up.  Otherwise, perform phi construction.
+  if (Results.size() == 0)
+    return 0;
+  else
+    return GetValueForBlock(BaseBlock, orig, Results, true);
+}
+
+/// processInstruction - When calculating availability, handle an instruction
+/// by inserting it into the appropriate sets
+bool GVN::processInstruction(Instruction *I,
+                             SmallVectorImpl<Instruction*> &toErase) {
+  if (LoadInst* L = dyn_cast<LoadInst>(I)) {
+    bool changed = processLoad(L, toErase);
+    
+    if (!changed) {
+      unsigned num = VN.lookup_or_add(L);
+      localAvail[I->getParent()]->table.insert(std::make_pair(num, L));
+    }
+    
+    return changed;
+  }
+  
+  uint32_t nextNum = VN.getNextUnusedValueNumber();
+  unsigned num = VN.lookup_or_add(I);
+  
+  if (BranchInst* BI = dyn_cast<BranchInst>(I)) {
+    localAvail[I->getParent()]->table.insert(std::make_pair(num, I));
+    
+    if (!BI->isConditional() || isa<Constant>(BI->getCondition()))
+      return false;
+    
+    Value* branchCond = BI->getCondition();
+    uint32_t condVN = VN.lookup_or_add(branchCond);
+    
+    BasicBlock* trueSucc = BI->getSuccessor(0);
+    BasicBlock* falseSucc = BI->getSuccessor(1);
+    
+    if (trueSucc->getSinglePredecessor())
+      localAvail[trueSucc]->table[condVN] = ConstantInt::getTrue();
+    if (falseSucc->getSinglePredecessor())
+      localAvail[falseSucc]->table[condVN] = ConstantInt::getFalse();
+
+    return false;
+    
+  // Allocations are always uniquely numbered, so we can save time and memory
+  // by fast failing them.  
+  } else if (isa<AllocationInst>(I) || isa<TerminatorInst>(I)) {
+    localAvail[I->getParent()]->table.insert(std::make_pair(num, I));
+    return false;
+  }
+  
+  // Collapse PHI nodes
+  if (PHINode* p = dyn_cast<PHINode>(I)) {
+    Value* constVal = CollapsePhi(p);
+    
+    if (constVal) {
+      for (PhiMapType::iterator PI = phiMap.begin(), PE = phiMap.end();
+           PI != PE; ++PI)
+        PI->second.erase(p);
+        
+      p->replaceAllUsesWith(constVal);
+      if (isa<PointerType>(constVal->getType()))
+        MD->invalidateCachedPointerInfo(constVal);
+      VN.erase(p);
+      
+      toErase.push_back(p);
+    } else {
+      localAvail[I->getParent()]->table.insert(std::make_pair(num, I));
+    }
+  
+  // If the number we were assigned was a brand new VN, then we don't
+  // need to do a lookup to see if the number already exists
+  // somewhere in the domtree: it can't!
+  } else if (num == nextNum) {
+    localAvail[I->getParent()]->table.insert(std::make_pair(num, I));
+    
+  // Perform fast-path value-number based elimination of values inherited from
+  // dominators.
+  } else if (Value* repl = lookupNumber(I->getParent(), num)) {
+    // Remove it!
+    VN.erase(I);
+    I->replaceAllUsesWith(repl);
+    if (isa<PointerType>(repl->getType()))
+      MD->invalidateCachedPointerInfo(repl);
+    toErase.push_back(I);
+    return true;
+
+#if 0
+  // Perform slow-pathvalue-number based elimination with phi construction.
+  } else if (Value* repl = AttemptRedundancyElimination(I, num)) {
+    // Remove it!
+    VN.erase(I);
+    I->replaceAllUsesWith(repl);
+    if (isa<PointerType>(repl->getType()))
+      MD->invalidateCachedPointerInfo(repl);
+    toErase.push_back(I);
+    return true;
+#endif
+  } else {
+    localAvail[I->getParent()]->table.insert(std::make_pair(num, I));
+  }
+  
+  return false;
+}
+
+/// runOnFunction - This is the main transformation entry point for a function.
+bool GVN::runOnFunction(Function& F) {
+  MD = &getAnalysis<MemoryDependenceAnalysis>();
+  DT = &getAnalysis<DominatorTree>();
+  VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>());
+  VN.setMemDep(MD);
+  VN.setDomTree(DT);
+  
+  bool changed = false;
+  bool shouldContinue = true;
+  
+  // Merge unconditional branches, allowing PRE to catch more
+  // optimization opportunities.
+  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) {
+    BasicBlock* BB = FI;
+    ++FI;
+    bool removedBlock = MergeBlockIntoPredecessor(BB, this);
+    if (removedBlock) NumGVNBlocks++;
+    
+    changed |= removedBlock;
+  }
+  
+  unsigned Iteration = 0;
+  
+  while (shouldContinue) {
+    DEBUG(cerr << "GVN iteration: " << Iteration << "\n");
+    shouldContinue = iterateOnFunction(F);
+    changed |= shouldContinue;
+    ++Iteration;
+  }
+  
+  if (EnablePRE) {
+    bool PREChanged = true;
+    while (PREChanged) {
+      PREChanged = performPRE(F);
+      changed |= PREChanged;
+    }
+  }
+  // FIXME: Should perform GVN again after PRE does something.  PRE can move
+  // computations into blocks where they become fully redundant.  Note that
+  // we can't do this until PRE's critical edge splitting updates memdep.
+  // Actually, when this happens, we should just fully integrate PRE into GVN.
+
+  cleanupGlobalSets();
+
+  return changed;
+}
+
+
+bool GVN::processBlock(BasicBlock* BB) {
+  // FIXME: Kill off toErase by doing erasing eagerly in a helper function (and
+  // incrementing BI before processing an instruction).
+  SmallVector<Instruction*, 8> toErase;
+  bool changed_function = false;
+  
+  for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
+       BI != BE;) {
+    changed_function |= processInstruction(BI, toErase);
+    if (toErase.empty()) {
+      ++BI;
+      continue;
+    }
+    
+    // If we need some instructions deleted, do it now.
+    NumGVNInstr += toErase.size();
+    
+    // Avoid iterator invalidation.
+    bool AtStart = BI == BB->begin();
+    if (!AtStart)
+      --BI;
+
+    for (SmallVector<Instruction*, 4>::iterator I = toErase.begin(),
+         E = toErase.end(); I != E; ++I) {
+      DEBUG(cerr << "GVN removed: " << **I);
+      MD->removeInstruction(*I);
+      (*I)->eraseFromParent();
+      DEBUG(verifyRemoved(*I));
+    }
+    toErase.clear();
+
+    if (AtStart)
+      BI = BB->begin();
+    else
+      ++BI;
+  }
+  
+  return changed_function;
+}
+
+/// performPRE - Perform a purely local form of PRE that looks for diamond
+/// control flow patterns and attempts to perform simple PRE at the join point.
+bool GVN::performPRE(Function& F) {
+  bool Changed = false;
+  SmallVector<std::pair<TerminatorInst*, unsigned>, 4> toSplit;
+  DenseMap<BasicBlock*, Value*> predMap;
+  for (df_iterator<BasicBlock*> DI = df_begin(&F.getEntryBlock()),
+       DE = df_end(&F.getEntryBlock()); DI != DE; ++DI) {
+    BasicBlock* CurrentBlock = *DI;
+    
+    // Nothing to PRE in the entry block.
+    if (CurrentBlock == &F.getEntryBlock()) continue;
+    
+    for (BasicBlock::iterator BI = CurrentBlock->begin(),
+         BE = CurrentBlock->end(); BI != BE; ) {
+      Instruction *CurInst = BI++;
+
+      if (isa<AllocationInst>(CurInst) || isa<TerminatorInst>(CurInst) ||
+          isa<PHINode>(CurInst) || (CurInst->getType() == Type::VoidTy) ||
+          CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() ||
+          isa<DbgInfoIntrinsic>(CurInst))
+        continue;
+
+      uint32_t valno = VN.lookup(CurInst);
+      
+      // Look for the predecessors for PRE opportunities.  We're
+      // only trying to solve the basic diamond case, where
+      // a value is computed in the successor and one predecessor,
+      // but not the other.  We also explicitly disallow cases
+      // where the successor is its own predecessor, because they're
+      // more complicated to get right.
+      unsigned numWith = 0;
+      unsigned numWithout = 0;
+      BasicBlock* PREPred = 0;
+      predMap.clear();
+
+      for (pred_iterator PI = pred_begin(CurrentBlock),
+           PE = pred_end(CurrentBlock); PI != PE; ++PI) {
+        // We're not interested in PRE where the block is its
+        // own predecessor, on in blocks with predecessors
+        // that are not reachable.
+        if (*PI == CurrentBlock) {
+          numWithout = 2;
+          break;
+        } else if (!localAvail.count(*PI))  {
+          numWithout = 2;
+          break;
+        }
+        
+        DenseMap<uint32_t, Value*>::iterator predV = 
+                                            localAvail[*PI]->table.find(valno);
+        if (predV == localAvail[*PI]->table.end()) {
+          PREPred = *PI;
+          numWithout++;
+        } else if (predV->second == CurInst) {
+          numWithout = 2;
+        } else {
+          predMap[*PI] = predV->second;
+          numWith++;
+        }
+      }
+      
+      // Don't do PRE when it might increase code size, i.e. when
+      // we would need to insert instructions in more than one pred.
+      if (numWithout != 1 || numWith == 0)
+        continue;
+      
+      // We can't do PRE safely on a critical edge, so instead we schedule
+      // the edge to be split and perform the PRE the next time we iterate
+      // on the function.
+      unsigned succNum = 0;
+      for (unsigned i = 0, e = PREPred->getTerminator()->getNumSuccessors();
+           i != e; ++i)
+        if (PREPred->getTerminator()->getSuccessor(i) == CurrentBlock) {
+          succNum = i;
+          break;
+        }
+        
+      if (isCriticalEdge(PREPred->getTerminator(), succNum)) {
+        toSplit.push_back(std::make_pair(PREPred->getTerminator(), succNum));
+        continue;
+      }
+      
+      // Instantiate the expression the in predecessor that lacked it.
+      // Because we are going top-down through the block, all value numbers
+      // will be available in the predecessor by the time we need them.  Any
+      // that weren't original present will have been instantiated earlier
+      // in this loop.
+      Instruction* PREInstr = CurInst->clone();
+      bool success = true;
+      for (unsigned i = 0, e = CurInst->getNumOperands(); i != e; ++i) {
+        Value *Op = PREInstr->getOperand(i);
+        if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op))
+          continue;
+        
+        if (Value *V = lookupNumber(PREPred, VN.lookup(Op))) {
+          PREInstr->setOperand(i, V);
+        } else {
+          success = false;
+          break;
+        }
+      }
+      
+      // Fail out if we encounter an operand that is not available in
+      // the PRE predecessor.  This is typically because of loads which 
+      // are not value numbered precisely.
+      if (!success) {
+        delete PREInstr;
+        DEBUG(verifyRemoved(PREInstr));
+        continue;
+      }
+      
+      PREInstr->insertBefore(PREPred->getTerminator());
+      PREInstr->setName(CurInst->getName() + ".pre");
+      predMap[PREPred] = PREInstr;
+      VN.add(PREInstr, valno);
+      NumGVNPRE++;
+      
+      // Update the availability map to include the new instruction.
+      localAvail[PREPred]->table.insert(std::make_pair(valno, PREInstr));
+      
+      // Create a PHI to make the value available in this block.
+      PHINode* Phi = PHINode::Create(CurInst->getType(),
+                                     CurInst->getName() + ".pre-phi",
+                                     CurrentBlock->begin());
+      for (pred_iterator PI = pred_begin(CurrentBlock),
+           PE = pred_end(CurrentBlock); PI != PE; ++PI)
+        Phi->addIncoming(predMap[*PI], *PI);
+      
+      VN.add(Phi, valno);
+      localAvail[CurrentBlock]->table[valno] = Phi;
+      
+      CurInst->replaceAllUsesWith(Phi);
+      if (isa<PointerType>(Phi->getType()))
+        MD->invalidateCachedPointerInfo(Phi);
+      VN.erase(CurInst);
+      
+      DEBUG(cerr << "GVN PRE removed: " << *CurInst);
+      MD->removeInstruction(CurInst);
+      CurInst->eraseFromParent();
+      DEBUG(verifyRemoved(CurInst));
+      Changed = true;
+    }
+  }
+  
+  for (SmallVector<std::pair<TerminatorInst*, unsigned>, 4>::iterator
+       I = toSplit.begin(), E = toSplit.end(); I != E; ++I)
+    SplitCriticalEdge(I->first, I->second, this);
+  
+  return Changed || toSplit.size();
+}
+
+/// iterateOnFunction - Executes one iteration of GVN
+bool GVN::iterateOnFunction(Function &F) {
+  cleanupGlobalSets();
+
+  for (df_iterator<DomTreeNode*> DI = df_begin(DT->getRootNode()),
+       DE = df_end(DT->getRootNode()); DI != DE; ++DI) {
+    if (DI->getIDom())
+      localAvail[DI->getBlock()] =
+                   new ValueNumberScope(localAvail[DI->getIDom()->getBlock()]);
+    else
+      localAvail[DI->getBlock()] = new ValueNumberScope(0);
+  }
+
+  // Top-down walk of the dominator tree
+  bool changed = false;
+#if 0
+  // Needed for value numbering with phi construction to work.
+  ReversePostOrderTraversal<Function*> RPOT(&F);
+  for (ReversePostOrderTraversal<Function*>::rpo_iterator RI = RPOT.begin(),
+       RE = RPOT.end(); RI != RE; ++RI)
+    changed |= processBlock(*RI);
+#else
+  for (df_iterator<DomTreeNode*> DI = df_begin(DT->getRootNode()),
+       DE = df_end(DT->getRootNode()); DI != DE; ++DI)
+    changed |= processBlock(DI->getBlock());
+#endif
+
+  return changed;
+}
+
+void GVN::cleanupGlobalSets() {
+  VN.clear();
+  phiMap.clear();
+
+  for (DenseMap<BasicBlock*, ValueNumberScope*>::iterator
+       I = localAvail.begin(), E = localAvail.end(); I != E; ++I)
+    delete I->second;
+  localAvail.clear();
+}
+
+/// verifyRemoved - Verify that the specified instruction does not occur in our
+/// internal data structures.
+void GVN::verifyRemoved(const Instruction *Inst) const {
+  VN.verifyRemoved(Inst);
+
+  // Walk through the PHI map to make sure the instruction isn't hiding in there
+  // somewhere.
+  for (PhiMapType::iterator
+         I = phiMap.begin(), E = phiMap.end(); I != E; ++I) {
+    assert(I->first != Inst && "Inst is still a key in PHI map!");
+
+    for (SmallPtrSet<Instruction*, 4>::iterator
+           II = I->second.begin(), IE = I->second.end(); II != IE; ++II) {
+      assert(*II != Inst && "Inst is still a value in PHI map!");
+    }
+  }
+
+  // Walk through the value number scope to make sure the instruction isn't
+  // ferreted away in it.
+  for (DenseMap<BasicBlock*, ValueNumberScope*>::iterator
+         I = localAvail.begin(), E = localAvail.end(); I != E; ++I) {
+    const ValueNumberScope *VNS = I->second;
+
+    while (VNS) {
+      for (DenseMap<uint32_t, Value*>::iterator
+             II = VNS->table.begin(), IE = VNS->table.end(); II != IE; ++II) {
+        assert(II->second != Inst && "Inst still in value numbering scope!");
+      }
+
+      VNS = VNS->parent;
+    }
+  }
+}
diff --git a/lib/Transforms/Scalar/GVNPRE.cpp b/lib/Transforms/Scalar/GVNPRE.cpp
new file mode 100644
index 0000000..e3b0937
--- /dev/null
+++ b/lib/Transforms/Scalar/GVNPRE.cpp
@@ -0,0 +1,1885 @@
+//===- GVNPRE.cpp - Eliminate redundant values and expressions ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs a hybrid of global value numbering and partial redundancy
+// elimination, known as GVN-PRE.  It performs partial redundancy elimination on
+// values, rather than lexical expressions, allowing a more comprehensive view 
+// the optimization.  It replaces redundant values with uses of earlier 
+// occurences of the same value.  While this is beneficial in that it eliminates
+// unneeded computation, it also increases register pressure by creating large
+// live ranges, and should be used with caution on platforms that are very 
+// sensitive to register pressure.
+//
+// Note that this pass does the value numbering itself, it does not use the
+// ValueNumbering analysis passes.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "gvnpre"
+#include "llvm/Value.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Instructions.h"
+#include "llvm/Function.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include <algorithm>
+#include <deque>
+#include <map>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//                         ValueTable Class
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// This class holds the mapping between values and value numbers.  It is used
+/// as an efficient mechanism to determine the expression-wise equivalence of
+/// two values.
+
+struct Expression {
+  enum ExpressionOpcode { ADD, SUB, MUL, UDIV, SDIV, FDIV, UREM, SREM, 
+                          FREM, SHL, LSHR, ASHR, AND, OR, XOR, ICMPEQ, 
+                          ICMPNE, ICMPUGT, ICMPUGE, ICMPULT, ICMPULE, 
+                          ICMPSGT, ICMPSGE, ICMPSLT, ICMPSLE, FCMPOEQ, 
+                          FCMPOGT, FCMPOGE, FCMPOLT, FCMPOLE, FCMPONE, 
+                          FCMPORD, FCMPUNO, FCMPUEQ, FCMPUGT, FCMPUGE, 
+                          FCMPULT, FCMPULE, FCMPUNE, EXTRACT, INSERT,
+                          SHUFFLE, SELECT, TRUNC, ZEXT, SEXT, FPTOUI,
+                          FPTOSI, UITOFP, SITOFP, FPTRUNC, FPEXT, 
+                          PTRTOINT, INTTOPTR, BITCAST, GEP, EMPTY,
+                          TOMBSTONE };
+
+  ExpressionOpcode opcode;
+  const Type* type;
+  uint32_t firstVN;
+  uint32_t secondVN;
+  uint32_t thirdVN;
+  SmallVector<uint32_t, 4> varargs;
+  
+  Expression() { }
+  explicit Expression(ExpressionOpcode o) : opcode(o) { }
+  
+  bool operator==(const Expression &other) const {
+    if (opcode != other.opcode)
+      return false;
+    else if (opcode == EMPTY || opcode == TOMBSTONE)
+      return true;
+    else if (type != other.type)
+      return false;
+    else if (firstVN != other.firstVN)
+      return false;
+    else if (secondVN != other.secondVN)
+      return false;
+    else if (thirdVN != other.thirdVN)
+      return false;
+    else {
+      if (varargs.size() != other.varargs.size())
+        return false;
+      
+      for (size_t i = 0; i < varargs.size(); ++i)
+        if (varargs[i] != other.varargs[i])
+          return false;
+    
+      return true;
+    }
+  }
+  
+  bool operator!=(const Expression &other) const {
+    if (opcode != other.opcode)
+      return true;
+    else if (opcode == EMPTY || opcode == TOMBSTONE)
+      return false;
+    else if (type != other.type)
+      return true;
+    else if (firstVN != other.firstVN)
+      return true;
+    else if (secondVN != other.secondVN)
+      return true;
+    else if (thirdVN != other.thirdVN)
+      return true;
+    else {
+      if (varargs.size() != other.varargs.size())
+        return true;
+      
+      for (size_t i = 0; i < varargs.size(); ++i)
+        if (varargs[i] != other.varargs[i])
+          return true;
+    
+      return false;
+    }
+  }
+};
+
+}
+
+namespace {
+  class VISIBILITY_HIDDEN ValueTable {
+    private:
+      DenseMap<Value*, uint32_t> valueNumbering;
+      DenseMap<Expression, uint32_t> expressionNumbering;
+  
+      uint32_t nextValueNumber;
+    
+      Expression::ExpressionOpcode getOpcode(BinaryOperator* BO);
+      Expression::ExpressionOpcode getOpcode(CmpInst* C);
+      Expression::ExpressionOpcode getOpcode(CastInst* C);
+      Expression create_expression(BinaryOperator* BO);
+      Expression create_expression(CmpInst* C);
+      Expression create_expression(ShuffleVectorInst* V);
+      Expression create_expression(ExtractElementInst* C);
+      Expression create_expression(InsertElementInst* V);
+      Expression create_expression(SelectInst* V);
+      Expression create_expression(CastInst* C);
+      Expression create_expression(GetElementPtrInst* G);
+    public:
+      ValueTable() { nextValueNumber = 1; }
+      uint32_t lookup_or_add(Value* V);
+      uint32_t lookup(Value* V) const;
+      void add(Value* V, uint32_t num);
+      void clear();
+      void erase(Value* v);
+      unsigned size();
+  };
+}
+
+namespace llvm {
+template <> struct DenseMapInfo<Expression> {
+  static inline Expression getEmptyKey() {
+    return Expression(Expression::EMPTY);
+  }
+  
+  static inline Expression getTombstoneKey() {
+    return Expression(Expression::TOMBSTONE);
+  }
+  
+  static unsigned getHashValue(const Expression e) {
+    unsigned hash = e.opcode;
+    
+    hash = e.firstVN + hash * 37;
+    hash = e.secondVN + hash * 37;
+    hash = e.thirdVN + hash * 37;
+    
+    hash = ((unsigned)((uintptr_t)e.type >> 4) ^
+            (unsigned)((uintptr_t)e.type >> 9)) +
+           hash * 37;
+    
+    for (SmallVector<uint32_t, 4>::const_iterator I = e.varargs.begin(),
+         E = e.varargs.end(); I != E; ++I)
+      hash = *I + hash * 37;
+    
+    return hash;
+  }
+  static bool isEqual(const Expression &LHS, const Expression &RHS) {
+    return LHS == RHS;
+  }
+  static bool isPod() { return true; }
+};
+}
+
+//===----------------------------------------------------------------------===//
+//                     ValueTable Internal Functions
+//===----------------------------------------------------------------------===//
+Expression::ExpressionOpcode 
+                             ValueTable::getOpcode(BinaryOperator* BO) {
+  switch(BO->getOpcode()) {
+    case Instruction::Add:
+      return Expression::ADD;
+    case Instruction::Sub:
+      return Expression::SUB;
+    case Instruction::Mul:
+      return Expression::MUL;
+    case Instruction::UDiv:
+      return Expression::UDIV;
+    case Instruction::SDiv:
+      return Expression::SDIV;
+    case Instruction::FDiv:
+      return Expression::FDIV;
+    case Instruction::URem:
+      return Expression::UREM;
+    case Instruction::SRem:
+      return Expression::SREM;
+    case Instruction::FRem:
+      return Expression::FREM;
+    case Instruction::Shl:
+      return Expression::SHL;
+    case Instruction::LShr:
+      return Expression::LSHR;
+    case Instruction::AShr:
+      return Expression::ASHR;
+    case Instruction::And:
+      return Expression::AND;
+    case Instruction::Or:
+      return Expression::OR;
+    case Instruction::Xor:
+      return Expression::XOR;
+    
+    // THIS SHOULD NEVER HAPPEN
+    default:
+      assert(0 && "Binary operator with unknown opcode?");
+      return Expression::ADD;
+  }
+}
+
+Expression::ExpressionOpcode ValueTable::getOpcode(CmpInst* C) {
+  if (C->getOpcode() == Instruction::ICmp) {
+    switch (C->getPredicate()) {
+      case ICmpInst::ICMP_EQ:
+        return Expression::ICMPEQ;
+      case ICmpInst::ICMP_NE:
+        return Expression::ICMPNE;
+      case ICmpInst::ICMP_UGT:
+        return Expression::ICMPUGT;
+      case ICmpInst::ICMP_UGE:
+        return Expression::ICMPUGE;
+      case ICmpInst::ICMP_ULT:
+        return Expression::ICMPULT;
+      case ICmpInst::ICMP_ULE:
+        return Expression::ICMPULE;
+      case ICmpInst::ICMP_SGT:
+        return Expression::ICMPSGT;
+      case ICmpInst::ICMP_SGE:
+        return Expression::ICMPSGE;
+      case ICmpInst::ICMP_SLT:
+        return Expression::ICMPSLT;
+      case ICmpInst::ICMP_SLE:
+        return Expression::ICMPSLE;
+      
+      // THIS SHOULD NEVER HAPPEN
+      default:
+        assert(0 && "Comparison with unknown predicate?");
+        return Expression::ICMPEQ;
+    }
+  } else {
+    switch (C->getPredicate()) {
+      case FCmpInst::FCMP_OEQ:
+        return Expression::FCMPOEQ;
+      case FCmpInst::FCMP_OGT:
+        return Expression::FCMPOGT;
+      case FCmpInst::FCMP_OGE:
+        return Expression::FCMPOGE;
+      case FCmpInst::FCMP_OLT:
+        return Expression::FCMPOLT;
+      case FCmpInst::FCMP_OLE:
+        return Expression::FCMPOLE;
+      case FCmpInst::FCMP_ONE:
+        return Expression::FCMPONE;
+      case FCmpInst::FCMP_ORD:
+        return Expression::FCMPORD;
+      case FCmpInst::FCMP_UNO:
+        return Expression::FCMPUNO;
+      case FCmpInst::FCMP_UEQ:
+        return Expression::FCMPUEQ;
+      case FCmpInst::FCMP_UGT:
+        return Expression::FCMPUGT;
+      case FCmpInst::FCMP_UGE:
+        return Expression::FCMPUGE;
+      case FCmpInst::FCMP_ULT:
+        return Expression::FCMPULT;
+      case FCmpInst::FCMP_ULE:
+        return Expression::FCMPULE;
+      case FCmpInst::FCMP_UNE:
+        return Expression::FCMPUNE;
+      
+      // THIS SHOULD NEVER HAPPEN
+      default:
+        assert(0 && "Comparison with unknown predicate?");
+        return Expression::FCMPOEQ;
+    }
+  }
+}
+
+Expression::ExpressionOpcode 
+                             ValueTable::getOpcode(CastInst* C) {
+  switch(C->getOpcode()) {
+    case Instruction::Trunc:
+      return Expression::TRUNC;
+    case Instruction::ZExt:
+      return Expression::ZEXT;
+    case Instruction::SExt:
+      return Expression::SEXT;
+    case Instruction::FPToUI:
+      return Expression::FPTOUI;
+    case Instruction::FPToSI:
+      return Expression::FPTOSI;
+    case Instruction::UIToFP:
+      return Expression::UITOFP;
+    case Instruction::SIToFP:
+      return Expression::SITOFP;
+    case Instruction::FPTrunc:
+      return Expression::FPTRUNC;
+    case Instruction::FPExt:
+      return Expression::FPEXT;
+    case Instruction::PtrToInt:
+      return Expression::PTRTOINT;
+    case Instruction::IntToPtr:
+      return Expression::INTTOPTR;
+    case Instruction::BitCast:
+      return Expression::BITCAST;
+    
+    // THIS SHOULD NEVER HAPPEN
+    default:
+      assert(0 && "Cast operator with unknown opcode?");
+      return Expression::BITCAST;
+  }
+}
+
+Expression ValueTable::create_expression(BinaryOperator* BO) {
+  Expression e;
+    
+  e.firstVN = lookup_or_add(BO->getOperand(0));
+  e.secondVN = lookup_or_add(BO->getOperand(1));
+  e.thirdVN = 0;
+  e.type = BO->getType();
+  e.opcode = getOpcode(BO);
+  
+  return e;
+}
+
+Expression ValueTable::create_expression(CmpInst* C) {
+  Expression e;
+    
+  e.firstVN = lookup_or_add(C->getOperand(0));
+  e.secondVN = lookup_or_add(C->getOperand(1));
+  e.thirdVN = 0;
+  e.type = C->getType();
+  e.opcode = getOpcode(C);
+  
+  return e;
+}
+
+Expression ValueTable::create_expression(CastInst* C) {
+  Expression e;
+    
+  e.firstVN = lookup_or_add(C->getOperand(0));
+  e.secondVN = 0;
+  e.thirdVN = 0;
+  e.type = C->getType();
+  e.opcode = getOpcode(C);
+  
+  return e;
+}
+
+Expression ValueTable::create_expression(ShuffleVectorInst* S) {
+  Expression e;
+    
+  e.firstVN = lookup_or_add(S->getOperand(0));
+  e.secondVN = lookup_or_add(S->getOperand(1));
+  e.thirdVN = lookup_or_add(S->getOperand(2));
+  e.type = S->getType();
+  e.opcode = Expression::SHUFFLE;
+  
+  return e;
+}
+
+Expression ValueTable::create_expression(ExtractElementInst* E) {
+  Expression e;
+    
+  e.firstVN = lookup_or_add(E->getOperand(0));
+  e.secondVN = lookup_or_add(E->getOperand(1));
+  e.thirdVN = 0;
+  e.type = E->getType();
+  e.opcode = Expression::EXTRACT;
+  
+  return e;
+}
+
+Expression ValueTable::create_expression(InsertElementInst* I) {
+  Expression e;
+    
+  e.firstVN = lookup_or_add(I->getOperand(0));
+  e.secondVN = lookup_or_add(I->getOperand(1));
+  e.thirdVN = lookup_or_add(I->getOperand(2));
+  e.type = I->getType();
+  e.opcode = Expression::INSERT;
+  
+  return e;
+}
+
+Expression ValueTable::create_expression(SelectInst* I) {
+  Expression e;
+    
+  e.firstVN = lookup_or_add(I->getCondition());
+  e.secondVN = lookup_or_add(I->getTrueValue());
+  e.thirdVN = lookup_or_add(I->getFalseValue());
+  e.type = I->getType();
+  e.opcode = Expression::SELECT;
+  
+  return e;
+}
+
+Expression ValueTable::create_expression(GetElementPtrInst* G) {
+  Expression e;
+    
+  e.firstVN = lookup_or_add(G->getPointerOperand());
+  e.secondVN = 0;
+  e.thirdVN = 0;
+  e.type = G->getType();
+  e.opcode = Expression::GEP;
+  
+  for (GetElementPtrInst::op_iterator I = G->idx_begin(), E = G->idx_end();
+       I != E; ++I)
+    e.varargs.push_back(lookup_or_add(*I));
+  
+  return e;
+}
+
+//===----------------------------------------------------------------------===//
+//                     ValueTable External Functions
+//===----------------------------------------------------------------------===//
+
+/// lookup_or_add - Returns the value number for the specified value, assigning
+/// it a new number if it did not have one before.
+uint32_t ValueTable::lookup_or_add(Value* V) {
+  DenseMap<Value*, uint32_t>::iterator VI = valueNumbering.find(V);
+  if (VI != valueNumbering.end())
+    return VI->second;
+  
+  
+  if (BinaryOperator* BO = dyn_cast<BinaryOperator>(V)) {
+    Expression e = create_expression(BO);
+    
+    DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+    if (EI != expressionNumbering.end()) {
+      valueNumbering.insert(std::make_pair(V, EI->second));
+      return EI->second;
+    } else {
+      expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+      valueNumbering.insert(std::make_pair(V, nextValueNumber));
+      
+      return nextValueNumber++;
+    }
+  } else if (CmpInst* C = dyn_cast<CmpInst>(V)) {
+    Expression e = create_expression(C);
+    
+    DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+    if (EI != expressionNumbering.end()) {
+      valueNumbering.insert(std::make_pair(V, EI->second));
+      return EI->second;
+    } else {
+      expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+      valueNumbering.insert(std::make_pair(V, nextValueNumber));
+      
+      return nextValueNumber++;
+    }
+  } else if (ShuffleVectorInst* U = dyn_cast<ShuffleVectorInst>(V)) {
+    Expression e = create_expression(U);
+    
+    DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+    if (EI != expressionNumbering.end()) {
+      valueNumbering.insert(std::make_pair(V, EI->second));
+      return EI->second;
+    } else {
+      expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+      valueNumbering.insert(std::make_pair(V, nextValueNumber));
+      
+      return nextValueNumber++;
+    }
+  } else if (ExtractElementInst* U = dyn_cast<ExtractElementInst>(V)) {
+    Expression e = create_expression(U);
+    
+    DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+    if (EI != expressionNumbering.end()) {
+      valueNumbering.insert(std::make_pair(V, EI->second));
+      return EI->second;
+    } else {
+      expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+      valueNumbering.insert(std::make_pair(V, nextValueNumber));
+      
+      return nextValueNumber++;
+    }
+  } else if (InsertElementInst* U = dyn_cast<InsertElementInst>(V)) {
+    Expression e = create_expression(U);
+    
+    DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+    if (EI != expressionNumbering.end()) {
+      valueNumbering.insert(std::make_pair(V, EI->second));
+      return EI->second;
+    } else {
+      expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+      valueNumbering.insert(std::make_pair(V, nextValueNumber));
+      
+      return nextValueNumber++;
+    }
+  } else if (SelectInst* U = dyn_cast<SelectInst>(V)) {
+    Expression e = create_expression(U);
+    
+    DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+    if (EI != expressionNumbering.end()) {
+      valueNumbering.insert(std::make_pair(V, EI->second));
+      return EI->second;
+    } else {
+      expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+      valueNumbering.insert(std::make_pair(V, nextValueNumber));
+      
+      return nextValueNumber++;
+    }
+  } else if (CastInst* U = dyn_cast<CastInst>(V)) {
+    Expression e = create_expression(U);
+    
+    DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+    if (EI != expressionNumbering.end()) {
+      valueNumbering.insert(std::make_pair(V, EI->second));
+      return EI->second;
+    } else {
+      expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+      valueNumbering.insert(std::make_pair(V, nextValueNumber));
+      
+      return nextValueNumber++;
+    }
+  } else if (GetElementPtrInst* U = dyn_cast<GetElementPtrInst>(V)) {
+    Expression e = create_expression(U);
+    
+    DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+    if (EI != expressionNumbering.end()) {
+      valueNumbering.insert(std::make_pair(V, EI->second));
+      return EI->second;
+    } else {
+      expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+      valueNumbering.insert(std::make_pair(V, nextValueNumber));
+      
+      return nextValueNumber++;
+    }
+  } else {
+    valueNumbering.insert(std::make_pair(V, nextValueNumber));
+    return nextValueNumber++;
+  }
+}
+
+/// lookup - Returns the value number of the specified value. Fails if
+/// the value has not yet been numbered.
+uint32_t ValueTable::lookup(Value* V) const {
+  DenseMap<Value*, uint32_t>::iterator VI = valueNumbering.find(V);
+  if (VI != valueNumbering.end())
+    return VI->second;
+  else
+    assert(0 && "Value not numbered?");
+  
+  return 0;
+}
+
+/// add - Add the specified value with the given value number, removing
+/// its old number, if any
+void ValueTable::add(Value* V, uint32_t num) {
+  DenseMap<Value*, uint32_t>::iterator VI = valueNumbering.find(V);
+  if (VI != valueNumbering.end())
+    valueNumbering.erase(VI);
+  valueNumbering.insert(std::make_pair(V, num));
+}
+
+/// clear - Remove all entries from the ValueTable
+void ValueTable::clear() {
+  valueNumbering.clear();
+  expressionNumbering.clear();
+  nextValueNumber = 1;
+}
+
+/// erase - Remove a value from the value numbering
+void ValueTable::erase(Value* V) {
+  valueNumbering.erase(V);
+}
+
+/// size - Return the number of assigned value numbers
+unsigned ValueTable::size() {
+  // NOTE: zero is never assigned
+  return nextValueNumber;
+}
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+//                       ValueNumberedSet Class
+//===----------------------------------------------------------------------===//
+
+class ValueNumberedSet {
+  private:
+    SmallPtrSet<Value*, 8> contents;
+    BitVector numbers;
+  public:
+    ValueNumberedSet() { numbers.resize(1); }
+    ValueNumberedSet(const ValueNumberedSet& other) {
+      numbers = other.numbers;
+      contents = other.contents;
+    }
+    
+    typedef SmallPtrSet<Value*, 8>::iterator iterator;
+    
+    iterator begin() { return contents.begin(); }
+    iterator end() { return contents.end(); }
+    
+    bool insert(Value* v) { return contents.insert(v); }
+    void insert(iterator I, iterator E) { contents.insert(I, E); }
+    void erase(Value* v) { contents.erase(v); }
+    unsigned count(Value* v) { return contents.count(v); }
+    size_t size() { return contents.size(); }
+    
+    void set(unsigned i)  {
+      if (i >= numbers.size())
+        numbers.resize(i+1);
+      
+      numbers.set(i);
+    }
+    
+    void operator=(const ValueNumberedSet& other) {
+      contents = other.contents;
+      numbers = other.numbers;
+    }
+    
+    void reset(unsigned i)  {
+      if (i < numbers.size())
+        numbers.reset(i);
+    }
+    
+    bool test(unsigned i)  {
+      if (i >= numbers.size())
+        return false;
+      
+      return numbers.test(i);
+    }
+    
+    void clear() {
+      contents.clear();
+      numbers.clear();
+    }
+};
+
+}
+
+//===----------------------------------------------------------------------===//
+//                         GVNPRE Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+  class VISIBILITY_HIDDEN GVNPRE : public FunctionPass {
+    bool runOnFunction(Function &F);
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    GVNPRE() : FunctionPass(&ID) {}
+
+  private:
+    ValueTable VN;
+    SmallVector<Instruction*, 8> createdExpressions;
+    
+    DenseMap<BasicBlock*, ValueNumberedSet> availableOut;
+    DenseMap<BasicBlock*, ValueNumberedSet> anticipatedIn;
+    DenseMap<BasicBlock*, ValueNumberedSet> generatedPhis;
+    
+    // This transformation requires dominator postdominator info
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequiredID(BreakCriticalEdgesID);
+      AU.addRequired<UnifyFunctionExitNodes>();
+      AU.addRequired<DominatorTree>();
+    }
+  
+    // Helper fuctions
+    // FIXME: eliminate or document these better
+    void dump(ValueNumberedSet& s) const ;
+    void clean(ValueNumberedSet& set) ;
+    Value* find_leader(ValueNumberedSet& vals, uint32_t v) ;
+    Value* phi_translate(Value* V, BasicBlock* pred, BasicBlock* succ) ;
+    void phi_translate_set(ValueNumberedSet& anticIn, BasicBlock* pred,
+                           BasicBlock* succ, ValueNumberedSet& out) ;
+    
+    void topo_sort(ValueNumberedSet& set,
+                   SmallVector<Value*, 8>& vec) ;
+    
+    void cleanup() ;
+    bool elimination() ;
+    
+    void val_insert(ValueNumberedSet& s, Value* v) ;
+    void val_replace(ValueNumberedSet& s, Value* v) ;
+    bool dependsOnInvoke(Value* V) ;
+    void buildsets_availout(BasicBlock::iterator I,
+                            ValueNumberedSet& currAvail,
+                            ValueNumberedSet& currPhis,
+                            ValueNumberedSet& currExps,
+                            SmallPtrSet<Value*, 16>& currTemps);
+    bool buildsets_anticout(BasicBlock* BB,
+                            ValueNumberedSet& anticOut,
+                            SmallPtrSet<BasicBlock*, 8>& visited);
+    unsigned buildsets_anticin(BasicBlock* BB,
+                           ValueNumberedSet& anticOut,
+                           ValueNumberedSet& currExps,
+                           SmallPtrSet<Value*, 16>& currTemps,
+                           SmallPtrSet<BasicBlock*, 8>& visited);
+    void buildsets(Function& F) ;
+    
+    void insertion_pre(Value* e, BasicBlock* BB,
+                       DenseMap<BasicBlock*, Value*>& avail,
+                       std::map<BasicBlock*,ValueNumberedSet>& new_set);
+    unsigned insertion_mergepoint(SmallVector<Value*, 8>& workList,
+                                  df_iterator<DomTreeNode*>& D,
+                      std::map<BasicBlock*, ValueNumberedSet>& new_set);
+    bool insertion(Function& F) ;
+  
+  };
+  
+  char GVNPRE::ID = 0;
+  
+}
+
+// createGVNPREPass - The public interface to this file...
+FunctionPass *llvm::createGVNPREPass() { return new GVNPRE(); }
+
+static RegisterPass<GVNPRE> X("gvnpre",
+                      "Global Value Numbering/Partial Redundancy Elimination");
+
+
+STATISTIC(NumInsertedVals, "Number of values inserted");
+STATISTIC(NumInsertedPhis, "Number of PHI nodes inserted");
+STATISTIC(NumEliminated, "Number of redundant instructions eliminated");
+
+/// find_leader - Given a set and a value number, return the first
+/// element of the set with that value number, or 0 if no such element
+/// is present
+Value* GVNPRE::find_leader(ValueNumberedSet& vals, uint32_t v) {
+  if (!vals.test(v))
+    return 0;
+  
+  for (ValueNumberedSet::iterator I = vals.begin(), E = vals.end();
+       I != E; ++I)
+    if (v == VN.lookup(*I))
+      return *I;
+  
+  assert(0 && "No leader found, but present bit is set?");
+  return 0;
+}
+
+/// val_insert - Insert a value into a set only if there is not a value
+/// with the same value number already in the set
+void GVNPRE::val_insert(ValueNumberedSet& s, Value* v) {
+  uint32_t num = VN.lookup(v);
+  if (!s.test(num))
+    s.insert(v);
+}
+
+/// val_replace - Insert a value into a set, replacing any values already in
+/// the set that have the same value number
+void GVNPRE::val_replace(ValueNumberedSet& s, Value* v) {
+  if (s.count(v)) return;
+  
+  uint32_t num = VN.lookup(v);
+  Value* leader = find_leader(s, num);
+  if (leader != 0)
+    s.erase(leader);
+  s.insert(v);
+  s.set(num);
+}
+
+/// phi_translate - Given a value, its parent block, and a predecessor of its
+/// parent, translate the value into legal for the predecessor block.  This 
+/// means translating its operands (and recursively, their operands) through
+/// any phi nodes in the parent into values available in the predecessor
+Value* GVNPRE::phi_translate(Value* V, BasicBlock* pred, BasicBlock* succ) {
+  if (V == 0)
+    return 0;
+  
+  // Unary Operations
+  if (CastInst* U = dyn_cast<CastInst>(V)) {
+    Value* newOp1 = 0;
+    if (isa<Instruction>(U->getOperand(0)))
+      newOp1 = phi_translate(U->getOperand(0), pred, succ);
+    else
+      newOp1 = U->getOperand(0);
+    
+    if (newOp1 == 0)
+      return 0;
+    
+    if (newOp1 != U->getOperand(0)) {
+      Instruction* newVal = 0;
+      if (CastInst* C = dyn_cast<CastInst>(U))
+        newVal = CastInst::Create(C->getOpcode(),
+                                  newOp1, C->getType(),
+                                  C->getName()+".expr");
+      
+      uint32_t v = VN.lookup_or_add(newVal);
+      
+      Value* leader = find_leader(availableOut[pred], v);
+      if (leader == 0) {
+        createdExpressions.push_back(newVal);
+        return newVal;
+      } else {
+        VN.erase(newVal);
+        delete newVal;
+        return leader;
+      }
+    }
+  
+  // Binary Operations
+  } if (isa<BinaryOperator>(V) || isa<CmpInst>(V) || 
+      isa<ExtractElementInst>(V)) {
+    User* U = cast<User>(V);
+    
+    Value* newOp1 = 0;
+    if (isa<Instruction>(U->getOperand(0)))
+      newOp1 = phi_translate(U->getOperand(0), pred, succ);
+    else
+      newOp1 = U->getOperand(0);
+    
+    if (newOp1 == 0)
+      return 0;
+    
+    Value* newOp2 = 0;
+    if (isa<Instruction>(U->getOperand(1)))
+      newOp2 = phi_translate(U->getOperand(1), pred, succ);
+    else
+      newOp2 = U->getOperand(1);
+    
+    if (newOp2 == 0)
+      return 0;
+    
+    if (newOp1 != U->getOperand(0) || newOp2 != U->getOperand(1)) {
+      Instruction* newVal = 0;
+      if (BinaryOperator* BO = dyn_cast<BinaryOperator>(U))
+        newVal = BinaryOperator::Create(BO->getOpcode(),
+                                        newOp1, newOp2,
+                                        BO->getName()+".expr");
+      else if (CmpInst* C = dyn_cast<CmpInst>(U))
+        newVal = CmpInst::Create(C->getOpcode(),
+                                 C->getPredicate(),
+                                 newOp1, newOp2,
+                                 C->getName()+".expr");
+      else if (ExtractElementInst* E = dyn_cast<ExtractElementInst>(U))
+        newVal = new ExtractElementInst(newOp1, newOp2, E->getName()+".expr");
+      
+      uint32_t v = VN.lookup_or_add(newVal);
+      
+      Value* leader = find_leader(availableOut[pred], v);
+      if (leader == 0) {
+        createdExpressions.push_back(newVal);
+        return newVal;
+      } else {
+        VN.erase(newVal);
+        delete newVal;
+        return leader;
+      }
+    }
+  
+  // Ternary Operations
+  } else if (isa<ShuffleVectorInst>(V) || isa<InsertElementInst>(V) ||
+             isa<SelectInst>(V)) {
+    User* U = cast<User>(V);
+    
+    Value* newOp1 = 0;
+    if (isa<Instruction>(U->getOperand(0)))
+      newOp1 = phi_translate(U->getOperand(0), pred, succ);
+    else
+      newOp1 = U->getOperand(0);
+    
+    if (newOp1 == 0)
+      return 0;
+    
+    Value* newOp2 = 0;
+    if (isa<Instruction>(U->getOperand(1)))
+      newOp2 = phi_translate(U->getOperand(1), pred, succ);
+    else
+      newOp2 = U->getOperand(1);
+    
+    if (newOp2 == 0)
+      return 0;
+    
+    Value* newOp3 = 0;
+    if (isa<Instruction>(U->getOperand(2)))
+      newOp3 = phi_translate(U->getOperand(2), pred, succ);
+    else
+      newOp3 = U->getOperand(2);
+    
+    if (newOp3 == 0)
+      return 0;
+    
+    if (newOp1 != U->getOperand(0) ||
+        newOp2 != U->getOperand(1) ||
+        newOp3 != U->getOperand(2)) {
+      Instruction* newVal = 0;
+      if (ShuffleVectorInst* S = dyn_cast<ShuffleVectorInst>(U))
+        newVal = new ShuffleVectorInst(newOp1, newOp2, newOp3,
+                                       S->getName() + ".expr");
+      else if (InsertElementInst* I = dyn_cast<InsertElementInst>(U))
+        newVal = InsertElementInst::Create(newOp1, newOp2, newOp3,
+                                           I->getName() + ".expr");
+      else if (SelectInst* I = dyn_cast<SelectInst>(U))
+        newVal = SelectInst::Create(newOp1, newOp2, newOp3,
+                                    I->getName() + ".expr");
+      
+      uint32_t v = VN.lookup_or_add(newVal);
+      
+      Value* leader = find_leader(availableOut[pred], v);
+      if (leader == 0) {
+        createdExpressions.push_back(newVal);
+        return newVal;
+      } else {
+        VN.erase(newVal);
+        delete newVal;
+        return leader;
+      }
+    }
+  
+  // Varargs operators
+  } else if (GetElementPtrInst* U = dyn_cast<GetElementPtrInst>(V)) {
+    Value* newOp1 = 0;
+    if (isa<Instruction>(U->getPointerOperand()))
+      newOp1 = phi_translate(U->getPointerOperand(), pred, succ);
+    else
+      newOp1 = U->getPointerOperand();
+    
+    if (newOp1 == 0)
+      return 0;
+    
+    bool changed_idx = false;
+    SmallVector<Value*, 4> newIdx;
+    for (GetElementPtrInst::op_iterator I = U->idx_begin(), E = U->idx_end();
+         I != E; ++I)
+      if (isa<Instruction>(*I)) {
+        Value* newVal = phi_translate(*I, pred, succ);
+        newIdx.push_back(newVal);
+        if (newVal != *I)
+          changed_idx = true;
+      } else {
+        newIdx.push_back(*I);
+      }
+    
+    if (newOp1 != U->getPointerOperand() || changed_idx) {
+      Instruction* newVal =
+          GetElementPtrInst::Create(newOp1,
+                                    newIdx.begin(), newIdx.end(),
+                                    U->getName()+".expr");
+      
+      uint32_t v = VN.lookup_or_add(newVal);
+      
+      Value* leader = find_leader(availableOut[pred], v);
+      if (leader == 0) {
+        createdExpressions.push_back(newVal);
+        return newVal;
+      } else {
+        VN.erase(newVal);
+        delete newVal;
+        return leader;
+      }
+    }
+  
+  // PHI Nodes
+  } else if (PHINode* P = dyn_cast<PHINode>(V)) {
+    if (P->getParent() == succ)
+      return P->getIncomingValueForBlock(pred);
+  }
+  
+  return V;
+}
+
+/// phi_translate_set - Perform phi translation on every element of a set
+void GVNPRE::phi_translate_set(ValueNumberedSet& anticIn,
+                              BasicBlock* pred, BasicBlock* succ,
+                              ValueNumberedSet& out) {
+  for (ValueNumberedSet::iterator I = anticIn.begin(),
+       E = anticIn.end(); I != E; ++I) {
+    Value* V = phi_translate(*I, pred, succ);
+    if (V != 0 && !out.test(VN.lookup_or_add(V))) {
+      out.insert(V);
+      out.set(VN.lookup(V));
+    }
+  }
+}
+
+/// dependsOnInvoke - Test if a value has an phi node as an operand, any of 
+/// whose inputs is an invoke instruction.  If this is true, we cannot safely
+/// PRE the instruction or anything that depends on it.
+bool GVNPRE::dependsOnInvoke(Value* V) {
+  if (PHINode* p = dyn_cast<PHINode>(V)) {
+    for (PHINode::op_iterator I = p->op_begin(), E = p->op_end(); I != E; ++I)
+      if (isa<InvokeInst>(*I))
+        return true;
+    return false;
+  } else {
+    return false;
+  }
+}
+
+/// clean - Remove all non-opaque values from the set whose operands are not
+/// themselves in the set, as well as all values that depend on invokes (see 
+/// above)
+void GVNPRE::clean(ValueNumberedSet& set) {
+  SmallVector<Value*, 8> worklist;
+  worklist.reserve(set.size());
+  topo_sort(set, worklist);
+  
+  for (unsigned i = 0; i < worklist.size(); ++i) {
+    Value* v = worklist[i];
+    
+    // Handle unary ops
+    if (CastInst* U = dyn_cast<CastInst>(v)) {
+      bool lhsValid = !isa<Instruction>(U->getOperand(0));
+      lhsValid |= set.test(VN.lookup(U->getOperand(0)));
+      if (lhsValid)
+        lhsValid = !dependsOnInvoke(U->getOperand(0));
+      
+      if (!lhsValid) {
+        set.erase(U);
+        set.reset(VN.lookup(U));
+      }
+    
+    // Handle binary ops
+    } else if (isa<BinaryOperator>(v) || isa<CmpInst>(v) ||
+        isa<ExtractElementInst>(v)) {
+      User* U = cast<User>(v);
+      
+      bool lhsValid = !isa<Instruction>(U->getOperand(0));
+      lhsValid |= set.test(VN.lookup(U->getOperand(0)));
+      if (lhsValid)
+        lhsValid = !dependsOnInvoke(U->getOperand(0));
+    
+      bool rhsValid = !isa<Instruction>(U->getOperand(1));
+      rhsValid |= set.test(VN.lookup(U->getOperand(1)));
+      if (rhsValid)
+        rhsValid = !dependsOnInvoke(U->getOperand(1));
+      
+      if (!lhsValid || !rhsValid) {
+        set.erase(U);
+        set.reset(VN.lookup(U));
+      }
+    
+    // Handle ternary ops
+    } else if (isa<ShuffleVectorInst>(v) || isa<InsertElementInst>(v) ||
+               isa<SelectInst>(v)) {
+      User* U = cast<User>(v);
+    
+      bool lhsValid = !isa<Instruction>(U->getOperand(0));
+      lhsValid |= set.test(VN.lookup(U->getOperand(0)));
+      if (lhsValid)
+        lhsValid = !dependsOnInvoke(U->getOperand(0));
+      
+      bool rhsValid = !isa<Instruction>(U->getOperand(1));
+      rhsValid |= set.test(VN.lookup(U->getOperand(1)));
+      if (rhsValid)
+        rhsValid = !dependsOnInvoke(U->getOperand(1));
+      
+      bool thirdValid = !isa<Instruction>(U->getOperand(2));
+      thirdValid |= set.test(VN.lookup(U->getOperand(2)));
+      if (thirdValid)
+        thirdValid = !dependsOnInvoke(U->getOperand(2));
+    
+      if (!lhsValid || !rhsValid || !thirdValid) {
+        set.erase(U);
+        set.reset(VN.lookup(U));
+      }
+    
+    // Handle varargs ops
+    } else if (GetElementPtrInst* U = dyn_cast<GetElementPtrInst>(v)) {
+      bool ptrValid = !isa<Instruction>(U->getPointerOperand());
+      ptrValid |= set.test(VN.lookup(U->getPointerOperand()));
+      if (ptrValid)
+        ptrValid = !dependsOnInvoke(U->getPointerOperand());
+      
+      bool varValid = true;
+      for (GetElementPtrInst::op_iterator I = U->idx_begin(), E = U->idx_end();
+           I != E; ++I)
+        if (varValid) {
+          varValid &= !isa<Instruction>(*I) || set.test(VN.lookup(*I));
+          varValid &= !dependsOnInvoke(*I);
+        }
+    
+      if (!ptrValid || !varValid) {
+        set.erase(U);
+        set.reset(VN.lookup(U));
+      }
+    }
+  }
+}
+
+/// topo_sort - Given a set of values, sort them by topological
+/// order into the provided vector.
+void GVNPRE::topo_sort(ValueNumberedSet& set, SmallVector<Value*, 8>& vec) {
+  SmallPtrSet<Value*, 16> visited;
+  SmallVector<Value*, 8> stack;
+  for (ValueNumberedSet::iterator I = set.begin(), E = set.end();
+       I != E; ++I) {
+    if (visited.count(*I) == 0)
+      stack.push_back(*I);
+    
+    while (!stack.empty()) {
+      Value* e = stack.back();
+      
+      // Handle unary ops
+      if (CastInst* U = dyn_cast<CastInst>(e)) {
+        Value* l = find_leader(set, VN.lookup(U->getOperand(0)));
+    
+        if (l != 0 && isa<Instruction>(l) &&
+            visited.count(l) == 0)
+          stack.push_back(l);
+        else {
+          vec.push_back(e);
+          visited.insert(e);
+          stack.pop_back();
+        }
+      
+      // Handle binary ops
+      } else if (isa<BinaryOperator>(e) || isa<CmpInst>(e) ||
+          isa<ExtractElementInst>(e)) {
+        User* U = cast<User>(e);
+        Value* l = find_leader(set, VN.lookup(U->getOperand(0)));
+        Value* r = find_leader(set, VN.lookup(U->getOperand(1)));
+    
+        if (l != 0 && isa<Instruction>(l) &&
+            visited.count(l) == 0)
+          stack.push_back(l);
+        else if (r != 0 && isa<Instruction>(r) &&
+                 visited.count(r) == 0)
+          stack.push_back(r);
+        else {
+          vec.push_back(e);
+          visited.insert(e);
+          stack.pop_back();
+        }
+      
+      // Handle ternary ops
+      } else if (isa<InsertElementInst>(e) || isa<ShuffleVectorInst>(e) ||
+                 isa<SelectInst>(e)) {
+        User* U = cast<User>(e);
+        Value* l = find_leader(set, VN.lookup(U->getOperand(0)));
+        Value* r = find_leader(set, VN.lookup(U->getOperand(1)));
+        Value* m = find_leader(set, VN.lookup(U->getOperand(2)));
+      
+        if (l != 0 && isa<Instruction>(l) &&
+            visited.count(l) == 0)
+          stack.push_back(l);
+        else if (r != 0 && isa<Instruction>(r) &&
+                 visited.count(r) == 0)
+          stack.push_back(r);
+        else if (m != 0 && isa<Instruction>(m) &&
+                 visited.count(m) == 0)
+          stack.push_back(m);
+        else {
+          vec.push_back(e);
+          visited.insert(e);
+          stack.pop_back();
+        }
+      
+      // Handle vararg ops
+      } else if (GetElementPtrInst* U = dyn_cast<GetElementPtrInst>(e)) {
+        Value* p = find_leader(set, VN.lookup(U->getPointerOperand()));
+        
+        if (p != 0 && isa<Instruction>(p) &&
+            visited.count(p) == 0)
+          stack.push_back(p);
+        else {
+          bool push_va = false;
+          for (GetElementPtrInst::op_iterator I = U->idx_begin(),
+               E = U->idx_end(); I != E; ++I) {
+            Value * v = find_leader(set, VN.lookup(*I));
+            if (v != 0 && isa<Instruction>(v) && visited.count(v) == 0) {
+              stack.push_back(v);
+              push_va = true;
+            }
+          }
+          
+          if (!push_va) {
+            vec.push_back(e);
+            visited.insert(e);
+            stack.pop_back();
+          }
+        }
+      
+      // Handle opaque ops
+      } else {
+        visited.insert(e);
+        vec.push_back(e);
+        stack.pop_back();
+      }
+    }
+    
+    stack.clear();
+  }
+}
+
+/// dump - Dump a set of values to standard error
+void GVNPRE::dump(ValueNumberedSet& s) const {
+  DOUT << "{ ";
+  for (ValueNumberedSet::iterator I = s.begin(), E = s.end();
+       I != E; ++I) {
+    DOUT << "" << VN.lookup(*I) << ": ";
+    DEBUG((*I)->dump());
+  }
+  DOUT << "}\n\n";
+}
+
+/// elimination - Phase 3 of the main algorithm.  Perform full redundancy 
+/// elimination by walking the dominator tree and removing any instruction that 
+/// is dominated by another instruction with the same value number.
+bool GVNPRE::elimination() {
+  bool changed_function = false;
+  
+  SmallVector<std::pair<Instruction*, Value*>, 8> replace;
+  SmallVector<Instruction*, 8> erase;
+  
+  DominatorTree& DT = getAnalysis<DominatorTree>();
+  
+  for (df_iterator<DomTreeNode*> DI = df_begin(DT.getRootNode()),
+         E = df_end(DT.getRootNode()); DI != E; ++DI) {
+    BasicBlock* BB = DI->getBlock();
+    
+    for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
+         BI != BE; ++BI) {
+
+      if (isa<BinaryOperator>(BI) || isa<CmpInst>(BI) ||
+          isa<ShuffleVectorInst>(BI) || isa<InsertElementInst>(BI) ||
+          isa<ExtractElementInst>(BI) || isa<SelectInst>(BI) ||
+          isa<CastInst>(BI) || isa<GetElementPtrInst>(BI)) {
+        
+        if (availableOut[BB].test(VN.lookup(BI)) &&
+            !availableOut[BB].count(BI)) {
+          Value *leader = find_leader(availableOut[BB], VN.lookup(BI));
+          if (Instruction* Instr = dyn_cast<Instruction>(leader))
+            if (Instr->getParent() != 0 && Instr != BI) {
+              replace.push_back(std::make_pair(BI, leader));
+              erase.push_back(BI);
+              ++NumEliminated;
+            }
+        }
+      }
+    }
+  }
+  
+  while (!replace.empty()) {
+    std::pair<Instruction*, Value*> rep = replace.back();
+    replace.pop_back();
+    rep.first->replaceAllUsesWith(rep.second);
+    changed_function = true;
+  }
+    
+  for (SmallVector<Instruction*, 8>::iterator I = erase.begin(),
+       E = erase.end(); I != E; ++I)
+     (*I)->eraseFromParent();
+  
+  return changed_function;
+}
+
+/// cleanup - Delete any extraneous values that were created to represent
+/// expressions without leaders.
+void GVNPRE::cleanup() {
+  while (!createdExpressions.empty()) {
+    Instruction* I = createdExpressions.back();
+    createdExpressions.pop_back();
+    
+    delete I;
+  }
+}
+
+/// buildsets_availout - When calculating availability, handle an instruction
+/// by inserting it into the appropriate sets
+void GVNPRE::buildsets_availout(BasicBlock::iterator I,
+                                ValueNumberedSet& currAvail,
+                                ValueNumberedSet& currPhis,
+                                ValueNumberedSet& currExps,
+                                SmallPtrSet<Value*, 16>& currTemps) {
+  // Handle PHI nodes
+  if (PHINode* p = dyn_cast<PHINode>(I)) {
+    unsigned num = VN.lookup_or_add(p);
+    
+    currPhis.insert(p);
+    currPhis.set(num);
+  
+  // Handle unary ops
+  } else if (CastInst* U = dyn_cast<CastInst>(I)) {
+    Value* leftValue = U->getOperand(0);
+    
+    unsigned num = VN.lookup_or_add(U);
+      
+    if (isa<Instruction>(leftValue))
+      if (!currExps.test(VN.lookup(leftValue))) {
+        currExps.insert(leftValue);
+        currExps.set(VN.lookup(leftValue));
+      }
+    
+    if (!currExps.test(num)) {
+      currExps.insert(U);
+      currExps.set(num);
+    }
+  
+  // Handle binary ops
+  } else if (isa<BinaryOperator>(I) || isa<CmpInst>(I) ||
+             isa<ExtractElementInst>(I)) {
+    User* U = cast<User>(I);
+    Value* leftValue = U->getOperand(0);
+    Value* rightValue = U->getOperand(1);
+    
+    unsigned num = VN.lookup_or_add(U);
+      
+    if (isa<Instruction>(leftValue))
+      if (!currExps.test(VN.lookup(leftValue))) {
+        currExps.insert(leftValue);
+        currExps.set(VN.lookup(leftValue));
+      }
+    
+    if (isa<Instruction>(rightValue))
+      if (!currExps.test(VN.lookup(rightValue))) {
+        currExps.insert(rightValue);
+        currExps.set(VN.lookup(rightValue));
+      }
+    
+    if (!currExps.test(num)) {
+      currExps.insert(U);
+      currExps.set(num);
+    }
+    
+  // Handle ternary ops
+  } else if (isa<InsertElementInst>(I) || isa<ShuffleVectorInst>(I) ||
+             isa<SelectInst>(I)) {
+    User* U = cast<User>(I);
+    Value* leftValue = U->getOperand(0);
+    Value* rightValue = U->getOperand(1);
+    Value* thirdValue = U->getOperand(2);
+      
+    VN.lookup_or_add(U);
+    
+    unsigned num = VN.lookup_or_add(U);
+    
+    if (isa<Instruction>(leftValue))
+      if (!currExps.test(VN.lookup(leftValue))) {
+        currExps.insert(leftValue);
+        currExps.set(VN.lookup(leftValue));
+      }
+    if (isa<Instruction>(rightValue))
+      if (!currExps.test(VN.lookup(rightValue))) {
+        currExps.insert(rightValue);
+        currExps.set(VN.lookup(rightValue));
+      }
+    if (isa<Instruction>(thirdValue))
+      if (!currExps.test(VN.lookup(thirdValue))) {
+        currExps.insert(thirdValue);
+        currExps.set(VN.lookup(thirdValue));
+      }
+    
+    if (!currExps.test(num)) {
+      currExps.insert(U);
+      currExps.set(num);
+    }
+    
+  // Handle vararg ops
+  } else if (GetElementPtrInst* U = dyn_cast<GetElementPtrInst>(I)) {
+    Value* ptrValue = U->getPointerOperand();
+      
+    VN.lookup_or_add(U);
+    
+    unsigned num = VN.lookup_or_add(U);
+    
+    if (isa<Instruction>(ptrValue))
+      if (!currExps.test(VN.lookup(ptrValue))) {
+        currExps.insert(ptrValue);
+        currExps.set(VN.lookup(ptrValue));
+      }
+    
+    for (GetElementPtrInst::op_iterator OI = U->idx_begin(), OE = U->idx_end();
+         OI != OE; ++OI)
+      if (isa<Instruction>(*OI) && !currExps.test(VN.lookup(*OI))) {
+        currExps.insert(*OI);
+        currExps.set(VN.lookup(*OI));
+      }
+    
+    if (!currExps.test(VN.lookup(U))) {
+      currExps.insert(U);
+      currExps.set(num);
+    }
+    
+  // Handle opaque ops
+  } else if (!I->isTerminator()){
+    VN.lookup_or_add(I);
+    
+    currTemps.insert(I);
+  }
+    
+  if (!I->isTerminator())
+    if (!currAvail.test(VN.lookup(I))) {
+      currAvail.insert(I);
+      currAvail.set(VN.lookup(I));
+    }
+}
+
+/// buildsets_anticout - When walking the postdom tree, calculate the ANTIC_OUT
+/// set as a function of the ANTIC_IN set of the block's predecessors
+bool GVNPRE::buildsets_anticout(BasicBlock* BB,
+                                ValueNumberedSet& anticOut,
+                                SmallPtrSet<BasicBlock*, 8>& visited) {
+  if (BB->getTerminator()->getNumSuccessors() == 1) {
+    if (BB->getTerminator()->getSuccessor(0) != BB &&
+        visited.count(BB->getTerminator()->getSuccessor(0)) == 0) {
+      return true;
+    }
+    else {
+      phi_translate_set(anticipatedIn[BB->getTerminator()->getSuccessor(0)],
+                        BB,  BB->getTerminator()->getSuccessor(0), anticOut);
+    }
+  } else if (BB->getTerminator()->getNumSuccessors() > 1) {
+    BasicBlock* first = BB->getTerminator()->getSuccessor(0);
+    for (ValueNumberedSet::iterator I = anticipatedIn[first].begin(),
+         E = anticipatedIn[first].end(); I != E; ++I) {
+      anticOut.insert(*I);
+      anticOut.set(VN.lookup(*I));
+    }
+    
+    for (unsigned i = 1; i < BB->getTerminator()->getNumSuccessors(); ++i) {
+      BasicBlock* currSucc = BB->getTerminator()->getSuccessor(i);
+      ValueNumberedSet& succAnticIn = anticipatedIn[currSucc];
+      
+      SmallVector<Value*, 16> temp;
+      
+      for (ValueNumberedSet::iterator I = anticOut.begin(),
+           E = anticOut.end(); I != E; ++I)
+        if (!succAnticIn.test(VN.lookup(*I)))
+          temp.push_back(*I);
+
+      for (SmallVector<Value*, 16>::iterator I = temp.begin(), E = temp.end();
+           I != E; ++I) {
+        anticOut.erase(*I);
+        anticOut.reset(VN.lookup(*I));
+      }
+    }
+  }
+  
+  return false;
+}
+
+/// buildsets_anticin - Walk the postdom tree, calculating ANTIC_OUT for
+/// each block.  ANTIC_IN is then a function of ANTIC_OUT and the GEN
+/// sets populated in buildsets_availout
+unsigned GVNPRE::buildsets_anticin(BasicBlock* BB,
+                               ValueNumberedSet& anticOut,
+                               ValueNumberedSet& currExps,
+                               SmallPtrSet<Value*, 16>& currTemps,
+                               SmallPtrSet<BasicBlock*, 8>& visited) {
+  ValueNumberedSet& anticIn = anticipatedIn[BB];
+  unsigned old = anticIn.size();
+      
+  bool defer = buildsets_anticout(BB, anticOut, visited);
+  if (defer)
+    return 0;
+  
+  anticIn.clear();
+  
+  for (ValueNumberedSet::iterator I = anticOut.begin(),
+       E = anticOut.end(); I != E; ++I) {
+    anticIn.insert(*I);
+    anticIn.set(VN.lookup(*I));
+  }
+  for (ValueNumberedSet::iterator I = currExps.begin(),
+       E = currExps.end(); I != E; ++I) {
+    if (!anticIn.test(VN.lookup(*I))) {
+      anticIn.insert(*I);
+      anticIn.set(VN.lookup(*I));
+    }
+  } 
+  
+  for (SmallPtrSet<Value*, 16>::iterator I = currTemps.begin(),
+       E = currTemps.end(); I != E; ++I) {
+    anticIn.erase(*I);
+    anticIn.reset(VN.lookup(*I));
+  }
+  
+  clean(anticIn);
+  anticOut.clear();
+  
+  if (old != anticIn.size())
+    return 2;
+  else
+    return 1;
+}
+
+/// buildsets - Phase 1 of the main algorithm.  Construct the AVAIL_OUT
+/// and the ANTIC_IN sets.
+void GVNPRE::buildsets(Function& F) {
+  DenseMap<BasicBlock*, ValueNumberedSet> generatedExpressions;
+  DenseMap<BasicBlock*, SmallPtrSet<Value*, 16> > generatedTemporaries;
+
+  DominatorTree &DT = getAnalysis<DominatorTree>();   
+  
+  // Phase 1, Part 1: calculate AVAIL_OUT
+  
+  // Top-down walk of the dominator tree
+  for (df_iterator<DomTreeNode*> DI = df_begin(DT.getRootNode()),
+         E = df_end(DT.getRootNode()); DI != E; ++DI) {
+    
+    // Get the sets to update for this block
+    ValueNumberedSet& currExps = generatedExpressions[DI->getBlock()];
+    ValueNumberedSet& currPhis = generatedPhis[DI->getBlock()];
+    SmallPtrSet<Value*, 16>& currTemps = generatedTemporaries[DI->getBlock()];
+    ValueNumberedSet& currAvail = availableOut[DI->getBlock()];     
+    
+    BasicBlock* BB = DI->getBlock();
+  
+    // A block inherits AVAIL_OUT from its dominator
+    if (DI->getIDom() != 0)
+      currAvail = availableOut[DI->getIDom()->getBlock()];
+
+    for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
+         BI != BE; ++BI)
+      buildsets_availout(BI, currAvail, currPhis, currExps,
+                         currTemps);
+      
+  }
+
+  // Phase 1, Part 2: calculate ANTIC_IN
+  
+  SmallPtrSet<BasicBlock*, 8> visited;
+  SmallPtrSet<BasicBlock*, 4> block_changed;
+  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI)
+    block_changed.insert(FI);
+  
+  bool changed = true;
+  unsigned iterations = 0;
+  
+  while (changed) {
+    changed = false;
+    ValueNumberedSet anticOut;
+    
+    // Postorder walk of the CFG
+    for (po_iterator<BasicBlock*> BBI = po_begin(&F.getEntryBlock()),
+         BBE = po_end(&F.getEntryBlock()); BBI != BBE; ++BBI) {
+      BasicBlock* BB = *BBI;
+      
+      if (block_changed.count(BB) != 0) {
+        unsigned ret = buildsets_anticin(BB, anticOut,generatedExpressions[BB],
+                                         generatedTemporaries[BB], visited);
+      
+        if (ret == 0) {
+          changed = true;
+          continue;
+        } else {
+          visited.insert(BB);
+        
+          if (ret == 2)
+           for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
+                 PI != PE; ++PI) {
+              block_changed.insert(*PI);
+           }
+          else
+            block_changed.erase(BB);
+        
+          changed |= (ret == 2);
+        }
+      }
+    }
+    
+    iterations++;
+  }
+}
+
+/// insertion_pre - When a partial redundancy has been identified, eliminate it
+/// by inserting appropriate values into the predecessors and a phi node in
+/// the main block
+void GVNPRE::insertion_pre(Value* e, BasicBlock* BB,
+                           DenseMap<BasicBlock*, Value*>& avail,
+                    std::map<BasicBlock*, ValueNumberedSet>& new_sets) {
+  for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
+    Value* e2 = avail[*PI];
+    if (!availableOut[*PI].test(VN.lookup(e2))) {
+      User* U = cast<User>(e2);
+      
+      Value* s1 = 0;
+      if (isa<BinaryOperator>(U->getOperand(0)) || 
+          isa<CmpInst>(U->getOperand(0)) ||
+          isa<ShuffleVectorInst>(U->getOperand(0)) ||
+          isa<ExtractElementInst>(U->getOperand(0)) ||
+          isa<InsertElementInst>(U->getOperand(0)) ||
+          isa<SelectInst>(U->getOperand(0)) ||
+          isa<CastInst>(U->getOperand(0)) ||
+          isa<GetElementPtrInst>(U->getOperand(0)))
+        s1 = find_leader(availableOut[*PI], VN.lookup(U->getOperand(0)));
+      else
+        s1 = U->getOperand(0);
+      
+      Value* s2 = 0;
+      
+      if (isa<BinaryOperator>(U) || 
+          isa<CmpInst>(U) ||
+          isa<ShuffleVectorInst>(U) ||
+          isa<ExtractElementInst>(U) ||
+          isa<InsertElementInst>(U) ||
+          isa<SelectInst>(U)) {
+        if (isa<BinaryOperator>(U->getOperand(1)) || 
+            isa<CmpInst>(U->getOperand(1)) ||
+            isa<ShuffleVectorInst>(U->getOperand(1)) ||
+            isa<ExtractElementInst>(U->getOperand(1)) ||
+            isa<InsertElementInst>(U->getOperand(1)) ||
+            isa<SelectInst>(U->getOperand(1)) ||
+            isa<CastInst>(U->getOperand(1)) ||
+            isa<GetElementPtrInst>(U->getOperand(1))) {
+          s2 = find_leader(availableOut[*PI], VN.lookup(U->getOperand(1)));
+        } else {
+          s2 = U->getOperand(1);
+        }
+      }
+      
+      // Ternary Operators
+      Value* s3 = 0;
+      if (isa<ShuffleVectorInst>(U) ||
+          isa<InsertElementInst>(U) ||
+          isa<SelectInst>(U)) {
+        if (isa<BinaryOperator>(U->getOperand(2)) || 
+            isa<CmpInst>(U->getOperand(2)) ||
+            isa<ShuffleVectorInst>(U->getOperand(2)) ||
+            isa<ExtractElementInst>(U->getOperand(2)) ||
+            isa<InsertElementInst>(U->getOperand(2)) ||
+            isa<SelectInst>(U->getOperand(2)) ||
+            isa<CastInst>(U->getOperand(2)) ||
+            isa<GetElementPtrInst>(U->getOperand(2))) {
+          s3 = find_leader(availableOut[*PI], VN.lookup(U->getOperand(2)));
+        } else {
+          s3 = U->getOperand(2);
+        }
+      }
+      
+      // Vararg operators
+      SmallVector<Value*, 4> sVarargs;
+      if (GetElementPtrInst* G = dyn_cast<GetElementPtrInst>(U)) {
+        for (GetElementPtrInst::op_iterator OI = G->idx_begin(),
+             OE = G->idx_end(); OI != OE; ++OI) {
+          if (isa<BinaryOperator>(*OI) || 
+              isa<CmpInst>(*OI) ||
+              isa<ShuffleVectorInst>(*OI) ||
+              isa<ExtractElementInst>(*OI) ||
+              isa<InsertElementInst>(*OI) ||
+              isa<SelectInst>(*OI) ||
+              isa<CastInst>(*OI) ||
+              isa<GetElementPtrInst>(*OI)) {
+            sVarargs.push_back(find_leader(availableOut[*PI], 
+                               VN.lookup(*OI)));
+          } else {
+            sVarargs.push_back(*OI);
+          }
+        }
+      }
+      
+      Value* newVal = 0;
+      if (BinaryOperator* BO = dyn_cast<BinaryOperator>(U))
+        newVal = BinaryOperator::Create(BO->getOpcode(), s1, s2,
+                                        BO->getName()+".gvnpre",
+                                        (*PI)->getTerminator());
+      else if (CmpInst* C = dyn_cast<CmpInst>(U))
+        newVal = CmpInst::Create(C->getOpcode(), C->getPredicate(), s1, s2,
+                                 C->getName()+".gvnpre", 
+                                 (*PI)->getTerminator());
+      else if (ShuffleVectorInst* S = dyn_cast<ShuffleVectorInst>(U))
+        newVal = new ShuffleVectorInst(s1, s2, s3, S->getName()+".gvnpre",
+                                       (*PI)->getTerminator());
+      else if (InsertElementInst* S = dyn_cast<InsertElementInst>(U))
+        newVal = InsertElementInst::Create(s1, s2, s3, S->getName()+".gvnpre",
+                                           (*PI)->getTerminator());
+      else if (ExtractElementInst* S = dyn_cast<ExtractElementInst>(U))
+        newVal = new ExtractElementInst(s1, s2, S->getName()+".gvnpre",
+                                        (*PI)->getTerminator());
+      else if (SelectInst* S = dyn_cast<SelectInst>(U))
+        newVal = SelectInst::Create(s1, s2, s3, S->getName()+".gvnpre",
+                                    (*PI)->getTerminator());
+      else if (CastInst* C = dyn_cast<CastInst>(U))
+        newVal = CastInst::Create(C->getOpcode(), s1, C->getType(),
+                                  C->getName()+".gvnpre", 
+                                  (*PI)->getTerminator());
+      else if (GetElementPtrInst* G = dyn_cast<GetElementPtrInst>(U))
+        newVal = GetElementPtrInst::Create(s1, sVarargs.begin(), sVarargs.end(),
+                                           G->getName()+".gvnpre", 
+                                           (*PI)->getTerminator());
+
+      VN.add(newVal, VN.lookup(U));
+                  
+      ValueNumberedSet& predAvail = availableOut[*PI];
+      val_replace(predAvail, newVal);
+      val_replace(new_sets[*PI], newVal);
+      predAvail.set(VN.lookup(newVal));
+            
+      DenseMap<BasicBlock*, Value*>::iterator av = avail.find(*PI);
+      if (av != avail.end())
+        avail.erase(av);
+      avail.insert(std::make_pair(*PI, newVal));
+                  
+      ++NumInsertedVals;
+    }
+  }
+              
+  PHINode* p = 0;
+              
+  for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
+    if (p == 0)
+      p = PHINode::Create(avail[*PI]->getType(), "gvnpre-join", BB->begin());
+    
+    p->addIncoming(avail[*PI], *PI);
+  }
+
+  VN.add(p, VN.lookup(e));
+  val_replace(availableOut[BB], p);
+  availableOut[BB].set(VN.lookup(e));
+  generatedPhis[BB].insert(p);
+  generatedPhis[BB].set(VN.lookup(e));
+  new_sets[BB].insert(p);
+  new_sets[BB].set(VN.lookup(e));
+              
+  ++NumInsertedPhis;
+}
+
+/// insertion_mergepoint - When walking the dom tree, check at each merge
+/// block for the possibility of a partial redundancy.  If present, eliminate it
+unsigned GVNPRE::insertion_mergepoint(SmallVector<Value*, 8>& workList,
+                                      df_iterator<DomTreeNode*>& D,
+                    std::map<BasicBlock*, ValueNumberedSet >& new_sets) {
+  bool changed_function = false;
+  bool new_stuff = false;
+  
+  BasicBlock* BB = D->getBlock();
+  for (unsigned i = 0; i < workList.size(); ++i) {
+    Value* e = workList[i];
+          
+    if (isa<BinaryOperator>(e) || isa<CmpInst>(e) ||
+        isa<ExtractElementInst>(e) || isa<InsertElementInst>(e) ||
+        isa<ShuffleVectorInst>(e) || isa<SelectInst>(e) || isa<CastInst>(e) ||
+        isa<GetElementPtrInst>(e)) {
+      if (availableOut[D->getIDom()->getBlock()].test(VN.lookup(e)))
+        continue;
+            
+      DenseMap<BasicBlock*, Value*> avail;
+      bool by_some = false;
+      bool all_same = true;
+      Value * first_s = 0;
+            
+      for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;
+           ++PI) {
+        Value *e2 = phi_translate(e, *PI, BB);
+        Value *e3 = find_leader(availableOut[*PI], VN.lookup(e2));
+              
+        if (e3 == 0) {
+          DenseMap<BasicBlock*, Value*>::iterator av = avail.find(*PI);
+          if (av != avail.end())
+            avail.erase(av);
+          avail.insert(std::make_pair(*PI, e2));
+          all_same = false;
+        } else {
+          DenseMap<BasicBlock*, Value*>::iterator av = avail.find(*PI);
+          if (av != avail.end())
+            avail.erase(av);
+          avail.insert(std::make_pair(*PI, e3));
+                
+          by_some = true;
+          if (first_s == 0)
+            first_s = e3;
+          else if (first_s != e3)
+            all_same = false;
+        }
+      }
+            
+      if (by_some && !all_same &&
+          !generatedPhis[BB].test(VN.lookup(e))) {
+        insertion_pre(e, BB, avail, new_sets);
+              
+        changed_function = true;
+        new_stuff = true;
+      }
+    }
+  }
+  
+  unsigned retval = 0;
+  if (changed_function)
+    retval += 1;
+  if (new_stuff)
+    retval += 2;
+  
+  return retval;
+}
+
+/// insert - Phase 2 of the main algorithm.  Walk the dominator tree looking for
+/// merge points.  When one is found, check for a partial redundancy.  If one is
+/// present, eliminate it.  Repeat this walk until no changes are made.
+bool GVNPRE::insertion(Function& F) {
+  bool changed_function = false;
+
+  DominatorTree &DT = getAnalysis<DominatorTree>();  
+  
+  std::map<BasicBlock*, ValueNumberedSet> new_sets;
+  bool new_stuff = true;
+  while (new_stuff) {
+    new_stuff = false;
+    for (df_iterator<DomTreeNode*> DI = df_begin(DT.getRootNode()),
+         E = df_end(DT.getRootNode()); DI != E; ++DI) {
+      BasicBlock* BB = DI->getBlock();
+      
+      if (BB == 0)
+        continue;
+      
+      ValueNumberedSet& availOut = availableOut[BB];
+      ValueNumberedSet& anticIn = anticipatedIn[BB];
+      
+      // Replace leaders with leaders inherited from dominator
+      if (DI->getIDom() != 0) {
+        ValueNumberedSet& dom_set = new_sets[DI->getIDom()->getBlock()];
+        for (ValueNumberedSet::iterator I = dom_set.begin(),
+             E = dom_set.end(); I != E; ++I) {
+          val_replace(new_sets[BB], *I);
+          val_replace(availOut, *I);
+        }
+      }
+      
+      // If there is more than one predecessor...
+      if (pred_begin(BB) != pred_end(BB) && ++pred_begin(BB) != pred_end(BB)) {
+        SmallVector<Value*, 8> workList;
+        workList.reserve(anticIn.size());
+        topo_sort(anticIn, workList);
+        
+        unsigned result = insertion_mergepoint(workList, DI, new_sets);
+        if (result & 1)
+          changed_function = true;
+        if (result & 2)
+          new_stuff = true;
+      }
+    }
+  }
+  
+  return changed_function;
+}
+
+// GVNPRE::runOnFunction - This is the main transformation entry point for a
+// function.
+//
+bool GVNPRE::runOnFunction(Function &F) {
+  // Clean out global sets from any previous functions
+  VN.clear();
+  createdExpressions.clear();
+  availableOut.clear();
+  anticipatedIn.clear();
+  generatedPhis.clear();
+ 
+  bool changed_function = false;
+  
+  // Phase 1: BuildSets
+  // This phase calculates the AVAIL_OUT and ANTIC_IN sets
+  buildsets(F);
+  
+  // Phase 2: Insert
+  // This phase inserts values to make partially redundant values
+  // fully redundant
+  changed_function |= insertion(F);
+  
+  // Phase 3: Eliminate
+  // This phase performs trivial full redundancy elimination
+  changed_function |= elimination();
+  
+  // Phase 4: Cleanup
+  // This phase cleans up values that were created solely
+  // as leaders for expressions
+  cleanup();
+  
+  return changed_function;
+}
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
new file mode 100644
index 0000000..ca7aa7b
--- /dev/null
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -0,0 +1,880 @@
+//===- IndVarSimplify.cpp - Induction Variable Elimination ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation analyzes and transforms the induction variables (and
+// computations derived from them) into simpler forms suitable for subsequent
+// analysis and transformation.
+//
+// This transformation makes the following changes to each loop with an
+// identifiable induction variable:
+//   1. All loops are transformed to have a SINGLE canonical induction variable
+//      which starts at zero and steps by one.
+//   2. The canonical induction variable is guaranteed to be the first PHI node
+//      in the loop header block.
+//   3. Any pointer arithmetic recurrences are raised to use array subscripts.
+//
+// If the trip count of a loop is computable, this pass also makes the following
+// changes:
+//   1. The exit condition for the loop is canonicalized to compare the
+//      induction value against the exit value.  This turns loops like:
+//        'for (i = 7; i*i < 1000; ++i)' into 'for (i = 0; i != 25; ++i)'
+//   2. Any use outside of the loop of an expression derived from the indvar
+//      is changed to compute the derived value outside of the loop, eliminating
+//      the dependence on the exit value of the induction variable.  If the only
+//      purpose of the loop is to compute the exit value of some derived
+//      expression, this transformation will make the loop dead.
+//
+// This transformation should be followed by strength reduction after all of the
+// desired loop transformations have been performed.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "indvars"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/Type.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+using namespace llvm;
+
+STATISTIC(NumRemoved , "Number of aux indvars removed");
+STATISTIC(NumInserted, "Number of canonical indvars added");
+STATISTIC(NumReplaced, "Number of exit values replaced");
+STATISTIC(NumLFTR    , "Number of loop exit tests replaced");
+
+namespace {
+  class VISIBILITY_HIDDEN IndVarSimplify : public LoopPass {
+    IVUsers         *IU;
+    LoopInfo        *LI;
+    ScalarEvolution *SE;
+    bool Changed;
+  public:
+
+   static char ID; // Pass identification, replacement for typeid
+   IndVarSimplify() : LoopPass(&ID) {}
+
+   virtual bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+   virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+     AU.addRequired<DominatorTree>();
+     AU.addRequired<ScalarEvolution>();
+     AU.addRequiredID(LCSSAID);
+     AU.addRequiredID(LoopSimplifyID);
+     AU.addRequired<LoopInfo>();
+     AU.addRequired<IVUsers>();
+     AU.addPreserved<ScalarEvolution>();
+     AU.addPreservedID(LoopSimplifyID);
+     AU.addPreserved<IVUsers>();
+     AU.addPreservedID(LCSSAID);
+     AU.setPreservesCFG();
+   }
+
+  private:
+
+    void RewriteNonIntegerIVs(Loop *L);
+
+    ICmpInst *LinearFunctionTestReplace(Loop *L, SCEVHandle BackedgeTakenCount,
+                                   Value *IndVar,
+                                   BasicBlock *ExitingBlock,
+                                   BranchInst *BI,
+                                   SCEVExpander &Rewriter);
+    void RewriteLoopExitValues(Loop *L, const SCEV *BackedgeTakenCount);
+
+    void RewriteIVExpressions(Loop *L, const Type *LargestType,
+                              SCEVExpander &Rewriter);
+
+    void SinkUnusedInvariants(Loop *L, SCEVExpander &Rewriter);
+
+    void FixUsesBeforeDefs(Loop *L, SCEVExpander &Rewriter);
+
+    void HandleFloatingPointIV(Loop *L, PHINode *PH);
+  };
+}
+
+char IndVarSimplify::ID = 0;
+static RegisterPass<IndVarSimplify>
+X("indvars", "Canonicalize Induction Variables");
+
+Pass *llvm::createIndVarSimplifyPass() {
+  return new IndVarSimplify();
+}
+
+/// LinearFunctionTestReplace - This method rewrites the exit condition of the
+/// loop to be a canonical != comparison against the incremented loop induction
+/// variable.  This pass is able to rewrite the exit tests of any loop where the
+/// SCEV analysis can determine a loop-invariant trip count of the loop, which
+/// is actually a much broader range than just linear tests.
+ICmpInst *IndVarSimplify::LinearFunctionTestReplace(Loop *L,
+                                   SCEVHandle BackedgeTakenCount,
+                                   Value *IndVar,
+                                   BasicBlock *ExitingBlock,
+                                   BranchInst *BI,
+                                   SCEVExpander &Rewriter) {
+  // If the exiting block is not the same as the backedge block, we must compare
+  // against the preincremented value, otherwise we prefer to compare against
+  // the post-incremented value.
+  Value *CmpIndVar;
+  SCEVHandle RHS = BackedgeTakenCount;
+  if (ExitingBlock == L->getLoopLatch()) {
+    // Add one to the "backedge-taken" count to get the trip count.
+    // If this addition may overflow, we have to be more pessimistic and
+    // cast the induction variable before doing the add.
+    SCEVHandle Zero = SE->getIntegerSCEV(0, BackedgeTakenCount->getType());
+    SCEVHandle N =
+      SE->getAddExpr(BackedgeTakenCount,
+                     SE->getIntegerSCEV(1, BackedgeTakenCount->getType()));
+    if ((isa<SCEVConstant>(N) && !N->isZero()) ||
+        SE->isLoopGuardedByCond(L, ICmpInst::ICMP_NE, N, Zero)) {
+      // No overflow. Cast the sum.
+      RHS = SE->getTruncateOrZeroExtend(N, IndVar->getType());
+    } else {
+      // Potential overflow. Cast before doing the add.
+      RHS = SE->getTruncateOrZeroExtend(BackedgeTakenCount,
+                                        IndVar->getType());
+      RHS = SE->getAddExpr(RHS,
+                           SE->getIntegerSCEV(1, IndVar->getType()));
+    }
+
+    // The BackedgeTaken expression contains the number of times that the
+    // backedge branches to the loop header.  This is one less than the
+    // number of times the loop executes, so use the incremented indvar.
+    CmpIndVar = L->getCanonicalInductionVariableIncrement();
+  } else {
+    // We have to use the preincremented value...
+    RHS = SE->getTruncateOrZeroExtend(BackedgeTakenCount,
+                                      IndVar->getType());
+    CmpIndVar = IndVar;
+  }
+
+  // Expand the code for the iteration count into the preheader of the loop.
+  BasicBlock *Preheader = L->getLoopPreheader();
+  Value *ExitCnt = Rewriter.expandCodeFor(RHS, CmpIndVar->getType(),
+                                          Preheader->getTerminator());
+
+  // Insert a new icmp_ne or icmp_eq instruction before the branch.
+  ICmpInst::Predicate Opcode;
+  if (L->contains(BI->getSuccessor(0)))
+    Opcode = ICmpInst::ICMP_NE;
+  else
+    Opcode = ICmpInst::ICMP_EQ;
+
+  DOUT << "INDVARS: Rewriting loop exit condition to:\n"
+       << "      LHS:" << *CmpIndVar // includes a newline
+       << "       op:\t"
+       << (Opcode == ICmpInst::ICMP_NE ? "!=" : "==") << "\n"
+       << "      RHS:\t" << *RHS << "\n";
+
+  ICmpInst *Cond = new ICmpInst(Opcode, CmpIndVar, ExitCnt, "exitcond", BI);
+
+  Instruction *OrigCond = cast<Instruction>(BI->getCondition());
+  // It's tempting to use replaceAllUsesWith here to fully replace the old
+  // comparison, but that's not immediately safe, since users of the old
+  // comparison may not be dominated by the new comparison. Instead, just
+  // update the branch to use the new comparison; in the common case this
+  // will make old comparison dead.
+  BI->setCondition(Cond);
+  RecursivelyDeleteTriviallyDeadInstructions(OrigCond);
+
+  ++NumLFTR;
+  Changed = true;
+  return Cond;
+}
+
+/// RewriteLoopExitValues - Check to see if this loop has a computable
+/// loop-invariant execution count.  If so, this means that we can compute the
+/// final value of any expressions that are recurrent in the loop, and
+/// substitute the exit values from the loop into any instructions outside of
+/// the loop that use the final values of the current expressions.
+///
+/// This is mostly redundant with the regular IndVarSimplify activities that
+/// happen later, except that it's more powerful in some cases, because it's
+/// able to brute-force evaluate arbitrary instructions as long as they have
+/// constant operands at the beginning of the loop.
+void IndVarSimplify::RewriteLoopExitValues(Loop *L,
+                                           const SCEV *BackedgeTakenCount) {
+  // Verify the input to the pass in already in LCSSA form.
+  assert(L->isLCSSAForm());
+
+  BasicBlock *Preheader = L->getLoopPreheader();
+
+  // Scan all of the instructions in the loop, looking at those that have
+  // extra-loop users and which are recurrences.
+  SCEVExpander Rewriter(*SE);
+
+  // We insert the code into the preheader of the loop if the loop contains
+  // multiple exit blocks, or in the exit block if there is exactly one.
+  BasicBlock *BlockToInsertInto;
+  SmallVector<BasicBlock*, 8> ExitBlocks;
+  L->getUniqueExitBlocks(ExitBlocks);
+  if (ExitBlocks.size() == 1)
+    BlockToInsertInto = ExitBlocks[0];
+  else
+    BlockToInsertInto = Preheader;
+  BasicBlock::iterator InsertPt = BlockToInsertInto->getFirstNonPHI();
+
+  std::map<Instruction*, Value*> ExitValues;
+
+  // Find all values that are computed inside the loop, but used outside of it.
+  // Because of LCSSA, these values will only occur in LCSSA PHI Nodes.  Scan
+  // the exit blocks of the loop to find them.
+  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+    BasicBlock *ExitBB = ExitBlocks[i];
+
+    // If there are no PHI nodes in this exit block, then no values defined
+    // inside the loop are used on this path, skip it.
+    PHINode *PN = dyn_cast<PHINode>(ExitBB->begin());
+    if (!PN) continue;
+
+    unsigned NumPreds = PN->getNumIncomingValues();
+
+    // Iterate over all of the PHI nodes.
+    BasicBlock::iterator BBI = ExitBB->begin();
+    while ((PN = dyn_cast<PHINode>(BBI++))) {
+      if (PN->use_empty())
+        continue; // dead use, don't replace it
+      // Iterate over all of the values in all the PHI nodes.
+      for (unsigned i = 0; i != NumPreds; ++i) {
+        // If the value being merged in is not integer or is not defined
+        // in the loop, skip it.
+        Value *InVal = PN->getIncomingValue(i);
+        if (!isa<Instruction>(InVal) ||
+            // SCEV only supports integer expressions for now.
+            (!isa<IntegerType>(InVal->getType()) &&
+             !isa<PointerType>(InVal->getType())))
+          continue;
+
+        // If this pred is for a subloop, not L itself, skip it.
+        if (LI->getLoopFor(PN->getIncomingBlock(i)) != L)
+          continue; // The Block is in a subloop, skip it.
+
+        // Check that InVal is defined in the loop.
+        Instruction *Inst = cast<Instruction>(InVal);
+        if (!L->contains(Inst->getParent()))
+          continue;
+
+        // Okay, this instruction has a user outside of the current loop
+        // and varies predictably *inside* the loop.  Evaluate the value it
+        // contains when the loop exits, if possible.
+        SCEVHandle ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop());
+        if (!ExitValue->isLoopInvariant(L))
+          continue;
+
+        Changed = true;
+        ++NumReplaced;
+
+        // See if we already computed the exit value for the instruction, if so,
+        // just reuse it.
+        Value *&ExitVal = ExitValues[Inst];
+        if (!ExitVal)
+          ExitVal = Rewriter.expandCodeFor(ExitValue, PN->getType(), InsertPt);
+
+        DOUT << "INDVARS: RLEV: AfterLoopVal = " << *ExitVal
+             << "  LoopVal = " << *Inst << "\n";
+
+        PN->setIncomingValue(i, ExitVal);
+
+        // If this instruction is dead now, delete it.
+        RecursivelyDeleteTriviallyDeadInstructions(Inst);
+
+        // See if this is a single-entry LCSSA PHI node.  If so, we can (and
+        // have to) remove
+        // the PHI entirely.  This is safe, because the NewVal won't be variant
+        // in the loop, so we don't need an LCSSA phi node anymore.
+        if (NumPreds == 1) {
+          PN->replaceAllUsesWith(ExitVal);
+          RecursivelyDeleteTriviallyDeadInstructions(PN);
+          break;
+        }
+      }
+    }
+  }
+}
+
+void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) {
+  // First step.  Check to see if there are any floating-point recurrences.
+  // If there are, change them into integer recurrences, permitting analysis by
+  // the SCEV routines.
+  //
+  BasicBlock *Header    = L->getHeader();
+
+  SmallVector<WeakVH, 8> PHIs;
+  for (BasicBlock::iterator I = Header->begin();
+       PHINode *PN = dyn_cast<PHINode>(I); ++I)
+    PHIs.push_back(PN);
+
+  for (unsigned i = 0, e = PHIs.size(); i != e; ++i)
+    if (PHINode *PN = dyn_cast_or_null<PHINode>(PHIs[i]))
+      HandleFloatingPointIV(L, PN);
+
+  // If the loop previously had floating-point IV, ScalarEvolution
+  // may not have been able to compute a trip count. Now that we've done some
+  // re-writing, the trip count may be computable.
+  if (Changed)
+    SE->forgetLoopBackedgeTakenCount(L);
+}
+
+bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
+  IU = &getAnalysis<IVUsers>();
+  LI = &getAnalysis<LoopInfo>();
+  SE = &getAnalysis<ScalarEvolution>();
+  Changed = false;
+
+  // If there are any floating-point recurrences, attempt to
+  // transform them to use integer recurrences.
+  RewriteNonIntegerIVs(L);
+
+  BasicBlock *Header       = L->getHeader();
+  BasicBlock *ExitingBlock = L->getExitingBlock(); // may be null
+  SCEVHandle BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+
+  // Check to see if this loop has a computable loop-invariant execution count.
+  // If so, this means that we can compute the final value of any expressions
+  // that are recurrent in the loop, and substitute the exit values from the
+  // loop into any instructions outside of the loop that use the final values of
+  // the current expressions.
+  //
+  if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+    RewriteLoopExitValues(L, BackedgeTakenCount);
+
+  // Compute the type of the largest recurrence expression, and decide whether
+  // a canonical induction variable should be inserted.
+  const Type *LargestType = 0;
+  bool NeedCannIV = false;
+  if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
+    LargestType = BackedgeTakenCount->getType();
+    LargestType = SE->getEffectiveSCEVType(LargestType);
+    // If we have a known trip count and a single exit block, we'll be
+    // rewriting the loop exit test condition below, which requires a
+    // canonical induction variable.
+    if (ExitingBlock)
+      NeedCannIV = true;
+  }
+  for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) {
+    SCEVHandle Stride = IU->StrideOrder[i];
+    const Type *Ty = SE->getEffectiveSCEVType(Stride->getType());
+    if (!LargestType ||
+        SE->getTypeSizeInBits(Ty) >
+          SE->getTypeSizeInBits(LargestType))
+      LargestType = Ty;
+
+    std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
+      IU->IVUsesByStride.find(IU->StrideOrder[i]);
+    assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
+
+    if (!SI->second->Users.empty())
+      NeedCannIV = true;
+  }
+
+  // Create a rewriter object which we'll use to transform the code with.
+  SCEVExpander Rewriter(*SE);
+
+  // Now that we know the largest of of the induction variable expressions
+  // in this loop, insert a canonical induction variable of the largest size.
+  Value *IndVar = 0;
+  if (NeedCannIV) {
+    IndVar = Rewriter.getOrInsertCanonicalInductionVariable(L,LargestType);
+    ++NumInserted;
+    Changed = true;
+    DOUT << "INDVARS: New CanIV: " << *IndVar;
+  }
+
+  // If we have a trip count expression, rewrite the loop's exit condition
+  // using it.  We can currently only handle loops with a single exit.
+  ICmpInst *NewICmp = 0;
+  if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && ExitingBlock) {
+    assert(NeedCannIV &&
+           "LinearFunctionTestReplace requires a canonical induction variable");
+    // Can't rewrite non-branch yet.
+    if (BranchInst *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator()))
+      NewICmp = LinearFunctionTestReplace(L, BackedgeTakenCount, IndVar,
+                                          ExitingBlock, BI, Rewriter);
+  }
+
+  Rewriter.setInsertionPoint(Header->getFirstNonPHI());
+
+  // Rewrite IV-derived expressions. Clears the rewriter cache.
+  RewriteIVExpressions(L, LargestType, Rewriter);
+
+  // The Rewriter may only be used for isInsertedInstruction queries from this
+  // point on.
+
+  // Loop-invariant instructions in the preheader that aren't used in the
+  // loop may be sunk below the loop to reduce register pressure.
+  SinkUnusedInvariants(L, Rewriter);
+
+  // Reorder instructions to avoid use-before-def conditions.
+  FixUsesBeforeDefs(L, Rewriter);
+
+  // For completeness, inform IVUsers of the IV use in the newly-created
+  // loop exit test instruction.
+  if (NewICmp)
+    IU->AddUsersIfInteresting(cast<Instruction>(NewICmp->getOperand(0)));
+
+  // Clean up dead instructions.
+  DeleteDeadPHIs(L->getHeader());
+  // Check a post-condition.
+  assert(L->isLCSSAForm() && "Indvars did not leave the loop in lcssa form!");
+  return Changed;
+}
+
+void IndVarSimplify::RewriteIVExpressions(Loop *L, const Type *LargestType,
+                                          SCEVExpander &Rewriter) {
+  SmallVector<WeakVH, 16> DeadInsts;
+
+  // Rewrite all induction variable expressions in terms of the canonical
+  // induction variable.
+  //
+  // If there were induction variables of other sizes or offsets, manually
+  // add the offsets to the primary induction variable and cast, avoiding
+  // the need for the code evaluation methods to insert induction variables
+  // of different sizes.
+  for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) {
+    SCEVHandle Stride = IU->StrideOrder[i];
+
+    std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
+      IU->IVUsesByStride.find(IU->StrideOrder[i]);
+    assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
+    ilist<IVStrideUse> &List = SI->second->Users;
+    for (ilist<IVStrideUse>::iterator UI = List.begin(),
+         E = List.end(); UI != E; ++UI) {
+      SCEVHandle Offset = UI->getOffset();
+      Value *Op = UI->getOperandValToReplace();
+      Instruction *User = UI->getUser();
+      bool isSigned = UI->isSigned();
+
+      // Compute the final addrec to expand into code.
+      SCEVHandle AR = IU->getReplacementExpr(*UI);
+
+      // FIXME: It is an extremely bad idea to indvar substitute anything more
+      // complex than affine induction variables.  Doing so will put expensive
+      // polynomial evaluations inside of the loop, and the str reduction pass
+      // currently can only reduce affine polynomials.  For now just disable
+      // indvar subst on anything more complex than an affine addrec, unless
+      // it can be expanded to a trivial value.
+      if (!Stride->isLoopInvariant(L) &&
+          !isa<SCEVConstant>(AR) &&
+          L->contains(User->getParent()))
+        continue;
+
+      Value *NewVal = 0;
+      if (AR->isLoopInvariant(L)) {
+        BasicBlock::iterator I = Rewriter.getInsertionPoint();
+        // Expand loop-invariant values in the loop preheader. They will
+        // be sunk to the exit block later, if possible.
+        NewVal =
+          Rewriter.expandCodeFor(AR, LargestType,
+                                 L->getLoopPreheader()->getTerminator());
+        Rewriter.setInsertionPoint(I);
+        ++NumReplaced;
+      } else {
+        const Type *IVTy = Offset->getType();
+        const Type *UseTy = Op->getType();
+
+        // Promote the Offset and Stride up to the canonical induction
+        // variable's bit width.
+        SCEVHandle PromotedOffset = Offset;
+        SCEVHandle PromotedStride = Stride;
+        if (SE->getTypeSizeInBits(IVTy) != SE->getTypeSizeInBits(LargestType)) {
+          // It doesn't matter for correctness whether zero or sign extension
+          // is used here, since the value is truncated away below, but if the
+          // value is signed, sign extension is more likely to be folded.
+          if (isSigned) {
+            PromotedOffset = SE->getSignExtendExpr(PromotedOffset, LargestType);
+            PromotedStride = SE->getSignExtendExpr(PromotedStride, LargestType);
+          } else {
+            PromotedOffset = SE->getZeroExtendExpr(PromotedOffset, LargestType);
+            // If the stride is obviously negative, use sign extension to
+            // produce things like x-1 instead of x+255.
+            if (isa<SCEVConstant>(PromotedStride) &&
+                cast<SCEVConstant>(PromotedStride)
+                  ->getValue()->getValue().isNegative())
+              PromotedStride = SE->getSignExtendExpr(PromotedStride,
+                                                     LargestType);
+            else
+              PromotedStride = SE->getZeroExtendExpr(PromotedStride,
+                                                     LargestType);
+          }
+        }
+
+        // Create the SCEV representing the offset from the canonical
+        // induction variable, still in the canonical induction variable's
+        // type, so that all expanded arithmetic is done in the same type.
+        SCEVHandle NewAR = SE->getAddRecExpr(SE->getIntegerSCEV(0, LargestType),
+                                           PromotedStride, L);
+        // Add the PromotedOffset as a separate step, because it may not be
+        // loop-invariant.
+        NewAR = SE->getAddExpr(NewAR, PromotedOffset);
+
+        // Expand the addrec into instructions.
+        Value *V = Rewriter.expandCodeFor(NewAR);
+
+        // Insert an explicit cast if necessary to truncate the value
+        // down to the original stride type. This is done outside of
+        // SCEVExpander because in SCEV expressions, a truncate of an
+        // addrec is always folded.
+        if (LargestType != IVTy) {
+          if (SE->getTypeSizeInBits(IVTy) != SE->getTypeSizeInBits(LargestType))
+            NewAR = SE->getTruncateExpr(NewAR, IVTy);
+          if (Rewriter.isInsertedExpression(NewAR))
+            V = Rewriter.expandCodeFor(NewAR);
+          else {
+            V = Rewriter.InsertCastOfTo(CastInst::getCastOpcode(V, false,
+                                                                IVTy, false),
+                                        V, IVTy);
+            assert(!isa<SExtInst>(V) && !isa<ZExtInst>(V) &&
+                   "LargestType wasn't actually the largest type!");
+            // Force the rewriter to use this trunc whenever this addrec
+            // appears so that it doesn't insert new phi nodes or
+            // arithmetic in a different type.
+            Rewriter.addInsertedValue(V, NewAR);
+          }
+        }
+
+        DOUT << "INDVARS: Made offset-and-trunc IV for offset "
+             << *IVTy << " " << *Offset << ": ";
+        DEBUG(WriteAsOperand(*DOUT, V, false));
+        DOUT << "\n";
+
+        // Now expand it into actual Instructions and patch it into place.
+        NewVal = Rewriter.expandCodeFor(AR, UseTy);
+      }
+
+      // Patch the new value into place.
+      if (Op->hasName())
+        NewVal->takeName(Op);
+      User->replaceUsesOfWith(Op, NewVal);
+      UI->setOperandValToReplace(NewVal);
+      DOUT << "INDVARS: Rewrote IV '" << *AR << "' " << *Op
+           << "   into = " << *NewVal << "\n";
+      ++NumRemoved;
+      Changed = true;
+
+      // The old value may be dead now.
+      DeadInsts.push_back(Op);
+    }
+  }
+
+  // Clear the rewriter cache, because values that are in the rewriter's cache
+  // can be deleted in the loop below, causing the AssertingVH in the cache to
+  // trigger.
+  Rewriter.clear();
+  // Now that we're done iterating through lists, clean up any instructions
+  // which are now dead.
+  while (!DeadInsts.empty()) {
+    Instruction *Inst = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
+    if (Inst)
+      RecursivelyDeleteTriviallyDeadInstructions(Inst);
+  }
+}
+
+/// If there's a single exit block, sink any loop-invariant values that
+/// were defined in the preheader but not used inside the loop into the
+/// exit block to reduce register pressure in the loop.
+void IndVarSimplify::SinkUnusedInvariants(Loop *L, SCEVExpander &Rewriter) {
+  BasicBlock *ExitBlock = L->getExitBlock();
+  if (!ExitBlock) return;
+
+  Instruction *NonPHI = ExitBlock->getFirstNonPHI();
+  BasicBlock *Preheader = L->getLoopPreheader();
+  BasicBlock::iterator I = Preheader->getTerminator();
+  while (I != Preheader->begin()) {
+    --I;
+    // New instructions were inserted at the end of the preheader. Only
+    // consider those new instructions.
+    if (!Rewriter.isInsertedInstruction(I))
+      break;
+    // Determine if there is a use in or before the loop (direct or
+    // otherwise).
+    bool UsedInLoop = false;
+    for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
+         UI != UE; ++UI) {
+      BasicBlock *UseBB = cast<Instruction>(UI)->getParent();
+      if (PHINode *P = dyn_cast<PHINode>(UI)) {
+        unsigned i =
+          PHINode::getIncomingValueNumForOperand(UI.getOperandNo());
+        UseBB = P->getIncomingBlock(i);
+      }
+      if (UseBB == Preheader || L->contains(UseBB)) {
+        UsedInLoop = true;
+        break;
+      }
+    }
+    // If there is, the def must remain in the preheader.
+    if (UsedInLoop)
+      continue;
+    // Otherwise, sink it to the exit block.
+    Instruction *ToMove = I;
+    bool Done = false;
+    if (I != Preheader->begin())
+      --I;
+    else
+      Done = true;
+    ToMove->moveBefore(NonPHI);
+    if (Done)
+      break;
+  }
+}
+
+/// Re-schedule the inserted instructions to put defs before uses. This
+/// fixes problems that arrise when SCEV expressions contain loop-variant
+/// values unrelated to the induction variable which are defined inside the
+/// loop. FIXME: It would be better to insert instructions in the right
+/// place so that this step isn't needed.
+void IndVarSimplify::FixUsesBeforeDefs(Loop *L, SCEVExpander &Rewriter) {
+  // Visit all the blocks in the loop in pre-order dom-tree dfs order.
+  DominatorTree *DT = &getAnalysis<DominatorTree>();
+  std::map<Instruction *, unsigned> NumPredsLeft;
+  SmallVector<DomTreeNode *, 16> Worklist;
+  Worklist.push_back(DT->getNode(L->getHeader()));
+  do {
+    DomTreeNode *Node = Worklist.pop_back_val();
+    for (DomTreeNode::iterator I = Node->begin(), E = Node->end(); I != E; ++I)
+      if (L->contains((*I)->getBlock()))
+        Worklist.push_back(*I);
+    BasicBlock *BB = Node->getBlock();
+    // Visit all the instructions in the block top down.
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+      // Count the number of operands that aren't properly dominating.
+      unsigned NumPreds = 0;
+      if (Rewriter.isInsertedInstruction(I) && !isa<PHINode>(I))
+        for (User::op_iterator OI = I->op_begin(), OE = I->op_end();
+             OI != OE; ++OI)
+          if (Instruction *Inst = dyn_cast<Instruction>(OI))
+            if (L->contains(Inst->getParent()) && !NumPredsLeft.count(Inst))
+              ++NumPreds;
+      NumPredsLeft[I] = NumPreds;
+      // Notify uses of the position of this instruction, and move the
+      // users (and their dependents, recursively) into place after this
+      // instruction if it is their last outstanding operand.
+      for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
+           UI != UE; ++UI) {
+        Instruction *Inst = cast<Instruction>(UI);
+        std::map<Instruction *, unsigned>::iterator Z = NumPredsLeft.find(Inst);
+        if (Z != NumPredsLeft.end() && Z->second != 0 && --Z->second == 0) {
+          SmallVector<Instruction *, 4> UseWorkList;
+          UseWorkList.push_back(Inst);
+          BasicBlock::iterator InsertPt = I;
+          if (InvokeInst *II = dyn_cast<InvokeInst>(InsertPt))
+            InsertPt = II->getNormalDest()->begin();
+          else
+            ++InsertPt;
+          while (isa<PHINode>(InsertPt)) ++InsertPt;
+          do {
+            Instruction *Use = UseWorkList.pop_back_val();
+            Use->moveBefore(InsertPt);
+            NumPredsLeft.erase(Use);
+            for (Value::use_iterator IUI = Use->use_begin(),
+                 IUE = Use->use_end(); IUI != IUE; ++IUI) {
+              Instruction *IUIInst = cast<Instruction>(IUI);
+              if (L->contains(IUIInst->getParent()) &&
+                  Rewriter.isInsertedInstruction(IUIInst) &&
+                  !isa<PHINode>(IUIInst))
+                UseWorkList.push_back(IUIInst);
+            }
+          } while (!UseWorkList.empty());
+        }
+      }
+    }
+  } while (!Worklist.empty());
+}
+
+/// Return true if it is OK to use SIToFPInst for an inducation variable
+/// with given inital and exit values.
+static bool useSIToFPInst(ConstantFP &InitV, ConstantFP &ExitV,
+                          uint64_t intIV, uint64_t intEV) {
+
+  if (InitV.getValueAPF().isNegative() || ExitV.getValueAPF().isNegative())
+    return true;
+
+  // If the iteration range can be handled by SIToFPInst then use it.
+  APInt Max = APInt::getSignedMaxValue(32);
+  if (Max.getZExtValue() > static_cast<uint64_t>(abs64(intEV - intIV)))
+    return true;
+
+  return false;
+}
+
+/// convertToInt - Convert APF to an integer, if possible.
+static bool convertToInt(const APFloat &APF, uint64_t *intVal) {
+
+  bool isExact = false;
+  if (&APF.getSemantics() == &APFloat::PPCDoubleDouble)
+    return false;
+  if (APF.convertToInteger(intVal, 32, APF.isNegative(),
+                           APFloat::rmTowardZero, &isExact)
+      != APFloat::opOK)
+    return false;
+  if (!isExact)
+    return false;
+  return true;
+
+}
+
+/// HandleFloatingPointIV - If the loop has floating induction variable
+/// then insert corresponding integer induction variable if possible.
+/// For example,
+/// for(double i = 0; i < 10000; ++i)
+///   bar(i)
+/// is converted into
+/// for(int i = 0; i < 10000; ++i)
+///   bar((double)i);
+///
+void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PH) {
+
+  unsigned IncomingEdge = L->contains(PH->getIncomingBlock(0));
+  unsigned BackEdge     = IncomingEdge^1;
+
+  // Check incoming value.
+  ConstantFP *InitValue = dyn_cast<ConstantFP>(PH->getIncomingValue(IncomingEdge));
+  if (!InitValue) return;
+  uint64_t newInitValue = Type::Int32Ty->getPrimitiveSizeInBits();
+  if (!convertToInt(InitValue->getValueAPF(), &newInitValue))
+    return;
+
+  // Check IV increment. Reject this PH if increement operation is not
+  // an add or increment value can not be represented by an integer.
+  BinaryOperator *Incr =
+    dyn_cast<BinaryOperator>(PH->getIncomingValue(BackEdge));
+  if (!Incr) return;
+  if (Incr->getOpcode() != Instruction::Add) return;
+  ConstantFP *IncrValue = NULL;
+  unsigned IncrVIndex = 1;
+  if (Incr->getOperand(1) == PH)
+    IncrVIndex = 0;
+  IncrValue = dyn_cast<ConstantFP>(Incr->getOperand(IncrVIndex));
+  if (!IncrValue) return;
+  uint64_t newIncrValue = Type::Int32Ty->getPrimitiveSizeInBits();
+  if (!convertToInt(IncrValue->getValueAPF(), &newIncrValue))
+    return;
+
+  // Check Incr uses. One user is PH and the other users is exit condition used
+  // by the conditional terminator.
+  Value::use_iterator IncrUse = Incr->use_begin();
+  Instruction *U1 = cast<Instruction>(IncrUse++);
+  if (IncrUse == Incr->use_end()) return;
+  Instruction *U2 = cast<Instruction>(IncrUse++);
+  if (IncrUse != Incr->use_end()) return;
+
+  // Find exit condition.
+  FCmpInst *EC = dyn_cast<FCmpInst>(U1);
+  if (!EC)
+    EC = dyn_cast<FCmpInst>(U2);
+  if (!EC) return;
+
+  if (BranchInst *BI = dyn_cast<BranchInst>(EC->getParent()->getTerminator())) {
+    if (!BI->isConditional()) return;
+    if (BI->getCondition() != EC) return;
+  }
+
+  // Find exit value. If exit value can not be represented as an interger then
+  // do not handle this floating point PH.
+  ConstantFP *EV = NULL;
+  unsigned EVIndex = 1;
+  if (EC->getOperand(1) == Incr)
+    EVIndex = 0;
+  EV = dyn_cast<ConstantFP>(EC->getOperand(EVIndex));
+  if (!EV) return;
+  uint64_t intEV = Type::Int32Ty->getPrimitiveSizeInBits();
+  if (!convertToInt(EV->getValueAPF(), &intEV))
+    return;
+
+  // Find new predicate for integer comparison.
+  CmpInst::Predicate NewPred = CmpInst::BAD_ICMP_PREDICATE;
+  switch (EC->getPredicate()) {
+  case CmpInst::FCMP_OEQ:
+  case CmpInst::FCMP_UEQ:
+    NewPred = CmpInst::ICMP_EQ;
+    break;
+  case CmpInst::FCMP_OGT:
+  case CmpInst::FCMP_UGT:
+    NewPred = CmpInst::ICMP_UGT;
+    break;
+  case CmpInst::FCMP_OGE:
+  case CmpInst::FCMP_UGE:
+    NewPred = CmpInst::ICMP_UGE;
+    break;
+  case CmpInst::FCMP_OLT:
+  case CmpInst::FCMP_ULT:
+    NewPred = CmpInst::ICMP_ULT;
+    break;
+  case CmpInst::FCMP_OLE:
+  case CmpInst::FCMP_ULE:
+    NewPred = CmpInst::ICMP_ULE;
+    break;
+  default:
+    break;
+  }
+  if (NewPred == CmpInst::BAD_ICMP_PREDICATE) return;
+
+  // Insert new integer induction variable.
+  PHINode *NewPHI = PHINode::Create(Type::Int32Ty,
+                                    PH->getName()+".int", PH);
+  NewPHI->addIncoming(ConstantInt::get(Type::Int32Ty, newInitValue),
+                      PH->getIncomingBlock(IncomingEdge));
+
+  Value *NewAdd = BinaryOperator::CreateAdd(NewPHI,
+                                            ConstantInt::get(Type::Int32Ty,
+                                                             newIncrValue),
+                                            Incr->getName()+".int", Incr);
+  NewPHI->addIncoming(NewAdd, PH->getIncomingBlock(BackEdge));
+
+  // The back edge is edge 1 of newPHI, whatever it may have been in the
+  // original PHI.
+  ConstantInt *NewEV = ConstantInt::get(Type::Int32Ty, intEV);
+  Value *LHS = (EVIndex == 1 ? NewPHI->getIncomingValue(1) : NewEV);
+  Value *RHS = (EVIndex == 1 ? NewEV : NewPHI->getIncomingValue(1));
+  ICmpInst *NewEC = new ICmpInst(NewPred, LHS, RHS, EC->getNameStart(),
+                                 EC->getParent()->getTerminator());
+
+  // In the following deltions, PH may become dead and may be deleted.
+  // Use a WeakVH to observe whether this happens.
+  WeakVH WeakPH = PH;
+
+  // Delete old, floating point, exit comparision instruction.
+  NewEC->takeName(EC);
+  EC->replaceAllUsesWith(NewEC);
+  RecursivelyDeleteTriviallyDeadInstructions(EC);
+
+  // Delete old, floating point, increment instruction.
+  Incr->replaceAllUsesWith(UndefValue::get(Incr->getType()));
+  RecursivelyDeleteTriviallyDeadInstructions(Incr);
+
+  // Replace floating induction variable, if it isn't already deleted.
+  // Give SIToFPInst preference over UIToFPInst because it is faster on
+  // platforms that are widely used.
+  if (WeakPH && !PH->use_empty()) {
+    if (useSIToFPInst(*InitValue, *EV, newInitValue, intEV)) {
+      SIToFPInst *Conv = new SIToFPInst(NewPHI, PH->getType(), "indvar.conv",
+                                        PH->getParent()->getFirstNonPHI());
+      PH->replaceAllUsesWith(Conv);
+    } else {
+      UIToFPInst *Conv = new UIToFPInst(NewPHI, PH->getType(), "indvar.conv",
+                                        PH->getParent()->getFirstNonPHI());
+      PH->replaceAllUsesWith(Conv);
+    }
+    RecursivelyDeleteTriviallyDeadInstructions(PH);
+  }
+
+  // Add a new IVUsers entry for the newly-created integer PHI.
+  IU->AddUsersIfInteresting(NewPHI);
+}
diff --git a/lib/Transforms/Scalar/InstructionCombining.cpp b/lib/Transforms/Scalar/InstructionCombining.cpp
new file mode 100644
index 0000000..e6f854f
--- /dev/null
+++ b/lib/Transforms/Scalar/InstructionCombining.cpp
@@ -0,0 +1,12919 @@
+//===- InstructionCombining.cpp - Combine multiple instructions -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// InstructionCombining - Combine instructions to form fewer, simple
+// instructions.  This pass does not modify the CFG.  This pass is where
+// algebraic simplification happens.
+//
+// This pass combines things like:
+//    %Y = add i32 %X, 1
+//    %Z = add i32 %Y, 1
+// into:
+//    %Z = add i32 %X, 2
+//
+// This is a simple worklist driven algorithm.
+//
+// This pass guarantees that the following canonicalizations are performed on
+// the program:
+//    1. If a binary operator has a constant operand, it is moved to the RHS
+//    2. Bitwise operators with constant operands are always grouped so that
+//       shifts are performed first, then or's, then and's, then xor's.
+//    3. Compare instructions are converted from <,>,<=,>= to ==,!= if possible
+//    4. All cmp instructions on boolean values are replaced with logical ops
+//    5. add X, X is represented as (X*2) => (X << 1)
+//    6. Multiplies with a power-of-two constant argument are transformed into
+//       shifts.
+//   ... etc.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "instcombine"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/ConstantRange.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/InstVisitor.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/PatternMatch.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include <algorithm>
+#include <climits>
+#include <sstream>
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+STATISTIC(NumCombined , "Number of insts combined");
+STATISTIC(NumConstProp, "Number of constant folds");
+STATISTIC(NumDeadInst , "Number of dead inst eliminated");
+STATISTIC(NumDeadStore, "Number of dead stores eliminated");
+STATISTIC(NumSunkInst , "Number of instructions sunk");
+
+namespace {
+  class VISIBILITY_HIDDEN InstCombiner
+    : public FunctionPass,
+      public InstVisitor<InstCombiner, Instruction*> {
+    // Worklist of all of the instructions that need to be simplified.
+    SmallVector<Instruction*, 256> Worklist;
+    DenseMap<Instruction*, unsigned> WorklistMap;
+    TargetData *TD;
+    bool MustPreserveLCSSA;
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    InstCombiner() : FunctionPass(&ID) {}
+
+    /// AddToWorkList - Add the specified instruction to the worklist if it
+    /// isn't already in it.
+    void AddToWorkList(Instruction *I) {
+      if (WorklistMap.insert(std::make_pair(I, Worklist.size())).second)
+        Worklist.push_back(I);
+    }
+    
+    // RemoveFromWorkList - remove I from the worklist if it exists.
+    void RemoveFromWorkList(Instruction *I) {
+      DenseMap<Instruction*, unsigned>::iterator It = WorklistMap.find(I);
+      if (It == WorklistMap.end()) return; // Not in worklist.
+      
+      // Don't bother moving everything down, just null out the slot.
+      Worklist[It->second] = 0;
+      
+      WorklistMap.erase(It);
+    }
+    
+    Instruction *RemoveOneFromWorkList() {
+      Instruction *I = Worklist.back();
+      Worklist.pop_back();
+      WorklistMap.erase(I);
+      return I;
+    }
+
+    
+    /// AddUsersToWorkList - When an instruction is simplified, add all users of
+    /// the instruction to the work lists because they might get more simplified
+    /// now.
+    ///
+    void AddUsersToWorkList(Value &I) {
+      for (Value::use_iterator UI = I.use_begin(), UE = I.use_end();
+           UI != UE; ++UI)
+        AddToWorkList(cast<Instruction>(*UI));
+    }
+
+    /// AddUsesToWorkList - When an instruction is simplified, add operands to
+    /// the work lists because they might get more simplified now.
+    ///
+    void AddUsesToWorkList(Instruction &I) {
+      for (User::op_iterator i = I.op_begin(), e = I.op_end(); i != e; ++i)
+        if (Instruction *Op = dyn_cast<Instruction>(*i))
+          AddToWorkList(Op);
+    }
+    
+    /// AddSoonDeadInstToWorklist - The specified instruction is about to become
+    /// dead.  Add all of its operands to the worklist, turning them into
+    /// undef's to reduce the number of uses of those instructions.
+    ///
+    /// Return the specified operand before it is turned into an undef.
+    ///
+    Value *AddSoonDeadInstToWorklist(Instruction &I, unsigned op) {
+      Value *R = I.getOperand(op);
+      
+      for (User::op_iterator i = I.op_begin(), e = I.op_end(); i != e; ++i)
+        if (Instruction *Op = dyn_cast<Instruction>(*i)) {
+          AddToWorkList(Op);
+          // Set the operand to undef to drop the use.
+          *i = UndefValue::get(Op->getType());
+        }
+      
+      return R;
+    }
+
+  public:
+    virtual bool runOnFunction(Function &F);
+    
+    bool DoOneIteration(Function &F, unsigned ItNum);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<TargetData>();
+      AU.addPreservedID(LCSSAID);
+      AU.setPreservesCFG();
+    }
+
+    TargetData &getTargetData() const { return *TD; }
+
+    // Visitation implementation - Implement instruction combining for different
+    // instruction types.  The semantics are as follows:
+    // Return Value:
+    //    null        - No change was made
+    //     I          - Change was made, I is still valid, I may be dead though
+    //   otherwise    - Change was made, replace I with returned instruction
+    //
+    Instruction *visitAdd(BinaryOperator &I);
+    Instruction *visitSub(BinaryOperator &I);
+    Instruction *visitMul(BinaryOperator &I);
+    Instruction *visitURem(BinaryOperator &I);
+    Instruction *visitSRem(BinaryOperator &I);
+    Instruction *visitFRem(BinaryOperator &I);
+    bool SimplifyDivRemOfSelect(BinaryOperator &I);
+    Instruction *commonRemTransforms(BinaryOperator &I);
+    Instruction *commonIRemTransforms(BinaryOperator &I);
+    Instruction *commonDivTransforms(BinaryOperator &I);
+    Instruction *commonIDivTransforms(BinaryOperator &I);
+    Instruction *visitUDiv(BinaryOperator &I);
+    Instruction *visitSDiv(BinaryOperator &I);
+    Instruction *visitFDiv(BinaryOperator &I);
+    Instruction *FoldAndOfICmps(Instruction &I, ICmpInst *LHS, ICmpInst *RHS);
+    Instruction *visitAnd(BinaryOperator &I);
+    Instruction *FoldOrOfICmps(Instruction &I, ICmpInst *LHS, ICmpInst *RHS);
+    Instruction *FoldOrWithConstants(BinaryOperator &I, Value *Op,
+                                     Value *A, Value *B, Value *C);
+    Instruction *visitOr (BinaryOperator &I);
+    Instruction *visitXor(BinaryOperator &I);
+    Instruction *visitShl(BinaryOperator &I);
+    Instruction *visitAShr(BinaryOperator &I);
+    Instruction *visitLShr(BinaryOperator &I);
+    Instruction *commonShiftTransforms(BinaryOperator &I);
+    Instruction *FoldFCmp_IntToFP_Cst(FCmpInst &I, Instruction *LHSI,
+                                      Constant *RHSC);
+    Instruction *visitFCmpInst(FCmpInst &I);
+    Instruction *visitICmpInst(ICmpInst &I);
+    Instruction *visitICmpInstWithCastAndCast(ICmpInst &ICI);
+    Instruction *visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
+                                                Instruction *LHS,
+                                                ConstantInt *RHS);
+    Instruction *FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,
+                                ConstantInt *DivRHS);
+
+    Instruction *FoldGEPICmp(User *GEPLHS, Value *RHS,
+                             ICmpInst::Predicate Cond, Instruction &I);
+    Instruction *FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
+                                     BinaryOperator &I);
+    Instruction *commonCastTransforms(CastInst &CI);
+    Instruction *commonIntCastTransforms(CastInst &CI);
+    Instruction *commonPointerCastTransforms(CastInst &CI);
+    Instruction *visitTrunc(TruncInst &CI);
+    Instruction *visitZExt(ZExtInst &CI);
+    Instruction *visitSExt(SExtInst &CI);
+    Instruction *visitFPTrunc(FPTruncInst &CI);
+    Instruction *visitFPExt(CastInst &CI);
+    Instruction *visitFPToUI(FPToUIInst &FI);
+    Instruction *visitFPToSI(FPToSIInst &FI);
+    Instruction *visitUIToFP(CastInst &CI);
+    Instruction *visitSIToFP(CastInst &CI);
+    Instruction *visitPtrToInt(PtrToIntInst &CI);
+    Instruction *visitIntToPtr(IntToPtrInst &CI);
+    Instruction *visitBitCast(BitCastInst &CI);
+    Instruction *FoldSelectOpOp(SelectInst &SI, Instruction *TI,
+                                Instruction *FI);
+    Instruction *FoldSelectIntoOp(SelectInst &SI, Value*, Value*);
+    Instruction *visitSelectInst(SelectInst &SI);
+    Instruction *visitSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI);
+    Instruction *visitCallInst(CallInst &CI);
+    Instruction *visitInvokeInst(InvokeInst &II);
+    Instruction *visitPHINode(PHINode &PN);
+    Instruction *visitGetElementPtrInst(GetElementPtrInst &GEP);
+    Instruction *visitAllocationInst(AllocationInst &AI);
+    Instruction *visitFreeInst(FreeInst &FI);
+    Instruction *visitLoadInst(LoadInst &LI);
+    Instruction *visitStoreInst(StoreInst &SI);
+    Instruction *visitBranchInst(BranchInst &BI);
+    Instruction *visitSwitchInst(SwitchInst &SI);
+    Instruction *visitInsertElementInst(InsertElementInst &IE);
+    Instruction *visitExtractElementInst(ExtractElementInst &EI);
+    Instruction *visitShuffleVectorInst(ShuffleVectorInst &SVI);
+    Instruction *visitExtractValueInst(ExtractValueInst &EV);
+
+    // visitInstruction - Specify what to return for unhandled instructions...
+    Instruction *visitInstruction(Instruction &I) { return 0; }
+
+  private:
+    Instruction *visitCallSite(CallSite CS);
+    bool transformConstExprCastCall(CallSite CS);
+    Instruction *transformCallThroughTrampoline(CallSite CS);
+    Instruction *transformZExtICmp(ICmpInst *ICI, Instruction &CI,
+                                   bool DoXform = true);
+    bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS);
+    DbgDeclareInst *hasOneUsePlusDeclare(Value *V);
+
+
+  public:
+    // InsertNewInstBefore - insert an instruction New before instruction Old
+    // in the program.  Add the new instruction to the worklist.
+    //
+    Instruction *InsertNewInstBefore(Instruction *New, Instruction &Old) {
+      assert(New && New->getParent() == 0 &&
+             "New instruction already inserted into a basic block!");
+      BasicBlock *BB = Old.getParent();
+      BB->getInstList().insert(&Old, New);  // Insert inst
+      AddToWorkList(New);
+      return New;
+    }
+
+    /// InsertCastBefore - Insert a cast of V to TY before the instruction POS.
+    /// This also adds the cast to the worklist.  Finally, this returns the
+    /// cast.
+    Value *InsertCastBefore(Instruction::CastOps opc, Value *V, const Type *Ty,
+                            Instruction &Pos) {
+      if (V->getType() == Ty) return V;
+
+      if (Constant *CV = dyn_cast<Constant>(V))
+        return ConstantExpr::getCast(opc, CV, Ty);
+      
+      Instruction *C = CastInst::Create(opc, V, Ty, V->getName(), &Pos);
+      AddToWorkList(C);
+      return C;
+    }
+        
+    Value *InsertBitCastBefore(Value *V, const Type *Ty, Instruction &Pos) {
+      return InsertCastBefore(Instruction::BitCast, V, Ty, Pos);
+    }
+
+
+    // ReplaceInstUsesWith - This method is to be used when an instruction is
+    // found to be dead, replacable with another preexisting expression.  Here
+    // we add all uses of I to the worklist, replace all uses of I with the new
+    // value, then return I, so that the inst combiner will know that I was
+    // modified.
+    //
+    Instruction *ReplaceInstUsesWith(Instruction &I, Value *V) {
+      AddUsersToWorkList(I);         // Add all modified instrs to worklist
+      if (&I != V) {
+        I.replaceAllUsesWith(V);
+        return &I;
+      } else {
+        // If we are replacing the instruction with itself, this must be in a
+        // segment of unreachable code, so just clobber the instruction.
+        I.replaceAllUsesWith(UndefValue::get(I.getType()));
+        return &I;
+      }
+    }
+
+    // EraseInstFromFunction - When dealing with an instruction that has side
+    // effects or produces a void value, we can't rely on DCE to delete the
+    // instruction.  Instead, visit methods should return the value returned by
+    // this function.
+    Instruction *EraseInstFromFunction(Instruction &I) {
+      assert(I.use_empty() && "Cannot erase instruction that is used!");
+      AddUsesToWorkList(I);
+      RemoveFromWorkList(&I);
+      I.eraseFromParent();
+      return 0;  // Don't do anything with FI
+    }
+        
+    void ComputeMaskedBits(Value *V, const APInt &Mask, APInt &KnownZero,
+                           APInt &KnownOne, unsigned Depth = 0) const {
+      return llvm::ComputeMaskedBits(V, Mask, KnownZero, KnownOne, TD, Depth);
+    }
+    
+    bool MaskedValueIsZero(Value *V, const APInt &Mask, 
+                           unsigned Depth = 0) const {
+      return llvm::MaskedValueIsZero(V, Mask, TD, Depth);
+    }
+    unsigned ComputeNumSignBits(Value *Op, unsigned Depth = 0) const {
+      return llvm::ComputeNumSignBits(Op, TD, Depth);
+    }
+
+  private:
+
+    /// SimplifyCommutative - This performs a few simplifications for 
+    /// commutative operators.
+    bool SimplifyCommutative(BinaryOperator &I);
+
+    /// SimplifyCompare - This reorders the operands of a CmpInst to get them in
+    /// most-complex to least-complex order.
+    bool SimplifyCompare(CmpInst &I);
+
+    /// SimplifyDemandedUseBits - Attempts to replace V with a simpler value
+    /// based on the demanded bits.
+    Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, 
+                                   APInt& KnownZero, APInt& KnownOne,
+                                   unsigned Depth);
+    bool SimplifyDemandedBits(Use &U, APInt DemandedMask, 
+                              APInt& KnownZero, APInt& KnownOne,
+                              unsigned Depth=0);
+        
+    /// SimplifyDemandedInstructionBits - Inst is an integer instruction that
+    /// SimplifyDemandedBits knows about.  See if the instruction has any
+    /// properties that allow us to simplify its operands.
+    bool SimplifyDemandedInstructionBits(Instruction &Inst);
+        
+    Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
+                                      APInt& UndefElts, unsigned Depth = 0);
+      
+    // FoldOpIntoPhi - Given a binary operator or cast instruction which has a
+    // PHI node as operand #0, see if we can fold the instruction into the PHI
+    // (which is only possible if all operands to the PHI are constants).
+    Instruction *FoldOpIntoPhi(Instruction &I);
+
+    // FoldPHIArgOpIntoPHI - If all operands to a PHI node are the same "unary"
+    // operator and they all are only used by the PHI, PHI together their
+    // inputs, and do the operation once, to the result of the PHI.
+    Instruction *FoldPHIArgOpIntoPHI(PHINode &PN);
+    Instruction *FoldPHIArgBinOpIntoPHI(PHINode &PN);
+    Instruction *FoldPHIArgGEPIntoPHI(PHINode &PN);
+
+    
+    Instruction *OptAndOp(Instruction *Op, ConstantInt *OpRHS,
+                          ConstantInt *AndRHS, BinaryOperator &TheAnd);
+    
+    Value *FoldLogicalPlusAnd(Value *LHS, Value *RHS, ConstantInt *Mask,
+                              bool isSub, Instruction &I);
+    Instruction *InsertRangeTest(Value *V, Constant *Lo, Constant *Hi,
+                                 bool isSigned, bool Inside, Instruction &IB);
+    Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocationInst &AI);
+    Instruction *MatchBSwap(BinaryOperator &I);
+    bool SimplifyStoreAtEndOfBlock(StoreInst &SI);
+    Instruction *SimplifyMemTransfer(MemIntrinsic *MI);
+    Instruction *SimplifyMemSet(MemSetInst *MI);
+
+
+    Value *EvaluateInDifferentType(Value *V, const Type *Ty, bool isSigned);
+
+    bool CanEvaluateInDifferentType(Value *V, const IntegerType *Ty,
+                                    unsigned CastOpc, int &NumCastsRemoved);
+    unsigned GetOrEnforceKnownAlignment(Value *V,
+                                        unsigned PrefAlign = 0);
+
+  };
+}
+
+char InstCombiner::ID = 0;
+static RegisterPass<InstCombiner>
+X("instcombine", "Combine redundant instructions");
+
+// getComplexity:  Assign a complexity or rank value to LLVM Values...
+//   0 -> undef, 1 -> Const, 2 -> Other, 3 -> Arg, 3 -> Unary, 4 -> OtherInst
+static unsigned getComplexity(Value *V) {
+  if (isa<Instruction>(V)) {
+    if (BinaryOperator::isNeg(V) || BinaryOperator::isNot(V))
+      return 3;
+    return 4;
+  }
+  if (isa<Argument>(V)) return 3;
+  return isa<Constant>(V) ? (isa<UndefValue>(V) ? 0 : 1) : 2;
+}
+
+// isOnlyUse - Return true if this instruction will be deleted if we stop using
+// it.
+static bool isOnlyUse(Value *V) {
+  return V->hasOneUse() || isa<Constant>(V);
+}
+
+// getPromotedType - Return the specified type promoted as it would be to pass
+// though a va_arg area...
+static const Type *getPromotedType(const Type *Ty) {
+  if (const IntegerType* ITy = dyn_cast<IntegerType>(Ty)) {
+    if (ITy->getBitWidth() < 32)
+      return Type::Int32Ty;
+  }
+  return Ty;
+}
+
+/// getBitCastOperand - If the specified operand is a CastInst, a constant
+/// expression bitcast, or a GetElementPtrInst with all zero indices, return the
+/// operand value, otherwise return null.
+static Value *getBitCastOperand(Value *V) {
+  if (BitCastInst *I = dyn_cast<BitCastInst>(V))
+    // BitCastInst?
+    return I->getOperand(0);
+  else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(V)) {
+    // GetElementPtrInst?
+    if (GEP->hasAllZeroIndices())
+      return GEP->getOperand(0);
+  } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
+    if (CE->getOpcode() == Instruction::BitCast)
+      // BitCast ConstantExp?
+      return CE->getOperand(0);
+    else if (CE->getOpcode() == Instruction::GetElementPtr) {
+      // GetElementPtr ConstantExp?
+      for (User::op_iterator I = CE->op_begin() + 1, E = CE->op_end();
+           I != E; ++I) {
+        ConstantInt *CI = dyn_cast<ConstantInt>(I);
+        if (!CI || !CI->isZero())
+          // Any non-zero indices? Not cast-like.
+          return 0;
+      }
+      // All-zero indices? This is just like casting.
+      return CE->getOperand(0);
+    }
+  }
+  return 0;
+}
+
+/// This function is a wrapper around CastInst::isEliminableCastPair. It
+/// simply extracts arguments and returns what that function returns.
+static Instruction::CastOps 
+isEliminableCastPair(
+  const CastInst *CI, ///< The first cast instruction
+  unsigned opcode,       ///< The opcode of the second cast instruction
+  const Type *DstTy,     ///< The target type for the second cast instruction
+  TargetData *TD         ///< The target data for pointer size
+) {
+  
+  const Type *SrcTy = CI->getOperand(0)->getType();   // A from above
+  const Type *MidTy = CI->getType();                  // B from above
+
+  // Get the opcodes of the two Cast instructions
+  Instruction::CastOps firstOp = Instruction::CastOps(CI->getOpcode());
+  Instruction::CastOps secondOp = Instruction::CastOps(opcode);
+
+  unsigned Res = CastInst::isEliminableCastPair(firstOp, secondOp, SrcTy, MidTy,
+                                                DstTy, TD->getIntPtrType());
+  
+  // We don't want to form an inttoptr or ptrtoint that converts to an integer
+  // type that differs from the pointer size.
+  if ((Res == Instruction::IntToPtr && SrcTy != TD->getIntPtrType()) ||
+      (Res == Instruction::PtrToInt && DstTy != TD->getIntPtrType()))
+    Res = 0;
+  
+  return Instruction::CastOps(Res);
+}
+
+/// ValueRequiresCast - Return true if the cast from "V to Ty" actually results
+/// in any code being generated.  It does not require codegen if V is simple
+/// enough or if the cast can be folded into other casts.
+static bool ValueRequiresCast(Instruction::CastOps opcode, const Value *V, 
+                              const Type *Ty, TargetData *TD) {
+  if (V->getType() == Ty || isa<Constant>(V)) return false;
+  
+  // If this is another cast that can be eliminated, it isn't codegen either.
+  if (const CastInst *CI = dyn_cast<CastInst>(V))
+    if (isEliminableCastPair(CI, opcode, Ty, TD)) 
+      return false;
+  return true;
+}
+
+// SimplifyCommutative - This performs a few simplifications for commutative
+// operators:
+//
+//  1. Order operands such that they are listed from right (least complex) to
+//     left (most complex).  This puts constants before unary operators before
+//     binary operators.
+//
+//  2. Transform: (op (op V, C1), C2) ==> (op V, (op C1, C2))
+//  3. Transform: (op (op V1, C1), (op V2, C2)) ==> (op (op V1, V2), (op C1,C2))
+//
+bool InstCombiner::SimplifyCommutative(BinaryOperator &I) {
+  bool Changed = false;
+  if (getComplexity(I.getOperand(0)) < getComplexity(I.getOperand(1)))
+    Changed = !I.swapOperands();
+
+  if (!I.isAssociative()) return Changed;
+  Instruction::BinaryOps Opcode = I.getOpcode();
+  if (BinaryOperator *Op = dyn_cast<BinaryOperator>(I.getOperand(0)))
+    if (Op->getOpcode() == Opcode && isa<Constant>(Op->getOperand(1))) {
+      if (isa<Constant>(I.getOperand(1))) {
+        Constant *Folded = ConstantExpr::get(I.getOpcode(),
+                                             cast<Constant>(I.getOperand(1)),
+                                             cast<Constant>(Op->getOperand(1)));
+        I.setOperand(0, Op->getOperand(0));
+        I.setOperand(1, Folded);
+        return true;
+      } else if (BinaryOperator *Op1=dyn_cast<BinaryOperator>(I.getOperand(1)))
+        if (Op1->getOpcode() == Opcode && isa<Constant>(Op1->getOperand(1)) &&
+            isOnlyUse(Op) && isOnlyUse(Op1)) {
+          Constant *C1 = cast<Constant>(Op->getOperand(1));
+          Constant *C2 = cast<Constant>(Op1->getOperand(1));
+
+          // Fold (op (op V1, C1), (op V2, C2)) ==> (op (op V1, V2), (op C1,C2))
+          Constant *Folded = ConstantExpr::get(I.getOpcode(), C1, C2);
+          Instruction *New = BinaryOperator::Create(Opcode, Op->getOperand(0),
+                                                    Op1->getOperand(0),
+                                                    Op1->getName(), &I);
+          AddToWorkList(New);
+          I.setOperand(0, New);
+          I.setOperand(1, Folded);
+          return true;
+        }
+    }
+  return Changed;
+}
+
+/// SimplifyCompare - For a CmpInst this function just orders the operands
+/// so that theyare listed from right (least complex) to left (most complex).
+/// This puts constants before unary operators before binary operators.
+bool InstCombiner::SimplifyCompare(CmpInst &I) {
+  if (getComplexity(I.getOperand(0)) >= getComplexity(I.getOperand(1)))
+    return false;
+  I.swapOperands();
+  // Compare instructions are not associative so there's nothing else we can do.
+  return true;
+}
+
+// dyn_castNegVal - Given a 'sub' instruction, return the RHS of the instruction
+// if the LHS is a constant zero (which is the 'negate' form).
+//
+static inline Value *dyn_castNegVal(Value *V) {
+  if (BinaryOperator::isNeg(V))
+    return BinaryOperator::getNegArgument(V);
+
+  // Constants can be considered to be negated values if they can be folded.
+  if (ConstantInt *C = dyn_cast<ConstantInt>(V))
+    return ConstantExpr::getNeg(C);
+
+  if (ConstantVector *C = dyn_cast<ConstantVector>(V))
+    if (C->getType()->getElementType()->isInteger())
+      return ConstantExpr::getNeg(C);
+
+  return 0;
+}
+
+static inline Value *dyn_castNotVal(Value *V) {
+  if (BinaryOperator::isNot(V))
+    return BinaryOperator::getNotArgument(V);
+
+  // Constants can be considered to be not'ed values...
+  if (ConstantInt *C = dyn_cast<ConstantInt>(V))
+    return ConstantInt::get(~C->getValue());
+  return 0;
+}
+
+// dyn_castFoldableMul - If this value is a multiply that can be folded into
+// other computations (because it has a constant operand), return the
+// non-constant operand of the multiply, and set CST to point to the multiplier.
+// Otherwise, return null.
+//
+static inline Value *dyn_castFoldableMul(Value *V, ConstantInt *&CST) {
+  if (V->hasOneUse() && V->getType()->isInteger())
+    if (Instruction *I = dyn_cast<Instruction>(V)) {
+      if (I->getOpcode() == Instruction::Mul)
+        if ((CST = dyn_cast<ConstantInt>(I->getOperand(1))))
+          return I->getOperand(0);
+      if (I->getOpcode() == Instruction::Shl)
+        if ((CST = dyn_cast<ConstantInt>(I->getOperand(1)))) {
+          // The multiplier is really 1 << CST.
+          uint32_t BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
+          uint32_t CSTVal = CST->getLimitedValue(BitWidth);
+          CST = ConstantInt::get(APInt(BitWidth, 1).shl(CSTVal));
+          return I->getOperand(0);
+        }
+    }
+  return 0;
+}
+
+/// dyn_castGetElementPtr - If this is a getelementptr instruction or constant
+/// expression, return it.
+static User *dyn_castGetElementPtr(Value *V) {
+  if (isa<GetElementPtrInst>(V)) return cast<User>(V);
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+    if (CE->getOpcode() == Instruction::GetElementPtr)
+      return cast<User>(V);
+  return false;
+}
+
+/// getOpcode - If this is an Instruction or a ConstantExpr, return the
+/// opcode value. Otherwise return UserOp1.
+static unsigned getOpcode(const Value *V) {
+  if (const Instruction *I = dyn_cast<Instruction>(V))
+    return I->getOpcode();
+  if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+    return CE->getOpcode();
+  // Use UserOp1 to mean there's no opcode.
+  return Instruction::UserOp1;
+}
+
+/// AddOne - Add one to a ConstantInt
+static ConstantInt *AddOne(ConstantInt *C) {
+  APInt Val(C->getValue());
+  return ConstantInt::get(++Val);
+}
+/// SubOne - Subtract one from a ConstantInt
+static ConstantInt *SubOne(ConstantInt *C) {
+  APInt Val(C->getValue());
+  return ConstantInt::get(--Val);
+}
+/// Add - Add two ConstantInts together
+static ConstantInt *Add(ConstantInt *C1, ConstantInt *C2) {
+  return ConstantInt::get(C1->getValue() + C2->getValue());
+}
+/// And - Bitwise AND two ConstantInts together
+static ConstantInt *And(ConstantInt *C1, ConstantInt *C2) {
+  return ConstantInt::get(C1->getValue() & C2->getValue());
+}
+/// Subtract - Subtract one ConstantInt from another
+static ConstantInt *Subtract(ConstantInt *C1, ConstantInt *C2) {
+  return ConstantInt::get(C1->getValue() - C2->getValue());
+}
+/// Multiply - Multiply two ConstantInts together
+static ConstantInt *Multiply(ConstantInt *C1, ConstantInt *C2) {
+  return ConstantInt::get(C1->getValue() * C2->getValue());
+}
+/// MultiplyOverflows - True if the multiply can not be expressed in an int
+/// this size.
+static bool MultiplyOverflows(ConstantInt *C1, ConstantInt *C2, bool sign) {
+  uint32_t W = C1->getBitWidth();
+  APInt LHSExt = C1->getValue(), RHSExt = C2->getValue();
+  if (sign) {
+    LHSExt.sext(W * 2);
+    RHSExt.sext(W * 2);
+  } else {
+    LHSExt.zext(W * 2);
+    RHSExt.zext(W * 2);
+  }
+
+  APInt MulExt = LHSExt * RHSExt;
+
+  if (sign) {
+    APInt Min = APInt::getSignedMinValue(W).sext(W * 2);
+    APInt Max = APInt::getSignedMaxValue(W).sext(W * 2);
+    return MulExt.slt(Min) || MulExt.sgt(Max);
+  } else 
+    return MulExt.ugt(APInt::getLowBitsSet(W * 2, W));
+}
+
+
+/// ShrinkDemandedConstant - Check to see if the specified operand of the 
+/// specified instruction is a constant integer.  If so, check to see if there
+/// are any bits set in the constant that are not demanded.  If so, shrink the
+/// constant and return true.
+static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo, 
+                                   APInt Demanded) {
+  assert(I && "No instruction?");
+  assert(OpNo < I->getNumOperands() && "Operand index too large");
+
+  // If the operand is not a constant integer, nothing to do.
+  ConstantInt *OpC = dyn_cast<ConstantInt>(I->getOperand(OpNo));
+  if (!OpC) return false;
+
+  // If there are no bits set that aren't demanded, nothing to do.
+  Demanded.zextOrTrunc(OpC->getValue().getBitWidth());
+  if ((~Demanded & OpC->getValue()) == 0)
+    return false;
+
+  // This instruction is producing bits that are not demanded. Shrink the RHS.
+  Demanded &= OpC->getValue();
+  I->setOperand(OpNo, ConstantInt::get(Demanded));
+  return true;
+}
+
+// ComputeSignedMinMaxValuesFromKnownBits - Given a signed integer type and a 
+// set of known zero and one bits, compute the maximum and minimum values that
+// could have the specified known zero and known one bits, returning them in
+// min/max.
+static void ComputeSignedMinMaxValuesFromKnownBits(const APInt& KnownZero,
+                                                   const APInt& KnownOne,
+                                                   APInt& Min, APInt& Max) {
+  assert(KnownZero.getBitWidth() == KnownOne.getBitWidth() &&
+         KnownZero.getBitWidth() == Min.getBitWidth() &&
+         KnownZero.getBitWidth() == Max.getBitWidth() &&
+         "KnownZero, KnownOne and Min, Max must have equal bitwidth.");
+  APInt UnknownBits = ~(KnownZero|KnownOne);
+
+  // The minimum value is when all unknown bits are zeros, EXCEPT for the sign
+  // bit if it is unknown.
+  Min = KnownOne;
+  Max = KnownOne|UnknownBits;
+  
+  if (UnknownBits.isNegative()) { // Sign bit is unknown
+    Min.set(Min.getBitWidth()-1);
+    Max.clear(Max.getBitWidth()-1);
+  }
+}
+
+// ComputeUnsignedMinMaxValuesFromKnownBits - Given an unsigned integer type and
+// a set of known zero and one bits, compute the maximum and minimum values that
+// could have the specified known zero and known one bits, returning them in
+// min/max.
+static void ComputeUnsignedMinMaxValuesFromKnownBits(const APInt &KnownZero,
+                                                     const APInt &KnownOne,
+                                                     APInt &Min, APInt &Max) {
+  assert(KnownZero.getBitWidth() == KnownOne.getBitWidth() &&
+         KnownZero.getBitWidth() == Min.getBitWidth() &&
+         KnownZero.getBitWidth() == Max.getBitWidth() &&
+         "Ty, KnownZero, KnownOne and Min, Max must have equal bitwidth.");
+  APInt UnknownBits = ~(KnownZero|KnownOne);
+  
+  // The minimum value is when the unknown bits are all zeros.
+  Min = KnownOne;
+  // The maximum value is when the unknown bits are all ones.
+  Max = KnownOne|UnknownBits;
+}
+
+/// SimplifyDemandedInstructionBits - Inst is an integer instruction that
+/// SimplifyDemandedBits knows about.  See if the instruction has any
+/// properties that allow us to simplify its operands.
+bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) {
+  unsigned BitWidth = cast<IntegerType>(Inst.getType())->getBitWidth();
+  APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
+  APInt DemandedMask(APInt::getAllOnesValue(BitWidth));
+  
+  Value *V = SimplifyDemandedUseBits(&Inst, DemandedMask, 
+                                     KnownZero, KnownOne, 0);
+  if (V == 0) return false;
+  if (V == &Inst) return true;
+  ReplaceInstUsesWith(Inst, V);
+  return true;
+}
+
+/// SimplifyDemandedBits - This form of SimplifyDemandedBits simplifies the
+/// specified instruction operand if possible, updating it in place.  It returns
+/// true if it made any change and false otherwise.
+bool InstCombiner::SimplifyDemandedBits(Use &U, APInt DemandedMask, 
+                                        APInt &KnownZero, APInt &KnownOne,
+                                        unsigned Depth) {
+  Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask,
+                                          KnownZero, KnownOne, Depth);
+  if (NewVal == 0) return false;
+  U.set(NewVal);
+  return true;
+}
+
+
+/// SimplifyDemandedUseBits - This function attempts to replace V with a simpler
+/// value based on the demanded bits.  When this function is called, it is known
+/// that only the bits set in DemandedMask of the result of V are ever used
+/// downstream. Consequently, depending on the mask and V, it may be possible
+/// to replace V with a constant or one of its operands. In such cases, this
+/// function does the replacement and returns true. In all other cases, it
+/// returns false after analyzing the expression and setting KnownOne and known
+/// to be one in the expression.  KnownZero contains all the bits that are known
+/// to be zero in the expression. These are provided to potentially allow the
+/// caller (which might recursively be SimplifyDemandedBits itself) to simplify
+/// the expression. KnownOne and KnownZero always follow the invariant that 
+/// KnownOne & KnownZero == 0. That is, a bit can't be both 1 and 0. Note that
+/// the bits in KnownOne and KnownZero may only be accurate for those bits set
+/// in DemandedMask. Note also that the bitwidth of V, DemandedMask, KnownZero
+/// and KnownOne must all be the same.
+///
+/// This returns null if it did not change anything and it permits no
+/// simplification.  This returns V itself if it did some simplification of V's
+/// operands based on the information about what bits are demanded. This returns
+/// some other non-null value if it found out that V is equal to another value
+/// in the context where the specified bits are demanded, but not for all users.
+Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
+                                             APInt &KnownZero, APInt &KnownOne,
+                                             unsigned Depth) {
+  assert(V != 0 && "Null pointer of Value???");
+  assert(Depth <= 6 && "Limit Search Depth");
+  uint32_t BitWidth = DemandedMask.getBitWidth();
+  const Type *VTy = V->getType();
+  assert((TD || !isa<PointerType>(VTy)) &&
+         "SimplifyDemandedBits needs to know bit widths!");
+  assert((!TD || TD->getTypeSizeInBits(VTy) == BitWidth) &&
+         (!isa<IntegerType>(VTy) ||
+          VTy->getPrimitiveSizeInBits() == BitWidth) &&
+         KnownZero.getBitWidth() == BitWidth &&
+         KnownOne.getBitWidth() == BitWidth &&
+         "Value *V, DemandedMask, KnownZero and KnownOne \
+          must have same BitWidth");
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+    // We know all of the bits for a constant!
+    KnownOne = CI->getValue() & DemandedMask;
+    KnownZero = ~KnownOne & DemandedMask;
+    return 0;
+  }
+  if (isa<ConstantPointerNull>(V)) {
+    // We know all of the bits for a constant!
+    KnownOne.clear();
+    KnownZero = DemandedMask;
+    return 0;
+  }
+
+  KnownZero.clear();
+  KnownOne.clear();
+  if (DemandedMask == 0) {   // Not demanding any bits from V.
+    if (isa<UndefValue>(V))
+      return 0;
+    return UndefValue::get(VTy);
+  }
+  
+  if (Depth == 6)        // Limit search depth.
+    return 0;
+  
+  APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0);
+  APInt &RHSKnownZero = KnownZero, &RHSKnownOne = KnownOne;
+
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) {
+    ComputeMaskedBits(V, DemandedMask, RHSKnownZero, RHSKnownOne, Depth);
+    return 0;        // Only analyze instructions.
+  }
+
+  // If there are multiple uses of this value and we aren't at the root, then
+  // we can't do any simplifications of the operands, because DemandedMask
+  // only reflects the bits demanded by *one* of the users.
+  if (Depth != 0 && !I->hasOneUse()) {
+    // Despite the fact that we can't simplify this instruction in all User's
+    // context, we can at least compute the knownzero/knownone bits, and we can
+    // do simplifications that apply to *just* the one user if we know that
+    // this instruction has a simpler value in that context.
+    if (I->getOpcode() == Instruction::And) {
+      // If either the LHS or the RHS are Zero, the result is zero.
+      ComputeMaskedBits(I->getOperand(1), DemandedMask,
+                        RHSKnownZero, RHSKnownOne, Depth+1);
+      ComputeMaskedBits(I->getOperand(0), DemandedMask & ~RHSKnownZero,
+                        LHSKnownZero, LHSKnownOne, Depth+1);
+      
+      // If all of the demanded bits are known 1 on one side, return the other.
+      // These bits cannot contribute to the result of the 'and' in this
+      // context.
+      if ((DemandedMask & ~LHSKnownZero & RHSKnownOne) == 
+          (DemandedMask & ~LHSKnownZero))
+        return I->getOperand(0);
+      if ((DemandedMask & ~RHSKnownZero & LHSKnownOne) == 
+          (DemandedMask & ~RHSKnownZero))
+        return I->getOperand(1);
+      
+      // If all of the demanded bits in the inputs are known zeros, return zero.
+      if ((DemandedMask & (RHSKnownZero|LHSKnownZero)) == DemandedMask)
+        return Constant::getNullValue(VTy);
+      
+    } else if (I->getOpcode() == Instruction::Or) {
+      // We can simplify (X|Y) -> X or Y in the user's context if we know that
+      // only bits from X or Y are demanded.
+      
+      // If either the LHS or the RHS are One, the result is One.
+      ComputeMaskedBits(I->getOperand(1), DemandedMask, 
+                        RHSKnownZero, RHSKnownOne, Depth+1);
+      ComputeMaskedBits(I->getOperand(0), DemandedMask & ~RHSKnownOne, 
+                        LHSKnownZero, LHSKnownOne, Depth+1);
+      
+      // If all of the demanded bits are known zero on one side, return the
+      // other.  These bits cannot contribute to the result of the 'or' in this
+      // context.
+      if ((DemandedMask & ~LHSKnownOne & RHSKnownZero) == 
+          (DemandedMask & ~LHSKnownOne))
+        return I->getOperand(0);
+      if ((DemandedMask & ~RHSKnownOne & LHSKnownZero) == 
+          (DemandedMask & ~RHSKnownOne))
+        return I->getOperand(1);
+      
+      // If all of the potentially set bits on one side are known to be set on
+      // the other side, just use the 'other' side.
+      if ((DemandedMask & (~RHSKnownZero) & LHSKnownOne) == 
+          (DemandedMask & (~RHSKnownZero)))
+        return I->getOperand(0);
+      if ((DemandedMask & (~LHSKnownZero) & RHSKnownOne) == 
+          (DemandedMask & (~LHSKnownZero)))
+        return I->getOperand(1);
+    }
+    
+    // Compute the KnownZero/KnownOne bits to simplify things downstream.
+    ComputeMaskedBits(I, DemandedMask, KnownZero, KnownOne, Depth);
+    return 0;
+  }
+  
+  // If this is the root being simplified, allow it to have multiple uses,
+  // just set the DemandedMask to all bits so that we can try to simplify the
+  // operands.  This allows visitTruncInst (for example) to simplify the
+  // operand of a trunc without duplicating all the logic below.
+  if (Depth == 0 && !V->hasOneUse())
+    DemandedMask = APInt::getAllOnesValue(BitWidth);
+  
+  switch (I->getOpcode()) {
+  default:
+    ComputeMaskedBits(I, DemandedMask, RHSKnownZero, RHSKnownOne, Depth);
+    break;
+  case Instruction::And:
+    // If either the LHS or the RHS are Zero, the result is zero.
+    if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask,
+                             RHSKnownZero, RHSKnownOne, Depth+1) ||
+        SimplifyDemandedBits(I->getOperandUse(0), DemandedMask & ~RHSKnownZero,
+                             LHSKnownZero, LHSKnownOne, Depth+1))
+      return I;
+    assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?"); 
+    assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?"); 
+
+    // If all of the demanded bits are known 1 on one side, return the other.
+    // These bits cannot contribute to the result of the 'and'.
+    if ((DemandedMask & ~LHSKnownZero & RHSKnownOne) == 
+        (DemandedMask & ~LHSKnownZero))
+      return I->getOperand(0);
+    if ((DemandedMask & ~RHSKnownZero & LHSKnownOne) == 
+        (DemandedMask & ~RHSKnownZero))
+      return I->getOperand(1);
+    
+    // If all of the demanded bits in the inputs are known zeros, return zero.
+    if ((DemandedMask & (RHSKnownZero|LHSKnownZero)) == DemandedMask)
+      return Constant::getNullValue(VTy);
+      
+    // If the RHS is a constant, see if we can simplify it.
+    if (ShrinkDemandedConstant(I, 1, DemandedMask & ~LHSKnownZero))
+      return I;
+      
+    // Output known-1 bits are only known if set in both the LHS & RHS.
+    RHSKnownOne &= LHSKnownOne;
+    // Output known-0 are known to be clear if zero in either the LHS | RHS.
+    RHSKnownZero |= LHSKnownZero;
+    break;
+  case Instruction::Or:
+    // If either the LHS or the RHS are One, the result is One.
+    if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, 
+                             RHSKnownZero, RHSKnownOne, Depth+1) ||
+        SimplifyDemandedBits(I->getOperandUse(0), DemandedMask & ~RHSKnownOne, 
+                             LHSKnownZero, LHSKnownOne, Depth+1))
+      return I;
+    assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?"); 
+    assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?"); 
+    
+    // If all of the demanded bits are known zero on one side, return the other.
+    // These bits cannot contribute to the result of the 'or'.
+    if ((DemandedMask & ~LHSKnownOne & RHSKnownZero) == 
+        (DemandedMask & ~LHSKnownOne))
+      return I->getOperand(0);
+    if ((DemandedMask & ~RHSKnownOne & LHSKnownZero) == 
+        (DemandedMask & ~RHSKnownOne))
+      return I->getOperand(1);
+
+    // If all of the potentially set bits on one side are known to be set on
+    // the other side, just use the 'other' side.
+    if ((DemandedMask & (~RHSKnownZero) & LHSKnownOne) == 
+        (DemandedMask & (~RHSKnownZero)))
+      return I->getOperand(0);
+    if ((DemandedMask & (~LHSKnownZero) & RHSKnownOne) == 
+        (DemandedMask & (~LHSKnownZero)))
+      return I->getOperand(1);
+        
+    // If the RHS is a constant, see if we can simplify it.
+    if (ShrinkDemandedConstant(I, 1, DemandedMask))
+      return I;
+          
+    // Output known-0 bits are only known if clear in both the LHS & RHS.
+    RHSKnownZero &= LHSKnownZero;
+    // Output known-1 are known to be set if set in either the LHS | RHS.
+    RHSKnownOne |= LHSKnownOne;
+    break;
+  case Instruction::Xor: {
+    if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask,
+                             RHSKnownZero, RHSKnownOne, Depth+1) ||
+        SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, 
+                             LHSKnownZero, LHSKnownOne, Depth+1))
+      return I;
+    assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?"); 
+    assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?"); 
+    
+    // If all of the demanded bits are known zero on one side, return the other.
+    // These bits cannot contribute to the result of the 'xor'.
+    if ((DemandedMask & RHSKnownZero) == DemandedMask)
+      return I->getOperand(0);
+    if ((DemandedMask & LHSKnownZero) == DemandedMask)
+      return I->getOperand(1);
+    
+    // Output known-0 bits are known if clear or set in both the LHS & RHS.
+    APInt KnownZeroOut = (RHSKnownZero & LHSKnownZero) | 
+                         (RHSKnownOne & LHSKnownOne);
+    // Output known-1 are known to be set if set in only one of the LHS, RHS.
+    APInt KnownOneOut = (RHSKnownZero & LHSKnownOne) | 
+                        (RHSKnownOne & LHSKnownZero);
+    
+    // If all of the demanded bits are known to be zero on one side or the
+    // other, turn this into an *inclusive* or.
+    //    e.g. (A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0
+    if ((DemandedMask & ~RHSKnownZero & ~LHSKnownZero) == 0) {
+      Instruction *Or =
+        BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1),
+                                 I->getName());
+      return InsertNewInstBefore(Or, *I);
+    }
+    
+    // If all of the demanded bits on one side are known, and all of the set
+    // bits on that side are also known to be set on the other side, turn this
+    // into an AND, as we know the bits will be cleared.
+    //    e.g. (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2
+    if ((DemandedMask & (RHSKnownZero|RHSKnownOne)) == DemandedMask) { 
+      // all known
+      if ((RHSKnownOne & LHSKnownOne) == RHSKnownOne) {
+        Constant *AndC = ConstantInt::get(~RHSKnownOne & DemandedMask);
+        Instruction *And = 
+          BinaryOperator::CreateAnd(I->getOperand(0), AndC, "tmp");
+        return InsertNewInstBefore(And, *I);
+      }
+    }
+    
+    // If the RHS is a constant, see if we can simplify it.
+    // FIXME: for XOR, we prefer to force bits to 1 if they will make a -1.
+    if (ShrinkDemandedConstant(I, 1, DemandedMask))
+      return I;
+    
+    RHSKnownZero = KnownZeroOut;
+    RHSKnownOne  = KnownOneOut;
+    break;
+  }
+  case Instruction::Select:
+    if (SimplifyDemandedBits(I->getOperandUse(2), DemandedMask,
+                             RHSKnownZero, RHSKnownOne, Depth+1) ||
+        SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, 
+                             LHSKnownZero, LHSKnownOne, Depth+1))
+      return I;
+    assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?"); 
+    assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?"); 
+    
+    // If the operands are constants, see if we can simplify them.
+    if (ShrinkDemandedConstant(I, 1, DemandedMask) ||
+        ShrinkDemandedConstant(I, 2, DemandedMask))
+      return I;
+    
+    // Only known if known in both the LHS and RHS.
+    RHSKnownOne &= LHSKnownOne;
+    RHSKnownZero &= LHSKnownZero;
+    break;
+  case Instruction::Trunc: {
+    unsigned truncBf = I->getOperand(0)->getType()->getPrimitiveSizeInBits();
+    DemandedMask.zext(truncBf);
+    RHSKnownZero.zext(truncBf);
+    RHSKnownOne.zext(truncBf);
+    if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, 
+                             RHSKnownZero, RHSKnownOne, Depth+1))
+      return I;
+    DemandedMask.trunc(BitWidth);
+    RHSKnownZero.trunc(BitWidth);
+    RHSKnownOne.trunc(BitWidth);
+    assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?"); 
+    break;
+  }
+  case Instruction::BitCast:
+    if (!I->getOperand(0)->getType()->isInteger())
+      return false;  // vector->int or fp->int?
+    if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask,
+                             RHSKnownZero, RHSKnownOne, Depth+1))
+      return I;
+    assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?"); 
+    break;
+  case Instruction::ZExt: {
+    // Compute the bits in the result that are not present in the input.
+    unsigned SrcBitWidth =I->getOperand(0)->getType()->getPrimitiveSizeInBits();
+    
+    DemandedMask.trunc(SrcBitWidth);
+    RHSKnownZero.trunc(SrcBitWidth);
+    RHSKnownOne.trunc(SrcBitWidth);
+    if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask,
+                             RHSKnownZero, RHSKnownOne, Depth+1))
+      return I;
+    DemandedMask.zext(BitWidth);
+    RHSKnownZero.zext(BitWidth);
+    RHSKnownOne.zext(BitWidth);
+    assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?"); 
+    // The top bits are known to be zero.
+    RHSKnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth);
+    break;
+  }
+  case Instruction::SExt: {
+    // Compute the bits in the result that are not present in the input.
+    unsigned SrcBitWidth =I->getOperand(0)->getType()->getPrimitiveSizeInBits();
+    
+    APInt InputDemandedBits = DemandedMask & 
+                              APInt::getLowBitsSet(BitWidth, SrcBitWidth);
+
+    APInt NewBits(APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth));
+    // If any of the sign extended bits are demanded, we know that the sign
+    // bit is demanded.
+    if ((NewBits & DemandedMask) != 0)
+      InputDemandedBits.set(SrcBitWidth-1);
+      
+    InputDemandedBits.trunc(SrcBitWidth);
+    RHSKnownZero.trunc(SrcBitWidth);
+    RHSKnownOne.trunc(SrcBitWidth);
+    if (SimplifyDemandedBits(I->getOperandUse(0), InputDemandedBits,
+                             RHSKnownZero, RHSKnownOne, Depth+1))
+      return I;
+    InputDemandedBits.zext(BitWidth);
+    RHSKnownZero.zext(BitWidth);
+    RHSKnownOne.zext(BitWidth);
+    assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?"); 
+      
+    // If the sign bit of the input is known set or clear, then we know the
+    // top bits of the result.
+
+    // If the input sign bit is known zero, or if the NewBits are not demanded
+    // convert this into a zero extension.
+    if (RHSKnownZero[SrcBitWidth-1] || (NewBits & ~DemandedMask) == NewBits) {
+      // Convert to ZExt cast
+      CastInst *NewCast = new ZExtInst(I->getOperand(0), VTy, I->getName());
+      return InsertNewInstBefore(NewCast, *I);
+    } else if (RHSKnownOne[SrcBitWidth-1]) {    // Input sign bit known set
+      RHSKnownOne |= NewBits;
+    }
+    break;
+  }
+  case Instruction::Add: {
+    // Figure out what the input bits are.  If the top bits of the and result
+    // are not demanded, then the add doesn't demand them from its input
+    // either.
+    unsigned NLZ = DemandedMask.countLeadingZeros();
+      
+    // If there is a constant on the RHS, there are a variety of xformations
+    // we can do.
+    if (ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      // If null, this should be simplified elsewhere.  Some of the xforms here
+      // won't work if the RHS is zero.
+      if (RHS->isZero())
+        break;
+      
+      // If the top bit of the output is demanded, demand everything from the
+      // input.  Otherwise, we demand all the input bits except NLZ top bits.
+      APInt InDemandedBits(APInt::getLowBitsSet(BitWidth, BitWidth - NLZ));
+
+      // Find information about known zero/one bits in the input.
+      if (SimplifyDemandedBits(I->getOperandUse(0), InDemandedBits, 
+                               LHSKnownZero, LHSKnownOne, Depth+1))
+        return I;
+
+      // If the RHS of the add has bits set that can't affect the input, reduce
+      // the constant.
+      if (ShrinkDemandedConstant(I, 1, InDemandedBits))
+        return I;
+      
+      // Avoid excess work.
+      if (LHSKnownZero == 0 && LHSKnownOne == 0)
+        break;
+      
+      // Turn it into OR if input bits are zero.
+      if ((LHSKnownZero & RHS->getValue()) == RHS->getValue()) {
+        Instruction *Or =
+          BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1),
+                                   I->getName());
+        return InsertNewInstBefore(Or, *I);
+      }
+      
+      // We can say something about the output known-zero and known-one bits,
+      // depending on potential carries from the input constant and the
+      // unknowns.  For example if the LHS is known to have at most the 0x0F0F0
+      // bits set and the RHS constant is 0x01001, then we know we have a known
+      // one mask of 0x00001 and a known zero mask of 0xE0F0E.
+      
+      // To compute this, we first compute the potential carry bits.  These are
+      // the bits which may be modified.  I'm not aware of a better way to do
+      // this scan.
+      const APInt &RHSVal = RHS->getValue();
+      APInt CarryBits((~LHSKnownZero + RHSVal) ^ (~LHSKnownZero ^ RHSVal));
+      
+      // Now that we know which bits have carries, compute the known-1/0 sets.
+      
+      // Bits are known one if they are known zero in one operand and one in the
+      // other, and there is no input carry.
+      RHSKnownOne = ((LHSKnownZero & RHSVal) | 
+                     (LHSKnownOne & ~RHSVal)) & ~CarryBits;
+      
+      // Bits are known zero if they are known zero in both operands and there
+      // is no input carry.
+      RHSKnownZero = LHSKnownZero & ~RHSVal & ~CarryBits;
+    } else {
+      // If the high-bits of this ADD are not demanded, then it does not demand
+      // the high bits of its LHS or RHS.
+      if (DemandedMask[BitWidth-1] == 0) {
+        // Right fill the mask of bits for this ADD to demand the most
+        // significant bit and all those below it.
+        APInt DemandedFromOps(APInt::getLowBitsSet(BitWidth, BitWidth-NLZ));
+        if (SimplifyDemandedBits(I->getOperandUse(0), DemandedFromOps,
+                                 LHSKnownZero, LHSKnownOne, Depth+1) ||
+            SimplifyDemandedBits(I->getOperandUse(1), DemandedFromOps,
+                                 LHSKnownZero, LHSKnownOne, Depth+1))
+          return I;
+      }
+    }
+    break;
+  }
+  case Instruction::Sub:
+    // If the high-bits of this SUB are not demanded, then it does not demand
+    // the high bits of its LHS or RHS.
+    if (DemandedMask[BitWidth-1] == 0) {
+      // Right fill the mask of bits for this SUB to demand the most
+      // significant bit and all those below it.
+      uint32_t NLZ = DemandedMask.countLeadingZeros();
+      APInt DemandedFromOps(APInt::getLowBitsSet(BitWidth, BitWidth-NLZ));
+      if (SimplifyDemandedBits(I->getOperandUse(0), DemandedFromOps,
+                               LHSKnownZero, LHSKnownOne, Depth+1) ||
+          SimplifyDemandedBits(I->getOperandUse(1), DemandedFromOps,
+                               LHSKnownZero, LHSKnownOne, Depth+1))
+        return I;
+    }
+    // Otherwise just hand the sub off to ComputeMaskedBits to fill in
+    // the known zeros and ones.
+    ComputeMaskedBits(V, DemandedMask, RHSKnownZero, RHSKnownOne, Depth);
+    break;
+  case Instruction::Shl:
+    if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      uint64_t ShiftAmt = SA->getLimitedValue(BitWidth);
+      APInt DemandedMaskIn(DemandedMask.lshr(ShiftAmt));
+      if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn, 
+                               RHSKnownZero, RHSKnownOne, Depth+1))
+        return I;
+      assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
+      RHSKnownZero <<= ShiftAmt;
+      RHSKnownOne  <<= ShiftAmt;
+      // low bits known zero.
+      if (ShiftAmt)
+        RHSKnownZero |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
+    }
+    break;
+  case Instruction::LShr:
+    // For a logical shift right
+    if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      uint64_t ShiftAmt = SA->getLimitedValue(BitWidth);
+      
+      // Unsigned shift right.
+      APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt));
+      if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn,
+                               RHSKnownZero, RHSKnownOne, Depth+1))
+        return I;
+      assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
+      RHSKnownZero = APIntOps::lshr(RHSKnownZero, ShiftAmt);
+      RHSKnownOne  = APIntOps::lshr(RHSKnownOne, ShiftAmt);
+      if (ShiftAmt) {
+        // Compute the new bits that are at the top now.
+        APInt HighBits(APInt::getHighBitsSet(BitWidth, ShiftAmt));
+        RHSKnownZero |= HighBits;  // high bits known zero.
+      }
+    }
+    break;
+  case Instruction::AShr:
+    // If this is an arithmetic shift right and only the low-bit is set, we can
+    // always convert this into a logical shr, even if the shift amount is
+    // variable.  The low bit of the shift cannot be an input sign bit unless
+    // the shift amount is >= the size of the datatype, which is undefined.
+    if (DemandedMask == 1) {
+      // Perform the logical shift right.
+      Instruction *NewVal = BinaryOperator::CreateLShr(
+                        I->getOperand(0), I->getOperand(1), I->getName());
+      return InsertNewInstBefore(NewVal, *I);
+    }    
+
+    // If the sign bit is the only bit demanded by this ashr, then there is no
+    // need to do it, the shift doesn't change the high bit.
+    if (DemandedMask.isSignBit())
+      return I->getOperand(0);
+    
+    if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      uint32_t ShiftAmt = SA->getLimitedValue(BitWidth);
+      
+      // Signed shift right.
+      APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt));
+      // If any of the "high bits" are demanded, we should set the sign bit as
+      // demanded.
+      if (DemandedMask.countLeadingZeros() <= ShiftAmt)
+        DemandedMaskIn.set(BitWidth-1);
+      if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn,
+                               RHSKnownZero, RHSKnownOne, Depth+1))
+        return I;
+      assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
+      // Compute the new bits that are at the top now.
+      APInt HighBits(APInt::getHighBitsSet(BitWidth, ShiftAmt));
+      RHSKnownZero = APIntOps::lshr(RHSKnownZero, ShiftAmt);
+      RHSKnownOne  = APIntOps::lshr(RHSKnownOne, ShiftAmt);
+        
+      // Handle the sign bits.
+      APInt SignBit(APInt::getSignBit(BitWidth));
+      // Adjust to where it is now in the mask.
+      SignBit = APIntOps::lshr(SignBit, ShiftAmt);  
+        
+      // If the input sign bit is known to be zero, or if none of the top bits
+      // are demanded, turn this into an unsigned shift right.
+      if (BitWidth <= ShiftAmt || RHSKnownZero[BitWidth-ShiftAmt-1] || 
+          (HighBits & ~DemandedMask) == HighBits) {
+        // Perform the logical shift right.
+        Instruction *NewVal = BinaryOperator::CreateLShr(
+                          I->getOperand(0), SA, I->getName());
+        return InsertNewInstBefore(NewVal, *I);
+      } else if ((RHSKnownOne & SignBit) != 0) { // New bits are known one.
+        RHSKnownOne |= HighBits;
+      }
+    }
+    break;
+  case Instruction::SRem:
+    if (ConstantInt *Rem = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      APInt RA = Rem->getValue().abs();
+      if (RA.isPowerOf2()) {
+        if (DemandedMask.ule(RA))    // srem won't affect demanded bits
+          return I->getOperand(0);
+
+        APInt LowBits = RA - 1;
+        APInt Mask2 = LowBits | APInt::getSignBit(BitWidth);
+        if (SimplifyDemandedBits(I->getOperandUse(0), Mask2,
+                                 LHSKnownZero, LHSKnownOne, Depth+1))
+          return I;
+
+        if (LHSKnownZero[BitWidth-1] || ((LHSKnownZero & LowBits) == LowBits))
+          LHSKnownZero |= ~LowBits;
+
+        KnownZero |= LHSKnownZero & DemandedMask;
+
+        assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); 
+      }
+    }
+    break;
+  case Instruction::URem: {
+    APInt KnownZero2(BitWidth, 0), KnownOne2(BitWidth, 0);
+    APInt AllOnes = APInt::getAllOnesValue(BitWidth);
+    if (SimplifyDemandedBits(I->getOperandUse(0), AllOnes,
+                             KnownZero2, KnownOne2, Depth+1) ||
+        SimplifyDemandedBits(I->getOperandUse(1), AllOnes,
+                             KnownZero2, KnownOne2, Depth+1))
+      return I;
+
+    unsigned Leaders = KnownZero2.countLeadingOnes();
+    Leaders = std::max(Leaders,
+                       KnownZero2.countLeadingOnes());
+    KnownZero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask;
+    break;
+  }
+  case Instruction::Call:
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+      switch (II->getIntrinsicID()) {
+      default: break;
+      case Intrinsic::bswap: {
+        // If the only bits demanded come from one byte of the bswap result,
+        // just shift the input byte into position to eliminate the bswap.
+        unsigned NLZ = DemandedMask.countLeadingZeros();
+        unsigned NTZ = DemandedMask.countTrailingZeros();
+          
+        // Round NTZ down to the next byte.  If we have 11 trailing zeros, then
+        // we need all the bits down to bit 8.  Likewise, round NLZ.  If we
+        // have 14 leading zeros, round to 8.
+        NLZ &= ~7;
+        NTZ &= ~7;
+        // If we need exactly one byte, we can do this transformation.
+        if (BitWidth-NLZ-NTZ == 8) {
+          unsigned ResultBit = NTZ;
+          unsigned InputBit = BitWidth-NTZ-8;
+          
+          // Replace this with either a left or right shift to get the byte into
+          // the right place.
+          Instruction *NewVal;
+          if (InputBit > ResultBit)
+            NewVal = BinaryOperator::CreateLShr(I->getOperand(1),
+                    ConstantInt::get(I->getType(), InputBit-ResultBit));
+          else
+            NewVal = BinaryOperator::CreateShl(I->getOperand(1),
+                    ConstantInt::get(I->getType(), ResultBit-InputBit));
+          NewVal->takeName(I);
+          return InsertNewInstBefore(NewVal, *I);
+        }
+          
+        // TODO: Could compute known zero/one bits based on the input.
+        break;
+      }
+      }
+    }
+    ComputeMaskedBits(V, DemandedMask, RHSKnownZero, RHSKnownOne, Depth);
+    break;
+  }
+  
+  // If the client is only demanding bits that we know, return the known
+  // constant.
+  if ((DemandedMask & (RHSKnownZero|RHSKnownOne)) == DemandedMask) {
+    Constant *C = ConstantInt::get(RHSKnownOne);
+    if (isa<PointerType>(V->getType()))
+      C = ConstantExpr::getIntToPtr(C, V->getType());
+    return C;
+  }
+  return false;
+}
+
+
+/// SimplifyDemandedVectorElts - The specified value produces a vector with
+/// any number of elements. DemandedElts contains the set of elements that are
+/// actually used by the caller.  This method analyzes which elements of the
+/// operand are undef and returns that information in UndefElts.
+///
+/// If the information about demanded elements can be used to simplify the
+/// operation, the operation is simplified, then the resultant value is
+/// returned.  This returns null if no change was made.
+Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
+                                                APInt& UndefElts,
+                                                unsigned Depth) {
+  unsigned VWidth = cast<VectorType>(V->getType())->getNumElements();
+  APInt EltMask(APInt::getAllOnesValue(VWidth));
+  assert((DemandedElts & ~EltMask) == 0 && "Invalid DemandedElts!");
+
+  if (isa<UndefValue>(V)) {
+    // If the entire vector is undefined, just return this info.
+    UndefElts = EltMask;
+    return 0;
+  } else if (DemandedElts == 0) { // If nothing is demanded, provide undef.
+    UndefElts = EltMask;
+    return UndefValue::get(V->getType());
+  }
+
+  UndefElts = 0;
+  if (ConstantVector *CP = dyn_cast<ConstantVector>(V)) {
+    const Type *EltTy = cast<VectorType>(V->getType())->getElementType();
+    Constant *Undef = UndefValue::get(EltTy);
+
+    std::vector<Constant*> Elts;
+    for (unsigned i = 0; i != VWidth; ++i)
+      if (!DemandedElts[i]) {   // If not demanded, set to undef.
+        Elts.push_back(Undef);
+        UndefElts.set(i);
+      } else if (isa<UndefValue>(CP->getOperand(i))) {   // Already undef.
+        Elts.push_back(Undef);
+        UndefElts.set(i);
+      } else {                               // Otherwise, defined.
+        Elts.push_back(CP->getOperand(i));
+      }
+
+    // If we changed the constant, return it.
+    Constant *NewCP = ConstantVector::get(Elts);
+    return NewCP != CP ? NewCP : 0;
+  } else if (isa<ConstantAggregateZero>(V)) {
+    // Simplify the CAZ to a ConstantVector where the non-demanded elements are
+    // set to undef.
+    
+    // Check if this is identity. If so, return 0 since we are not simplifying
+    // anything.
+    if (DemandedElts == ((1ULL << VWidth) -1))
+      return 0;
+    
+    const Type *EltTy = cast<VectorType>(V->getType())->getElementType();
+    Constant *Zero = Constant::getNullValue(EltTy);
+    Constant *Undef = UndefValue::get(EltTy);
+    std::vector<Constant*> Elts;
+    for (unsigned i = 0; i != VWidth; ++i) {
+      Constant *Elt = DemandedElts[i] ? Zero : Undef;
+      Elts.push_back(Elt);
+    }
+    UndefElts = DemandedElts ^ EltMask;
+    return ConstantVector::get(Elts);
+  }
+  
+  // Limit search depth.
+  if (Depth == 10)
+    return 0;
+
+  // If multiple users are using the root value, procede with
+  // simplification conservatively assuming that all elements
+  // are needed.
+  if (!V->hasOneUse()) {
+    // Quit if we find multiple users of a non-root value though.
+    // They'll be handled when it's their turn to be visited by
+    // the main instcombine process.
+    if (Depth != 0)
+      // TODO: Just compute the UndefElts information recursively.
+      return 0;
+
+    // Conservatively assume that all elements are needed.
+    DemandedElts = EltMask;
+  }
+  
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) return 0;        // Only analyze instructions.
+  
+  bool MadeChange = false;
+  APInt UndefElts2(VWidth, 0);
+  Value *TmpV;
+  switch (I->getOpcode()) {
+  default: break;
+    
+  case Instruction::InsertElement: {
+    // If this is a variable index, we don't know which element it overwrites.
+    // demand exactly the same input as we produce.
+    ConstantInt *Idx = dyn_cast<ConstantInt>(I->getOperand(2));
+    if (Idx == 0) {
+      // Note that we can't propagate undef elt info, because we don't know
+      // which elt is getting updated.
+      TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts,
+                                        UndefElts2, Depth+1);
+      if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
+      break;
+    }
+    
+    // If this is inserting an element that isn't demanded, remove this
+    // insertelement.
+    unsigned IdxNo = Idx->getZExtValue();
+    if (IdxNo >= VWidth || !DemandedElts[IdxNo])
+      return AddSoonDeadInstToWorklist(*I, 0);
+    
+    // Otherwise, the element inserted overwrites whatever was there, so the
+    // input demanded set is simpler than the output set.
+    APInt DemandedElts2 = DemandedElts;
+    DemandedElts2.clear(IdxNo);
+    TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts2,
+                                      UndefElts, Depth+1);
+    if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
+
+    // The inserted element is defined.
+    UndefElts.clear(IdxNo);
+    break;
+  }
+  case Instruction::ShuffleVector: {
+    ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(I);
+    uint64_t LHSVWidth =
+      cast<VectorType>(Shuffle->getOperand(0)->getType())->getNumElements();
+    APInt LeftDemanded(LHSVWidth, 0), RightDemanded(LHSVWidth, 0);
+    for (unsigned i = 0; i < VWidth; i++) {
+      if (DemandedElts[i]) {
+        unsigned MaskVal = Shuffle->getMaskValue(i);
+        if (MaskVal != -1u) {
+          assert(MaskVal < LHSVWidth * 2 &&
+                 "shufflevector mask index out of range!");
+          if (MaskVal < LHSVWidth)
+            LeftDemanded.set(MaskVal);
+          else
+            RightDemanded.set(MaskVal - LHSVWidth);
+        }
+      }
+    }
+
+    APInt UndefElts4(LHSVWidth, 0);
+    TmpV = SimplifyDemandedVectorElts(I->getOperand(0), LeftDemanded,
+                                      UndefElts4, Depth+1);
+    if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
+
+    APInt UndefElts3(LHSVWidth, 0);
+    TmpV = SimplifyDemandedVectorElts(I->getOperand(1), RightDemanded,
+                                      UndefElts3, Depth+1);
+    if (TmpV) { I->setOperand(1, TmpV); MadeChange = true; }
+
+    bool NewUndefElts = false;
+    for (unsigned i = 0; i < VWidth; i++) {
+      unsigned MaskVal = Shuffle->getMaskValue(i);
+      if (MaskVal == -1u) {
+        UndefElts.set(i);
+      } else if (MaskVal < LHSVWidth) {
+        if (UndefElts4[MaskVal]) {
+          NewUndefElts = true;
+          UndefElts.set(i);
+        }
+      } else {
+        if (UndefElts3[MaskVal - LHSVWidth]) {
+          NewUndefElts = true;
+          UndefElts.set(i);
+        }
+      }
+    }
+
+    if (NewUndefElts) {
+      // Add additional discovered undefs.
+      std::vector<Constant*> Elts;
+      for (unsigned i = 0; i < VWidth; ++i) {
+        if (UndefElts[i])
+          Elts.push_back(UndefValue::get(Type::Int32Ty));
+        else
+          Elts.push_back(ConstantInt::get(Type::Int32Ty,
+                                          Shuffle->getMaskValue(i)));
+      }
+      I->setOperand(2, ConstantVector::get(Elts));
+      MadeChange = true;
+    }
+    break;
+  }
+  case Instruction::BitCast: {
+    // Vector->vector casts only.
+    const VectorType *VTy = dyn_cast<VectorType>(I->getOperand(0)->getType());
+    if (!VTy) break;
+    unsigned InVWidth = VTy->getNumElements();
+    APInt InputDemandedElts(InVWidth, 0);
+    unsigned Ratio;
+
+    if (VWidth == InVWidth) {
+      // If we are converting from <4 x i32> -> <4 x f32>, we demand the same
+      // elements as are demanded of us.
+      Ratio = 1;
+      InputDemandedElts = DemandedElts;
+    } else if (VWidth > InVWidth) {
+      // Untested so far.
+      break;
+      
+      // If there are more elements in the result than there are in the source,
+      // then an input element is live if any of the corresponding output
+      // elements are live.
+      Ratio = VWidth/InVWidth;
+      for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) {
+        if (DemandedElts[OutIdx])
+          InputDemandedElts.set(OutIdx/Ratio);
+      }
+    } else {
+      // Untested so far.
+      break;
+      
+      // If there are more elements in the source than there are in the result,
+      // then an input element is live if the corresponding output element is
+      // live.
+      Ratio = InVWidth/VWidth;
+      for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx)
+        if (DemandedElts[InIdx/Ratio])
+          InputDemandedElts.set(InIdx);
+    }
+    
+    // div/rem demand all inputs, because they don't want divide by zero.
+    TmpV = SimplifyDemandedVectorElts(I->getOperand(0), InputDemandedElts,
+                                      UndefElts2, Depth+1);
+    if (TmpV) {
+      I->setOperand(0, TmpV);
+      MadeChange = true;
+    }
+    
+    UndefElts = UndefElts2;
+    if (VWidth > InVWidth) {
+      assert(0 && "Unimp");
+      // If there are more elements in the result than there are in the source,
+      // then an output element is undef if the corresponding input element is
+      // undef.
+      for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)
+        if (UndefElts2[OutIdx/Ratio])
+          UndefElts.set(OutIdx);
+    } else if (VWidth < InVWidth) {
+      assert(0 && "Unimp");
+      // If there are more elements in the source than there are in the result,
+      // then a result element is undef if all of the corresponding input
+      // elements are undef.
+      UndefElts = ~0ULL >> (64-VWidth);  // Start out all undef.
+      for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx)
+        if (!UndefElts2[InIdx])            // Not undef?
+          UndefElts.clear(InIdx/Ratio);    // Clear undef bit.
+    }
+    break;
+  }
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+    // div/rem demand all inputs, because they don't want divide by zero.
+    TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts,
+                                      UndefElts, Depth+1);
+    if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
+    TmpV = SimplifyDemandedVectorElts(I->getOperand(1), DemandedElts,
+                                      UndefElts2, Depth+1);
+    if (TmpV) { I->setOperand(1, TmpV); MadeChange = true; }
+      
+    // Output elements are undefined if both are undefined.  Consider things
+    // like undef&0.  The result is known zero, not undef.
+    UndefElts &= UndefElts2;
+    break;
+    
+  case Instruction::Call: {
+    IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
+    if (!II) break;
+    switch (II->getIntrinsicID()) {
+    default: break;
+      
+    // Binary vector operations that work column-wise.  A dest element is a
+    // function of the corresponding input elements from the two inputs.
+    case Intrinsic::x86_sse_sub_ss:
+    case Intrinsic::x86_sse_mul_ss:
+    case Intrinsic::x86_sse_min_ss:
+    case Intrinsic::x86_sse_max_ss:
+    case Intrinsic::x86_sse2_sub_sd:
+    case Intrinsic::x86_sse2_mul_sd:
+    case Intrinsic::x86_sse2_min_sd:
+    case Intrinsic::x86_sse2_max_sd:
+      TmpV = SimplifyDemandedVectorElts(II->getOperand(1), DemandedElts,
+                                        UndefElts, Depth+1);
+      if (TmpV) { II->setOperand(1, TmpV); MadeChange = true; }
+      TmpV = SimplifyDemandedVectorElts(II->getOperand(2), DemandedElts,
+                                        UndefElts2, Depth+1);
+      if (TmpV) { II->setOperand(2, TmpV); MadeChange = true; }
+
+      // If only the low elt is demanded and this is a scalarizable intrinsic,
+      // scalarize it now.
+      if (DemandedElts == 1) {
+        switch (II->getIntrinsicID()) {
+        default: break;
+        case Intrinsic::x86_sse_sub_ss:
+        case Intrinsic::x86_sse_mul_ss:
+        case Intrinsic::x86_sse2_sub_sd:
+        case Intrinsic::x86_sse2_mul_sd:
+          // TODO: Lower MIN/MAX/ABS/etc
+          Value *LHS = II->getOperand(1);
+          Value *RHS = II->getOperand(2);
+          // Extract the element as scalars.
+          LHS = InsertNewInstBefore(new ExtractElementInst(LHS, 0U,"tmp"), *II);
+          RHS = InsertNewInstBefore(new ExtractElementInst(RHS, 0U,"tmp"), *II);
+          
+          switch (II->getIntrinsicID()) {
+          default: assert(0 && "Case stmts out of sync!");
+          case Intrinsic::x86_sse_sub_ss:
+          case Intrinsic::x86_sse2_sub_sd:
+            TmpV = InsertNewInstBefore(BinaryOperator::CreateSub(LHS, RHS,
+                                                        II->getName()), *II);
+            break;
+          case Intrinsic::x86_sse_mul_ss:
+          case Intrinsic::x86_sse2_mul_sd:
+            TmpV = InsertNewInstBefore(BinaryOperator::CreateMul(LHS, RHS,
+                                                         II->getName()), *II);
+            break;
+          }
+          
+          Instruction *New =
+            InsertElementInst::Create(UndefValue::get(II->getType()), TmpV, 0U,
+                                      II->getName());
+          InsertNewInstBefore(New, *II);
+          AddSoonDeadInstToWorklist(*II, 0);
+          return New;
+        }            
+      }
+        
+      // Output elements are undefined if both are undefined.  Consider things
+      // like undef&0.  The result is known zero, not undef.
+      UndefElts &= UndefElts2;
+      break;
+    }
+    break;
+  }
+  }
+  return MadeChange ? I : 0;
+}
+
+
+/// AssociativeOpt - Perform an optimization on an associative operator.  This
+/// function is designed to check a chain of associative operators for a
+/// potential to apply a certain optimization.  Since the optimization may be
+/// applicable if the expression was reassociated, this checks the chain, then
+/// reassociates the expression as necessary to expose the optimization
+/// opportunity.  This makes use of a special Functor, which must define
+/// 'shouldApply' and 'apply' methods.
+///
+template<typename Functor>
+static Instruction *AssociativeOpt(BinaryOperator &Root, const Functor &F) {
+  unsigned Opcode = Root.getOpcode();
+  Value *LHS = Root.getOperand(0);
+
+  // Quick check, see if the immediate LHS matches...
+  if (F.shouldApply(LHS))
+    return F.apply(Root);
+
+  // Otherwise, if the LHS is not of the same opcode as the root, return.
+  Instruction *LHSI = dyn_cast<Instruction>(LHS);
+  while (LHSI && LHSI->getOpcode() == Opcode && LHSI->hasOneUse()) {
+    // Should we apply this transform to the RHS?
+    bool ShouldApply = F.shouldApply(LHSI->getOperand(1));
+
+    // If not to the RHS, check to see if we should apply to the LHS...
+    if (!ShouldApply && F.shouldApply(LHSI->getOperand(0))) {
+      cast<BinaryOperator>(LHSI)->swapOperands();   // Make the LHS the RHS
+      ShouldApply = true;
+    }
+
+    // If the functor wants to apply the optimization to the RHS of LHSI,
+    // reassociate the expression from ((? op A) op B) to (? op (A op B))
+    if (ShouldApply) {
+      // Now all of the instructions are in the current basic block, go ahead
+      // and perform the reassociation.
+      Instruction *TmpLHSI = cast<Instruction>(Root.getOperand(0));
+
+      // First move the selected RHS to the LHS of the root...
+      Root.setOperand(0, LHSI->getOperand(1));
+
+      // Make what used to be the LHS of the root be the user of the root...
+      Value *ExtraOperand = TmpLHSI->getOperand(1);
+      if (&Root == TmpLHSI) {
+        Root.replaceAllUsesWith(Constant::getNullValue(TmpLHSI->getType()));
+        return 0;
+      }
+      Root.replaceAllUsesWith(TmpLHSI);          // Users now use TmpLHSI
+      TmpLHSI->setOperand(1, &Root);             // TmpLHSI now uses the root
+      BasicBlock::iterator ARI = &Root; ++ARI;
+      TmpLHSI->moveBefore(ARI);                  // Move TmpLHSI to after Root
+      ARI = Root;
+
+      // Now propagate the ExtraOperand down the chain of instructions until we
+      // get to LHSI.
+      while (TmpLHSI != LHSI) {
+        Instruction *NextLHSI = cast<Instruction>(TmpLHSI->getOperand(0));
+        // Move the instruction to immediately before the chain we are
+        // constructing to avoid breaking dominance properties.
+        NextLHSI->moveBefore(ARI);
+        ARI = NextLHSI;
+
+        Value *NextOp = NextLHSI->getOperand(1);
+        NextLHSI->setOperand(1, ExtraOperand);
+        TmpLHSI = NextLHSI;
+        ExtraOperand = NextOp;
+      }
+
+      // Now that the instructions are reassociated, have the functor perform
+      // the transformation...
+      return F.apply(Root);
+    }
+
+    LHSI = dyn_cast<Instruction>(LHSI->getOperand(0));
+  }
+  return 0;
+}
+
+namespace {
+
+// AddRHS - Implements: X + X --> X << 1
+struct AddRHS {
+  Value *RHS;
+  AddRHS(Value *rhs) : RHS(rhs) {}
+  bool shouldApply(Value *LHS) const { return LHS == RHS; }
+  Instruction *apply(BinaryOperator &Add) const {
+    return BinaryOperator::CreateShl(Add.getOperand(0),
+                                     ConstantInt::get(Add.getType(), 1));
+  }
+};
+
+// AddMaskingAnd - Implements (A & C1)+(B & C2) --> (A & C1)|(B & C2)
+//                 iff C1&C2 == 0
+struct AddMaskingAnd {
+  Constant *C2;
+  AddMaskingAnd(Constant *c) : C2(c) {}
+  bool shouldApply(Value *LHS) const {
+    ConstantInt *C1;
+    return match(LHS, m_And(m_Value(), m_ConstantInt(C1))) &&
+           ConstantExpr::getAnd(C1, C2)->isNullValue();
+  }
+  Instruction *apply(BinaryOperator &Add) const {
+    return BinaryOperator::CreateOr(Add.getOperand(0), Add.getOperand(1));
+  }
+};
+
+}
+
+static Value *FoldOperationIntoSelectOperand(Instruction &I, Value *SO,
+                                             InstCombiner *IC) {
+  if (CastInst *CI = dyn_cast<CastInst>(&I)) {
+    return IC->InsertCastBefore(CI->getOpcode(), SO, I.getType(), I);
+  }
+
+  // Figure out if the constant is the left or the right argument.
+  bool ConstIsRHS = isa<Constant>(I.getOperand(1));
+  Constant *ConstOperand = cast<Constant>(I.getOperand(ConstIsRHS));
+
+  if (Constant *SOC = dyn_cast<Constant>(SO)) {
+    if (ConstIsRHS)
+      return ConstantExpr::get(I.getOpcode(), SOC, ConstOperand);
+    return ConstantExpr::get(I.getOpcode(), ConstOperand, SOC);
+  }
+
+  Value *Op0 = SO, *Op1 = ConstOperand;
+  if (!ConstIsRHS)
+    std::swap(Op0, Op1);
+  Instruction *New;
+  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&I))
+    New = BinaryOperator::Create(BO->getOpcode(), Op0, Op1,SO->getName()+".op");
+  else if (CmpInst *CI = dyn_cast<CmpInst>(&I))
+    New = CmpInst::Create(CI->getOpcode(), CI->getPredicate(), Op0, Op1, 
+                          SO->getName()+".cmp");
+  else {
+    assert(0 && "Unknown binary instruction type!");
+    abort();
+  }
+  return IC->InsertNewInstBefore(New, I);
+}
+
+// FoldOpIntoSelect - Given an instruction with a select as one operand and a
+// constant as the other operand, try to fold the binary operator into the
+// select arguments.  This also works for Cast instructions, which obviously do
+// not have a second operand.
+static Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI,
+                                     InstCombiner *IC) {
+  // Don't modify shared select instructions
+  if (!SI->hasOneUse()) return 0;
+  Value *TV = SI->getOperand(1);
+  Value *FV = SI->getOperand(2);
+
+  if (isa<Constant>(TV) || isa<Constant>(FV)) {
+    // Bool selects with constant operands can be folded to logical ops.
+    if (SI->getType() == Type::Int1Ty) return 0;
+
+    Value *SelectTrueVal = FoldOperationIntoSelectOperand(Op, TV, IC);
+    Value *SelectFalseVal = FoldOperationIntoSelectOperand(Op, FV, IC);
+
+    return SelectInst::Create(SI->getCondition(), SelectTrueVal,
+                              SelectFalseVal);
+  }
+  return 0;
+}
+
+
+/// FoldOpIntoPhi - Given a binary operator or cast instruction which has a PHI
+/// node as operand #0, see if we can fold the instruction into the PHI (which
+/// is only possible if all operands to the PHI are constants).
+Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
+  PHINode *PN = cast<PHINode>(I.getOperand(0));
+  unsigned NumPHIValues = PN->getNumIncomingValues();
+  if (!PN->hasOneUse() || NumPHIValues == 0) return 0;
+
+  // Check to see if all of the operands of the PHI are constants.  If there is
+  // one non-constant value, remember the BB it is.  If there is more than one
+  // or if *it* is a PHI, bail out.
+  BasicBlock *NonConstBB = 0;
+  for (unsigned i = 0; i != NumPHIValues; ++i)
+    if (!isa<Constant>(PN->getIncomingValue(i))) {
+      if (NonConstBB) return 0;  // More than one non-const value.
+      if (isa<PHINode>(PN->getIncomingValue(i))) return 0;  // Itself a phi.
+      NonConstBB = PN->getIncomingBlock(i);
+      
+      // If the incoming non-constant value is in I's block, we have an infinite
+      // loop.
+      if (NonConstBB == I.getParent())
+        return 0;
+    }
+  
+  // If there is exactly one non-constant value, we can insert a copy of the
+  // operation in that block.  However, if this is a critical edge, we would be
+  // inserting the computation one some other paths (e.g. inside a loop).  Only
+  // do this if the pred block is unconditionally branching into the phi block.
+  if (NonConstBB) {
+    BranchInst *BI = dyn_cast<BranchInst>(NonConstBB->getTerminator());
+    if (!BI || !BI->isUnconditional()) return 0;
+  }
+
+  // Okay, we can do the transformation: create the new PHI node.
+  PHINode *NewPN = PHINode::Create(I.getType(), "");
+  NewPN->reserveOperandSpace(PN->getNumOperands()/2);
+  InsertNewInstBefore(NewPN, *PN);
+  NewPN->takeName(PN);
+
+  // Next, add all of the operands to the PHI.
+  if (I.getNumOperands() == 2) {
+    Constant *C = cast<Constant>(I.getOperand(1));
+    for (unsigned i = 0; i != NumPHIValues; ++i) {
+      Value *InV = 0;
+      if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i))) {
+        if (CmpInst *CI = dyn_cast<CmpInst>(&I))
+          InV = ConstantExpr::getCompare(CI->getPredicate(), InC, C);
+        else
+          InV = ConstantExpr::get(I.getOpcode(), InC, C);
+      } else {
+        assert(PN->getIncomingBlock(i) == NonConstBB);
+        if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&I)) 
+          InV = BinaryOperator::Create(BO->getOpcode(),
+                                       PN->getIncomingValue(i), C, "phitmp",
+                                       NonConstBB->getTerminator());
+        else if (CmpInst *CI = dyn_cast<CmpInst>(&I))
+          InV = CmpInst::Create(CI->getOpcode(), 
+                                CI->getPredicate(),
+                                PN->getIncomingValue(i), C, "phitmp",
+                                NonConstBB->getTerminator());
+        else
+          assert(0 && "Unknown binop!");
+        
+        AddToWorkList(cast<Instruction>(InV));
+      }
+      NewPN->addIncoming(InV, PN->getIncomingBlock(i));
+    }
+  } else { 
+    CastInst *CI = cast<CastInst>(&I);
+    const Type *RetTy = CI->getType();
+    for (unsigned i = 0; i != NumPHIValues; ++i) {
+      Value *InV;
+      if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i))) {
+        InV = ConstantExpr::getCast(CI->getOpcode(), InC, RetTy);
+      } else {
+        assert(PN->getIncomingBlock(i) == NonConstBB);
+        InV = CastInst::Create(CI->getOpcode(), PN->getIncomingValue(i), 
+                               I.getType(), "phitmp", 
+                               NonConstBB->getTerminator());
+        AddToWorkList(cast<Instruction>(InV));
+      }
+      NewPN->addIncoming(InV, PN->getIncomingBlock(i));
+    }
+  }
+  return ReplaceInstUsesWith(I, NewPN);
+}
+
+
+/// WillNotOverflowSignedAdd - Return true if we can prove that:
+///    (sext (add LHS, RHS))  === (add (sext LHS), (sext RHS))
+/// This basically requires proving that the add in the original type would not
+/// overflow to change the sign bit or have a carry out.
+bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS) {
+  // There are different heuristics we can use for this.  Here are some simple
+  // ones.
+  
+  // Add has the property that adding any two 2's complement numbers can only 
+  // have one carry bit which can change a sign.  As such, if LHS and RHS each
+  // have at least two sign bits, we know that the addition of the two values will
+  // sign extend fine.
+  if (ComputeNumSignBits(LHS) > 1 && ComputeNumSignBits(RHS) > 1)
+    return true;
+  
+  
+  // If one of the operands only has one non-zero bit, and if the other operand
+  // has a known-zero bit in a more significant place than it (not including the
+  // sign bit) the ripple may go up to and fill the zero, but won't change the
+  // sign.  For example, (X & ~4) + 1.
+  
+  // TODO: Implement.
+  
+  return false;
+}
+
+
+Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
+  bool Changed = SimplifyCommutative(I);
+  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+
+  if (Constant *RHSC = dyn_cast<Constant>(RHS)) {
+    // X + undef -> undef
+    if (isa<UndefValue>(RHS))
+      return ReplaceInstUsesWith(I, RHS);
+
+    // X + 0 --> X
+    if (!I.getType()->isFPOrFPVector()) { // NOTE: -0 + +0 = +0.
+      if (RHSC->isNullValue())
+        return ReplaceInstUsesWith(I, LHS);
+    } else if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHSC)) {
+      if (CFP->isExactlyValue(ConstantFP::getNegativeZero
+                              (I.getType())->getValueAPF()))
+        return ReplaceInstUsesWith(I, LHS);
+    }
+
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(RHSC)) {
+      // X + (signbit) --> X ^ signbit
+      const APInt& Val = CI->getValue();
+      uint32_t BitWidth = Val.getBitWidth();
+      if (Val == APInt::getSignBit(BitWidth))
+        return BinaryOperator::CreateXor(LHS, RHS);
+      
+      // See if SimplifyDemandedBits can simplify this.  This handles stuff like
+      // (X & 254)+1 -> (X&254)|1
+      if (!isa<VectorType>(I.getType()) && SimplifyDemandedInstructionBits(I))
+        return &I;
+
+      // zext(i1) - 1  ->  select i1, 0, -1
+      if (ZExtInst *ZI = dyn_cast<ZExtInst>(LHS))
+        if (CI->isAllOnesValue() &&
+            ZI->getOperand(0)->getType() == Type::Int1Ty)
+          return SelectInst::Create(ZI->getOperand(0),
+                                    Constant::getNullValue(I.getType()),
+                                    ConstantInt::getAllOnesValue(I.getType()));
+    }
+
+    if (isa<PHINode>(LHS))
+      if (Instruction *NV = FoldOpIntoPhi(I))
+        return NV;
+    
+    ConstantInt *XorRHS = 0;
+    Value *XorLHS = 0;
+    if (isa<ConstantInt>(RHSC) &&
+        match(LHS, m_Xor(m_Value(XorLHS), m_ConstantInt(XorRHS)))) {
+      uint32_t TySizeBits = I.getType()->getPrimitiveSizeInBits();
+      const APInt& RHSVal = cast<ConstantInt>(RHSC)->getValue();
+      
+      uint32_t Size = TySizeBits / 2;
+      APInt C0080Val(APInt(TySizeBits, 1ULL).shl(Size - 1));
+      APInt CFF80Val(-C0080Val);
+      do {
+        if (TySizeBits > Size) {
+          // If we have ADD(XOR(AND(X, 0xFF), 0x80), 0xF..F80), it's a sext.
+          // If we have ADD(XOR(AND(X, 0xFF), 0xF..F80), 0x80), it's a sext.
+          if ((RHSVal == CFF80Val && XorRHS->getValue() == C0080Val) ||
+              (RHSVal == C0080Val && XorRHS->getValue() == CFF80Val)) {
+            // This is a sign extend if the top bits are known zero.
+            if (!MaskedValueIsZero(XorLHS, 
+                   APInt::getHighBitsSet(TySizeBits, TySizeBits - Size)))
+              Size = 0;  // Not a sign ext, but can't be any others either.
+            break;
+          }
+        }
+        Size >>= 1;
+        C0080Val = APIntOps::lshr(C0080Val, Size);
+        CFF80Val = APIntOps::ashr(CFF80Val, Size);
+      } while (Size >= 1);
+      
+      // FIXME: This shouldn't be necessary. When the backends can handle types
+      // with funny bit widths then this switch statement should be removed. It
+      // is just here to get the size of the "middle" type back up to something
+      // that the back ends can handle.
+      const Type *MiddleType = 0;
+      switch (Size) {
+        default: break;
+        case 32: MiddleType = Type::Int32Ty; break;
+        case 16: MiddleType = Type::Int16Ty; break;
+        case  8: MiddleType = Type::Int8Ty; break;
+      }
+      if (MiddleType) {
+        Instruction *NewTrunc = new TruncInst(XorLHS, MiddleType, "sext");
+        InsertNewInstBefore(NewTrunc, I);
+        return new SExtInst(NewTrunc, I.getType(), I.getName());
+      }
+    }
+  }
+
+  if (I.getType() == Type::Int1Ty)
+    return BinaryOperator::CreateXor(LHS, RHS);
+
+  // X + X --> X << 1
+  if (I.getType()->isInteger()) {
+    if (Instruction *Result = AssociativeOpt(I, AddRHS(RHS))) return Result;
+
+    if (Instruction *RHSI = dyn_cast<Instruction>(RHS)) {
+      if (RHSI->getOpcode() == Instruction::Sub)
+        if (LHS == RHSI->getOperand(1))                   // A + (B - A) --> B
+          return ReplaceInstUsesWith(I, RHSI->getOperand(0));
+    }
+    if (Instruction *LHSI = dyn_cast<Instruction>(LHS)) {
+      if (LHSI->getOpcode() == Instruction::Sub)
+        if (RHS == LHSI->getOperand(1))                   // (B - A) + A --> B
+          return ReplaceInstUsesWith(I, LHSI->getOperand(0));
+    }
+  }
+
+  // -A + B  -->  B - A
+  // -A + -B  -->  -(A + B)
+  if (Value *LHSV = dyn_castNegVal(LHS)) {
+    if (LHS->getType()->isIntOrIntVector()) {
+      if (Value *RHSV = dyn_castNegVal(RHS)) {
+        Instruction *NewAdd = BinaryOperator::CreateAdd(LHSV, RHSV, "sum");
+        InsertNewInstBefore(NewAdd, I);
+        return BinaryOperator::CreateNeg(NewAdd);
+      }
+    }
+    
+    return BinaryOperator::CreateSub(RHS, LHSV);
+  }
+
+  // A + -B  -->  A - B
+  if (!isa<Constant>(RHS))
+    if (Value *V = dyn_castNegVal(RHS))
+      return BinaryOperator::CreateSub(LHS, V);
+
+
+  ConstantInt *C2;
+  if (Value *X = dyn_castFoldableMul(LHS, C2)) {
+    if (X == RHS)   // X*C + X --> X * (C+1)
+      return BinaryOperator::CreateMul(RHS, AddOne(C2));
+
+    // X*C1 + X*C2 --> X * (C1+C2)
+    ConstantInt *C1;
+    if (X == dyn_castFoldableMul(RHS, C1))
+      return BinaryOperator::CreateMul(X, Add(C1, C2));
+  }
+
+  // X + X*C --> X * (C+1)
+  if (dyn_castFoldableMul(RHS, C2) == LHS)
+    return BinaryOperator::CreateMul(LHS, AddOne(C2));
+
+  // X + ~X --> -1   since   ~X = -X-1
+  if (dyn_castNotVal(LHS) == RHS || dyn_castNotVal(RHS) == LHS)
+    return ReplaceInstUsesWith(I, Constant::getAllOnesValue(I.getType()));
+  
+
+  // (A & C1)+(B & C2) --> (A & C1)|(B & C2) iff C1&C2 == 0
+  if (match(RHS, m_And(m_Value(), m_ConstantInt(C2))))
+    if (Instruction *R = AssociativeOpt(I, AddMaskingAnd(C2)))
+      return R;
+  
+  // A+B --> A|B iff A and B have no bits set in common.
+  if (const IntegerType *IT = dyn_cast<IntegerType>(I.getType())) {
+    APInt Mask = APInt::getAllOnesValue(IT->getBitWidth());
+    APInt LHSKnownOne(IT->getBitWidth(), 0);
+    APInt LHSKnownZero(IT->getBitWidth(), 0);
+    ComputeMaskedBits(LHS, Mask, LHSKnownZero, LHSKnownOne);
+    if (LHSKnownZero != 0) {
+      APInt RHSKnownOne(IT->getBitWidth(), 0);
+      APInt RHSKnownZero(IT->getBitWidth(), 0);
+      ComputeMaskedBits(RHS, Mask, RHSKnownZero, RHSKnownOne);
+      
+      // No bits in common -> bitwise or.
+      if ((LHSKnownZero|RHSKnownZero).isAllOnesValue())
+        return BinaryOperator::CreateOr(LHS, RHS);
+    }
+  }
+
+  // W*X + Y*Z --> W * (X+Z)  iff W == Y
+  if (I.getType()->isIntOrIntVector()) {
+    Value *W, *X, *Y, *Z;
+    if (match(LHS, m_Mul(m_Value(W), m_Value(X))) &&
+        match(RHS, m_Mul(m_Value(Y), m_Value(Z)))) {
+      if (W != Y) {
+        if (W == Z) {
+          std::swap(Y, Z);
+        } else if (Y == X) {
+          std::swap(W, X);
+        } else if (X == Z) {
+          std::swap(Y, Z);
+          std::swap(W, X);
+        }
+      }
+
+      if (W == Y) {
+        Value *NewAdd = InsertNewInstBefore(BinaryOperator::CreateAdd(X, Z,
+                                                            LHS->getName()), I);
+        return BinaryOperator::CreateMul(W, NewAdd);
+      }
+    }
+  }
+
+  if (ConstantInt *CRHS = dyn_cast<ConstantInt>(RHS)) {
+    Value *X = 0;
+    if (match(LHS, m_Not(m_Value(X))))    // ~X + C --> (C-1) - X
+      return BinaryOperator::CreateSub(SubOne(CRHS), X);
+
+    // (X & FF00) + xx00  -> (X+xx00) & FF00
+    if (LHS->hasOneUse() && match(LHS, m_And(m_Value(X), m_ConstantInt(C2)))) {
+      Constant *Anded = And(CRHS, C2);
+      if (Anded == CRHS) {
+        // See if all bits from the first bit set in the Add RHS up are included
+        // in the mask.  First, get the rightmost bit.
+        const APInt& AddRHSV = CRHS->getValue();
+
+        // Form a mask of all bits from the lowest bit added through the top.
+        APInt AddRHSHighBits(~((AddRHSV & -AddRHSV)-1));
+
+        // See if the and mask includes all of these bits.
+        APInt AddRHSHighBitsAnd(AddRHSHighBits & C2->getValue());
+
+        if (AddRHSHighBits == AddRHSHighBitsAnd) {
+          // Okay, the xform is safe.  Insert the new add pronto.
+          Value *NewAdd = InsertNewInstBefore(BinaryOperator::CreateAdd(X, CRHS,
+                                                            LHS->getName()), I);
+          return BinaryOperator::CreateAnd(NewAdd, C2);
+        }
+      }
+    }
+
+    // Try to fold constant add into select arguments.
+    if (SelectInst *SI = dyn_cast<SelectInst>(LHS))
+      if (Instruction *R = FoldOpIntoSelect(I, SI, this))
+        return R;
+  }
+
+  // add (cast *A to intptrtype) B -> 
+  //   cast (GEP (cast *A to sbyte*) B)  -->  intptrtype
+  {
+    CastInst *CI = dyn_cast<CastInst>(LHS);
+    Value *Other = RHS;
+    if (!CI) {
+      CI = dyn_cast<CastInst>(RHS);
+      Other = LHS;
+    }
+    if (CI && CI->getType()->isSized() && 
+        (CI->getType()->getPrimitiveSizeInBits() == 
+         TD->getIntPtrType()->getPrimitiveSizeInBits()) 
+        && isa<PointerType>(CI->getOperand(0)->getType())) {
+      unsigned AS =
+        cast<PointerType>(CI->getOperand(0)->getType())->getAddressSpace();
+      Value *I2 = InsertBitCastBefore(CI->getOperand(0),
+                                      PointerType::get(Type::Int8Ty, AS), I);
+      I2 = InsertNewInstBefore(GetElementPtrInst::Create(I2, Other, "ctg2"), I);
+      return new PtrToIntInst(I2, CI->getType());
+    }
+  }
+  
+  // add (select X 0 (sub n A)) A  -->  select X A n
+  {
+    SelectInst *SI = dyn_cast<SelectInst>(LHS);
+    Value *A = RHS;
+    if (!SI) {
+      SI = dyn_cast<SelectInst>(RHS);
+      A = LHS;
+    }
+    if (SI && SI->hasOneUse()) {
+      Value *TV = SI->getTrueValue();
+      Value *FV = SI->getFalseValue();
+      Value *N;
+
+      // Can we fold the add into the argument of the select?
+      // We check both true and false select arguments for a matching subtract.
+      if (match(FV, m_Zero()) && match(TV, m_Sub(m_Value(N), m_Specific(A))))
+        // Fold the add into the true select value.
+        return SelectInst::Create(SI->getCondition(), N, A);
+      if (match(TV, m_Zero()) && match(FV, m_Sub(m_Value(N), m_Specific(A))))
+        // Fold the add into the false select value.
+        return SelectInst::Create(SI->getCondition(), A, N);
+    }
+  }
+  
+  // Check for X+0.0.  Simplify it to X if we know X is not -0.0.
+  if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHS))
+    if (CFP->getValueAPF().isPosZero() && CannotBeNegativeZero(LHS))
+      return ReplaceInstUsesWith(I, LHS);
+
+  // Check for (add (sext x), y), see if we can merge this into an
+  // integer add followed by a sext.
+  if (SExtInst *LHSConv = dyn_cast<SExtInst>(LHS)) {
+    // (add (sext x), cst) --> (sext (add x, cst'))
+    if (ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS)) {
+      Constant *CI = 
+        ConstantExpr::getTrunc(RHSC, LHSConv->getOperand(0)->getType());
+      if (LHSConv->hasOneUse() &&
+          ConstantExpr::getSExt(CI, I.getType()) == RHSC &&
+          WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI)) {
+        // Insert the new, smaller add.
+        Instruction *NewAdd = BinaryOperator::CreateAdd(LHSConv->getOperand(0), 
+                                                        CI, "addconv");
+        InsertNewInstBefore(NewAdd, I);
+        return new SExtInst(NewAdd, I.getType());
+      }
+    }
+    
+    // (add (sext x), (sext y)) --> (sext (add int x, y))
+    if (SExtInst *RHSConv = dyn_cast<SExtInst>(RHS)) {
+      // Only do this if x/y have the same type, if at last one of them has a
+      // single use (so we don't increase the number of sexts), and if the
+      // integer add will not overflow.
+      if (LHSConv->getOperand(0)->getType()==RHSConv->getOperand(0)->getType()&&
+          (LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
+          WillNotOverflowSignedAdd(LHSConv->getOperand(0),
+                                   RHSConv->getOperand(0))) {
+        // Insert the new integer add.
+        Instruction *NewAdd = BinaryOperator::CreateAdd(LHSConv->getOperand(0), 
+                                                        RHSConv->getOperand(0),
+                                                        "addconv");
+        InsertNewInstBefore(NewAdd, I);
+        return new SExtInst(NewAdd, I.getType());
+      }
+    }
+  }
+  
+  // Check for (add double (sitofp x), y), see if we can merge this into an
+  // integer add followed by a promotion.
+  if (SIToFPInst *LHSConv = dyn_cast<SIToFPInst>(LHS)) {
+    // (add double (sitofp x), fpcst) --> (sitofp (add int x, intcst))
+    // ... if the constant fits in the integer value.  This is useful for things
+    // like (double)(x & 1234) + 4.0 -> (double)((X & 1234)+4) which no longer
+    // requires a constant pool load, and generally allows the add to be better
+    // instcombined.
+    if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHS)) {
+      Constant *CI = 
+      ConstantExpr::getFPToSI(CFP, LHSConv->getOperand(0)->getType());
+      if (LHSConv->hasOneUse() &&
+          ConstantExpr::getSIToFP(CI, I.getType()) == CFP &&
+          WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI)) {
+        // Insert the new integer add.
+        Instruction *NewAdd = BinaryOperator::CreateAdd(LHSConv->getOperand(0), 
+                                                        CI, "addconv");
+        InsertNewInstBefore(NewAdd, I);
+        return new SIToFPInst(NewAdd, I.getType());
+      }
+    }
+    
+    // (add double (sitofp x), (sitofp y)) --> (sitofp (add int x, y))
+    if (SIToFPInst *RHSConv = dyn_cast<SIToFPInst>(RHS)) {
+      // Only do this if x/y have the same type, if at last one of them has a
+      // single use (so we don't increase the number of int->fp conversions),
+      // and if the integer add will not overflow.
+      if (LHSConv->getOperand(0)->getType()==RHSConv->getOperand(0)->getType()&&
+          (LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
+          WillNotOverflowSignedAdd(LHSConv->getOperand(0),
+                                   RHSConv->getOperand(0))) {
+        // Insert the new integer add.
+        Instruction *NewAdd = BinaryOperator::CreateAdd(LHSConv->getOperand(0), 
+                                                        RHSConv->getOperand(0),
+                                                        "addconv");
+        InsertNewInstBefore(NewAdd, I);
+        return new SIToFPInst(NewAdd, I.getType());
+      }
+    }
+  }
+  
+  return Changed ? &I : 0;
+}
+
+Instruction *InstCombiner::visitSub(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (Op0 == Op1 &&                        // sub X, X  -> 0
+      !I.getType()->isFPOrFPVector())
+    return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+
+  // If this is a 'B = x-(-A)', change to B = x+A...
+  if (Value *V = dyn_castNegVal(Op1))
+    return BinaryOperator::CreateAdd(Op0, V);
+
+  if (isa<UndefValue>(Op0))
+    return ReplaceInstUsesWith(I, Op0);    // undef - X -> undef
+  if (isa<UndefValue>(Op1))
+    return ReplaceInstUsesWith(I, Op1);    // X - undef -> undef
+
+  if (ConstantInt *C = dyn_cast<ConstantInt>(Op0)) {
+    // Replace (-1 - A) with (~A)...
+    if (C->isAllOnesValue())
+      return BinaryOperator::CreateNot(Op1);
+
+    // C - ~X == X + (1+C)
+    Value *X = 0;
+    if (match(Op1, m_Not(m_Value(X))))
+      return BinaryOperator::CreateAdd(X, AddOne(C));
+
+    // -(X >>u 31) -> (X >>s 31)
+    // -(X >>s 31) -> (X >>u 31)
+    if (C->isZero()) {
+      if (BinaryOperator *SI = dyn_cast<BinaryOperator>(Op1)) {
+        if (SI->getOpcode() == Instruction::LShr) {
+          if (ConstantInt *CU = dyn_cast<ConstantInt>(SI->getOperand(1))) {
+            // Check to see if we are shifting out everything but the sign bit.
+            if (CU->getLimitedValue(SI->getType()->getPrimitiveSizeInBits()) ==
+                SI->getType()->getPrimitiveSizeInBits()-1) {
+              // Ok, the transformation is safe.  Insert AShr.
+              return BinaryOperator::Create(Instruction::AShr, 
+                                          SI->getOperand(0), CU, SI->getName());
+            }
+          }
+        }
+        else if (SI->getOpcode() == Instruction::AShr) {
+          if (ConstantInt *CU = dyn_cast<ConstantInt>(SI->getOperand(1))) {
+            // Check to see if we are shifting out everything but the sign bit.
+            if (CU->getLimitedValue(SI->getType()->getPrimitiveSizeInBits()) ==
+                SI->getType()->getPrimitiveSizeInBits()-1) {
+              // Ok, the transformation is safe.  Insert LShr. 
+              return BinaryOperator::CreateLShr(
+                                          SI->getOperand(0), CU, SI->getName());
+            }
+          }
+        }
+      }
+    }
+
+    // Try to fold constant sub into select arguments.
+    if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
+      if (Instruction *R = FoldOpIntoSelect(I, SI, this))
+        return R;
+  }
+
+  if (I.getType() == Type::Int1Ty)
+    return BinaryOperator::CreateXor(Op0, Op1);
+
+  if (BinaryOperator *Op1I = dyn_cast<BinaryOperator>(Op1)) {
+    if (Op1I->getOpcode() == Instruction::Add &&
+        !Op0->getType()->isFPOrFPVector()) {
+      if (Op1I->getOperand(0) == Op0)              // X-(X+Y) == -Y
+        return BinaryOperator::CreateNeg(Op1I->getOperand(1), I.getName());
+      else if (Op1I->getOperand(1) == Op0)         // X-(Y+X) == -Y
+        return BinaryOperator::CreateNeg(Op1I->getOperand(0), I.getName());
+      else if (ConstantInt *CI1 = dyn_cast<ConstantInt>(I.getOperand(0))) {
+        if (ConstantInt *CI2 = dyn_cast<ConstantInt>(Op1I->getOperand(1)))
+          // C1-(X+C2) --> (C1-C2)-X
+          return BinaryOperator::CreateSub(Subtract(CI1, CI2), 
+                                           Op1I->getOperand(0));
+      }
+    }
+
+    if (Op1I->hasOneUse()) {
+      // Replace (x - (y - z)) with (x + (z - y)) if the (y - z) subexpression
+      // is not used by anyone else...
+      //
+      if (Op1I->getOpcode() == Instruction::Sub &&
+          !Op1I->getType()->isFPOrFPVector()) {
+        // Swap the two operands of the subexpr...
+        Value *IIOp0 = Op1I->getOperand(0), *IIOp1 = Op1I->getOperand(1);
+        Op1I->setOperand(0, IIOp1);
+        Op1I->setOperand(1, IIOp0);
+
+        // Create the new top level add instruction...
+        return BinaryOperator::CreateAdd(Op0, Op1);
+      }
+
+      // Replace (A - (A & B)) with (A & ~B) if this is the only use of (A&B)...
+      //
+      if (Op1I->getOpcode() == Instruction::And &&
+          (Op1I->getOperand(0) == Op0 || Op1I->getOperand(1) == Op0)) {
+        Value *OtherOp = Op1I->getOperand(Op1I->getOperand(0) == Op0);
+
+        Value *NewNot =
+          InsertNewInstBefore(BinaryOperator::CreateNot(OtherOp, "B.not"), I);
+        return BinaryOperator::CreateAnd(Op0, NewNot);
+      }
+
+      // 0 - (X sdiv C)  -> (X sdiv -C)
+      if (Op1I->getOpcode() == Instruction::SDiv)
+        if (ConstantInt *CSI = dyn_cast<ConstantInt>(Op0))
+          if (CSI->isZero())
+            if (Constant *DivRHS = dyn_cast<Constant>(Op1I->getOperand(1)))
+              return BinaryOperator::CreateSDiv(Op1I->getOperand(0),
+                                               ConstantExpr::getNeg(DivRHS));
+
+      // X - X*C --> X * (1-C)
+      ConstantInt *C2 = 0;
+      if (dyn_castFoldableMul(Op1I, C2) == Op0) {
+        Constant *CP1 = Subtract(ConstantInt::get(I.getType(), 1), C2);
+        return BinaryOperator::CreateMul(Op0, CP1);
+      }
+    }
+  }
+
+  if (!Op0->getType()->isFPOrFPVector())
+    if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0)) {
+      if (Op0I->getOpcode() == Instruction::Add) {
+        if (Op0I->getOperand(0) == Op1)             // (Y+X)-Y == X
+          return ReplaceInstUsesWith(I, Op0I->getOperand(1));
+        else if (Op0I->getOperand(1) == Op1)        // (X+Y)-Y == X
+          return ReplaceInstUsesWith(I, Op0I->getOperand(0));
+      } else if (Op0I->getOpcode() == Instruction::Sub) {
+        if (Op0I->getOperand(0) == Op1)             // (X-Y)-X == -Y
+          return BinaryOperator::CreateNeg(Op0I->getOperand(1), I.getName());
+      }
+    }
+
+  ConstantInt *C1;
+  if (Value *X = dyn_castFoldableMul(Op0, C1)) {
+    if (X == Op1)  // X*C - X --> X * (C-1)
+      return BinaryOperator::CreateMul(Op1, SubOne(C1));
+
+    ConstantInt *C2;   // X*C1 - X*C2 -> X * (C1-C2)
+    if (X == dyn_castFoldableMul(Op1, C2))
+      return BinaryOperator::CreateMul(X, Subtract(C1, C2));
+  }
+  return 0;
+}
+
+/// isSignBitCheck - Given an exploded icmp instruction, return true if the
+/// comparison only checks the sign bit.  If it only checks the sign bit, set
+/// TrueIfSigned if the result of the comparison is true when the input value is
+/// signed.
+static bool isSignBitCheck(ICmpInst::Predicate pred, ConstantInt *RHS,
+                           bool &TrueIfSigned) {
+  switch (pred) {
+  case ICmpInst::ICMP_SLT:   // True if LHS s< 0
+    TrueIfSigned = true;
+    return RHS->isZero();
+  case ICmpInst::ICMP_SLE:   // True if LHS s<= RHS and RHS == -1
+    TrueIfSigned = true;
+    return RHS->isAllOnesValue();
+  case ICmpInst::ICMP_SGT:   // True if LHS s> -1
+    TrueIfSigned = false;
+    return RHS->isAllOnesValue();
+  case ICmpInst::ICMP_UGT:
+    // True if LHS u> RHS and RHS == high-bit-mask - 1
+    TrueIfSigned = true;
+    return RHS->getValue() ==
+      APInt::getSignedMaxValue(RHS->getType()->getPrimitiveSizeInBits());
+  case ICmpInst::ICMP_UGE: 
+    // True if LHS u>= RHS and RHS == high-bit-mask (2^7, 2^15, 2^31, etc)
+    TrueIfSigned = true;
+    return RHS->getValue().isSignBit();
+  default:
+    return false;
+  }
+}
+
+Instruction *InstCombiner::visitMul(BinaryOperator &I) {
+  bool Changed = SimplifyCommutative(I);
+  Value *Op0 = I.getOperand(0);
+
+  if (isa<UndefValue>(I.getOperand(1)))              // undef * X -> 0
+    return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+
+  // Simplify mul instructions with a constant RHS...
+  if (Constant *Op1 = dyn_cast<Constant>(I.getOperand(1))) {
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
+
+      // ((X << C1)*C2) == (X * (C2 << C1))
+      if (BinaryOperator *SI = dyn_cast<BinaryOperator>(Op0))
+        if (SI->getOpcode() == Instruction::Shl)
+          if (Constant *ShOp = dyn_cast<Constant>(SI->getOperand(1)))
+            return BinaryOperator::CreateMul(SI->getOperand(0),
+                                             ConstantExpr::getShl(CI, ShOp));
+
+      if (CI->isZero())
+        return ReplaceInstUsesWith(I, Op1);  // X * 0  == 0
+      if (CI->equalsInt(1))                  // X * 1  == X
+        return ReplaceInstUsesWith(I, Op0);
+      if (CI->isAllOnesValue())              // X * -1 == 0 - X
+        return BinaryOperator::CreateNeg(Op0, I.getName());
+
+      const APInt& Val = cast<ConstantInt>(CI)->getValue();
+      if (Val.isPowerOf2()) {          // Replace X*(2^C) with X << C
+        return BinaryOperator::CreateShl(Op0,
+                 ConstantInt::get(Op0->getType(), Val.logBase2()));
+      }
+    } else if (ConstantFP *Op1F = dyn_cast<ConstantFP>(Op1)) {
+      if (Op1F->isNullValue())
+        return ReplaceInstUsesWith(I, Op1);
+
+      // "In IEEE floating point, x*1 is not equivalent to x for nans.  However,
+      // ANSI says we can drop signals, so we can do this anyway." (from GCC)
+      if (Op1F->isExactlyValue(1.0))
+        return ReplaceInstUsesWith(I, Op0);  // Eliminate 'mul double %X, 1.0'
+    } else if (isa<VectorType>(Op1->getType())) {
+      if (isa<ConstantAggregateZero>(Op1))
+        return ReplaceInstUsesWith(I, Op1);
+
+      if (ConstantVector *Op1V = dyn_cast<ConstantVector>(Op1)) {
+        if (Op1V->isAllOnesValue())              // X * -1 == 0 - X
+          return BinaryOperator::CreateNeg(Op0, I.getName());
+
+        // As above, vector X*splat(1.0) -> X in all defined cases.
+        if (Constant *Splat = Op1V->getSplatValue()) {
+          if (ConstantFP *F = dyn_cast<ConstantFP>(Splat))
+            if (F->isExactlyValue(1.0))
+              return ReplaceInstUsesWith(I, Op0);
+          if (ConstantInt *CI = dyn_cast<ConstantInt>(Splat))
+            if (CI->equalsInt(1))
+              return ReplaceInstUsesWith(I, Op0);
+        }
+      }
+    }
+    
+    if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0))
+      if (Op0I->getOpcode() == Instruction::Add && Op0I->hasOneUse() &&
+          isa<ConstantInt>(Op0I->getOperand(1)) && isa<ConstantInt>(Op1)) {
+        // Canonicalize (X+C1)*C2 -> X*C2+C1*C2.
+        Instruction *Add = BinaryOperator::CreateMul(Op0I->getOperand(0),
+                                                     Op1, "tmp");
+        InsertNewInstBefore(Add, I);
+        Value *C1C2 = ConstantExpr::getMul(Op1, 
+                                           cast<Constant>(Op0I->getOperand(1)));
+        return BinaryOperator::CreateAdd(Add, C1C2);
+        
+      }
+
+    // Try to fold constant mul into select arguments.
+    if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+      if (Instruction *R = FoldOpIntoSelect(I, SI, this))
+        return R;
+
+    if (isa<PHINode>(Op0))
+      if (Instruction *NV = FoldOpIntoPhi(I))
+        return NV;
+  }
+
+  if (Value *Op0v = dyn_castNegVal(Op0))     // -X * -Y = X*Y
+    if (Value *Op1v = dyn_castNegVal(I.getOperand(1)))
+      return BinaryOperator::CreateMul(Op0v, Op1v);
+
+  // (X / Y) *  Y = X - (X % Y)
+  // (X / Y) * -Y = (X % Y) - X
+  {
+    Value *Op1 = I.getOperand(1);
+    BinaryOperator *BO = dyn_cast<BinaryOperator>(Op0);
+    if (!BO ||
+        (BO->getOpcode() != Instruction::UDiv && 
+         BO->getOpcode() != Instruction::SDiv)) {
+      Op1 = Op0;
+      BO = dyn_cast<BinaryOperator>(I.getOperand(1));
+    }
+    Value *Neg = dyn_castNegVal(Op1);
+    if (BO && BO->hasOneUse() &&
+        (BO->getOperand(1) == Op1 || BO->getOperand(1) == Neg) &&
+        (BO->getOpcode() == Instruction::UDiv ||
+         BO->getOpcode() == Instruction::SDiv)) {
+      Value *Op0BO = BO->getOperand(0), *Op1BO = BO->getOperand(1);
+
+      Instruction *Rem;
+      if (BO->getOpcode() == Instruction::UDiv)
+        Rem = BinaryOperator::CreateURem(Op0BO, Op1BO);
+      else
+        Rem = BinaryOperator::CreateSRem(Op0BO, Op1BO);
+
+      InsertNewInstBefore(Rem, I);
+      Rem->takeName(BO);
+
+      if (Op1BO == Op1)
+        return BinaryOperator::CreateSub(Op0BO, Rem);
+      else
+        return BinaryOperator::CreateSub(Rem, Op0BO);
+    }
+  }
+
+  if (I.getType() == Type::Int1Ty)
+    return BinaryOperator::CreateAnd(Op0, I.getOperand(1));
+
+  // If one of the operands of the multiply is a cast from a boolean value, then
+  // we know the bool is either zero or one, so this is a 'masking' multiply.
+  // See if we can simplify things based on how the boolean was originally
+  // formed.
+  CastInst *BoolCast = 0;
+  if (ZExtInst *CI = dyn_cast<ZExtInst>(Op0))
+    if (CI->getOperand(0)->getType() == Type::Int1Ty)
+      BoolCast = CI;
+  if (!BoolCast)
+    if (ZExtInst *CI = dyn_cast<ZExtInst>(I.getOperand(1)))
+      if (CI->getOperand(0)->getType() == Type::Int1Ty)
+        BoolCast = CI;
+  if (BoolCast) {
+    if (ICmpInst *SCI = dyn_cast<ICmpInst>(BoolCast->getOperand(0))) {
+      Value *SCIOp0 = SCI->getOperand(0), *SCIOp1 = SCI->getOperand(1);
+      const Type *SCOpTy = SCIOp0->getType();
+      bool TIS = false;
+      
+      // If the icmp is true iff the sign bit of X is set, then convert this
+      // multiply into a shift/and combination.
+      if (isa<ConstantInt>(SCIOp1) &&
+          isSignBitCheck(SCI->getPredicate(), cast<ConstantInt>(SCIOp1), TIS) &&
+          TIS) {
+        // Shift the X value right to turn it into "all signbits".
+        Constant *Amt = ConstantInt::get(SCIOp0->getType(),
+                                          SCOpTy->getPrimitiveSizeInBits()-1);
+        Value *V =
+          InsertNewInstBefore(
+            BinaryOperator::Create(Instruction::AShr, SCIOp0, Amt,
+                                            BoolCast->getOperand(0)->getName()+
+                                            ".mask"), I);
+
+        // If the multiply type is not the same as the source type, sign extend
+        // or truncate to the multiply type.
+        if (I.getType() != V->getType()) {
+          uint32_t SrcBits = V->getType()->getPrimitiveSizeInBits();
+          uint32_t DstBits = I.getType()->getPrimitiveSizeInBits();
+          Instruction::CastOps opcode = 
+            (SrcBits == DstBits ? Instruction::BitCast : 
+             (SrcBits < DstBits ? Instruction::SExt : Instruction::Trunc));
+          V = InsertCastBefore(opcode, V, I.getType(), I);
+        }
+
+        Value *OtherOp = Op0 == BoolCast ? I.getOperand(1) : Op0;
+        return BinaryOperator::CreateAnd(V, OtherOp);
+      }
+    }
+  }
+
+  return Changed ? &I : 0;
+}
+
+/// SimplifyDivRemOfSelect - Try to fold a divide or remainder of a select
+/// instruction.
+bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) {
+  SelectInst *SI = cast<SelectInst>(I.getOperand(1));
+  
+  // div/rem X, (Cond ? 0 : Y) -> div/rem X, Y
+  int NonNullOperand = -1;
+  if (Constant *ST = dyn_cast<Constant>(SI->getOperand(1)))
+    if (ST->isNullValue())
+      NonNullOperand = 2;
+  // div/rem X, (Cond ? Y : 0) -> div/rem X, Y
+  if (Constant *ST = dyn_cast<Constant>(SI->getOperand(2)))
+    if (ST->isNullValue())
+      NonNullOperand = 1;
+  
+  if (NonNullOperand == -1)
+    return false;
+  
+  Value *SelectCond = SI->getOperand(0);
+  
+  // Change the div/rem to use 'Y' instead of the select.
+  I.setOperand(1, SI->getOperand(NonNullOperand));
+  
+  // Okay, we know we replace the operand of the div/rem with 'Y' with no
+  // problem.  However, the select, or the condition of the select may have
+  // multiple uses.  Based on our knowledge that the operand must be non-zero,
+  // propagate the known value for the select into other uses of it, and
+  // propagate a known value of the condition into its other users.
+  
+  // If the select and condition only have a single use, don't bother with this,
+  // early exit.
+  if (SI->use_empty() && SelectCond->hasOneUse())
+    return true;
+  
+  // Scan the current block backward, looking for other uses of SI.
+  BasicBlock::iterator BBI = &I, BBFront = I.getParent()->begin();
+  
+  while (BBI != BBFront) {
+    --BBI;
+    // If we found a call to a function, we can't assume it will return, so
+    // information from below it cannot be propagated above it.
+    if (isa<CallInst>(BBI) && !isa<IntrinsicInst>(BBI))
+      break;
+    
+    // Replace uses of the select or its condition with the known values.
+    for (Instruction::op_iterator I = BBI->op_begin(), E = BBI->op_end();
+         I != E; ++I) {
+      if (*I == SI) {
+        *I = SI->getOperand(NonNullOperand);
+        AddToWorkList(BBI);
+      } else if (*I == SelectCond) {
+        *I = NonNullOperand == 1 ? ConstantInt::getTrue() :
+                                   ConstantInt::getFalse();
+        AddToWorkList(BBI);
+      }
+    }
+    
+    // If we past the instruction, quit looking for it.
+    if (&*BBI == SI)
+      SI = 0;
+    if (&*BBI == SelectCond)
+      SelectCond = 0;
+    
+    // If we ran out of things to eliminate, break out of the loop.
+    if (SelectCond == 0 && SI == 0)
+      break;
+    
+  }
+  return true;
+}
+
+
+/// This function implements the transforms on div instructions that work
+/// regardless of the kind of div instruction it is (udiv, sdiv, or fdiv). It is
+/// used by the visitors to those instructions.
+/// @brief Transforms common to all three div instructions
+Instruction *InstCombiner::commonDivTransforms(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  // undef / X -> 0        for integer.
+  // undef / X -> undef    for FP (the undef could be a snan).
+  if (isa<UndefValue>(Op0)) {
+    if (Op0->getType()->isFPOrFPVector())
+      return ReplaceInstUsesWith(I, Op0);
+    return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+  }
+
+  // X / undef -> undef
+  if (isa<UndefValue>(Op1))
+    return ReplaceInstUsesWith(I, Op1);
+
+  return 0;
+}
+
+/// This function implements the transforms common to both integer division
+/// instructions (udiv and sdiv). It is called by the visitors to those integer
+/// division instructions.
+/// @brief Common integer divide transforms
+Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  // (sdiv X, X) --> 1     (udiv X, X) --> 1
+  if (Op0 == Op1) {
+    if (const VectorType *Ty = dyn_cast<VectorType>(I.getType())) {
+      ConstantInt *CI = ConstantInt::get(Ty->getElementType(), 1);
+      std::vector<Constant*> Elts(Ty->getNumElements(), CI);
+      return ReplaceInstUsesWith(I, ConstantVector::get(Elts));
+    }
+
+    ConstantInt *CI = ConstantInt::get(I.getType(), 1);
+    return ReplaceInstUsesWith(I, CI);
+  }
+  
+  if (Instruction *Common = commonDivTransforms(I))
+    return Common;
+  
+  // Handle cases involving: [su]div X, (select Cond, Y, Z)
+  // This does not apply for fdiv.
+  if (isa<SelectInst>(Op1) && SimplifyDivRemOfSelect(I))
+    return &I;
+
+  if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
+    // div X, 1 == X
+    if (RHS->equalsInt(1))
+      return ReplaceInstUsesWith(I, Op0);
+
+    // (X / C1) / C2  -> X / (C1*C2)
+    if (Instruction *LHS = dyn_cast<Instruction>(Op0))
+      if (Instruction::BinaryOps(LHS->getOpcode()) == I.getOpcode())
+        if (ConstantInt *LHSRHS = dyn_cast<ConstantInt>(LHS->getOperand(1))) {
+          if (MultiplyOverflows(RHS, LHSRHS, I.getOpcode()==Instruction::SDiv))
+            return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+          else 
+            return BinaryOperator::Create(I.getOpcode(), LHS->getOperand(0),
+                                          Multiply(RHS, LHSRHS));
+        }
+
+    if (!RHS->isZero()) { // avoid X udiv 0
+      if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+        if (Instruction *R = FoldOpIntoSelect(I, SI, this))
+          return R;
+      if (isa<PHINode>(Op0))
+        if (Instruction *NV = FoldOpIntoPhi(I))
+          return NV;
+    }
+  }
+
+  // 0 / X == 0, we don't need to preserve faults!
+  if (ConstantInt *LHS = dyn_cast<ConstantInt>(Op0))
+    if (LHS->equalsInt(0))
+      return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+
+  // It can't be division by zero, hence it must be division by one.
+  if (I.getType() == Type::Int1Ty)
+    return ReplaceInstUsesWith(I, Op0);
+
+  if (ConstantVector *Op1V = dyn_cast<ConstantVector>(Op1)) {
+    if (ConstantInt *X = cast_or_null<ConstantInt>(Op1V->getSplatValue()))
+      // div X, 1 == X
+      if (X->isOne())
+        return ReplaceInstUsesWith(I, Op0);
+  }
+
+  return 0;
+}
+
+Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  // Handle the integer div common cases
+  if (Instruction *Common = commonIDivTransforms(I))
+    return Common;
+
+  if (ConstantInt *C = dyn_cast<ConstantInt>(Op1)) {
+    // X udiv C^2 -> X >> C
+    // Check to see if this is an unsigned division with an exact power of 2,
+    // if so, convert to a right shift.
+    if (C->getValue().isPowerOf2())  // 0 not included in isPowerOf2
+      return BinaryOperator::CreateLShr(Op0, 
+               ConstantInt::get(Op0->getType(), C->getValue().logBase2()));
+
+    // X udiv C, where C >= signbit
+    if (C->getValue().isNegative()) {
+      Value *IC = InsertNewInstBefore(new ICmpInst(ICmpInst::ICMP_ULT, Op0, C),
+                                      I);
+      return SelectInst::Create(IC, Constant::getNullValue(I.getType()),
+                                ConstantInt::get(I.getType(), 1));
+    }
+  }
+
+  // X udiv (C1 << N), where C1 is "1<<C2"  -->  X >> (N+C2)
+  if (BinaryOperator *RHSI = dyn_cast<BinaryOperator>(I.getOperand(1))) {
+    if (RHSI->getOpcode() == Instruction::Shl &&
+        isa<ConstantInt>(RHSI->getOperand(0))) {
+      const APInt& C1 = cast<ConstantInt>(RHSI->getOperand(0))->getValue();
+      if (C1.isPowerOf2()) {
+        Value *N = RHSI->getOperand(1);
+        const Type *NTy = N->getType();
+        if (uint32_t C2 = C1.logBase2()) {
+          Constant *C2V = ConstantInt::get(NTy, C2);
+          N = InsertNewInstBefore(BinaryOperator::CreateAdd(N, C2V, "tmp"), I);
+        }
+        return BinaryOperator::CreateLShr(Op0, N);
+      }
+    }
+  }
+  
+  // udiv X, (Select Cond, C1, C2) --> Select Cond, (shr X, C1), (shr X, C2)
+  // where C1&C2 are powers of two.
+  if (SelectInst *SI = dyn_cast<SelectInst>(Op1)) 
+    if (ConstantInt *STO = dyn_cast<ConstantInt>(SI->getOperand(1)))
+      if (ConstantInt *SFO = dyn_cast<ConstantInt>(SI->getOperand(2)))  {
+        const APInt &TVA = STO->getValue(), &FVA = SFO->getValue();
+        if (TVA.isPowerOf2() && FVA.isPowerOf2()) {
+          // Compute the shift amounts
+          uint32_t TSA = TVA.logBase2(), FSA = FVA.logBase2();
+          // Construct the "on true" case of the select
+          Constant *TC = ConstantInt::get(Op0->getType(), TSA);
+          Instruction *TSI = BinaryOperator::CreateLShr(
+                                                 Op0, TC, SI->getName()+".t");
+          TSI = InsertNewInstBefore(TSI, I);
+  
+          // Construct the "on false" case of the select
+          Constant *FC = ConstantInt::get(Op0->getType(), FSA); 
+          Instruction *FSI = BinaryOperator::CreateLShr(
+                                                 Op0, FC, SI->getName()+".f");
+          FSI = InsertNewInstBefore(FSI, I);
+
+          // construct the select instruction and return it.
+          return SelectInst::Create(SI->getOperand(0), TSI, FSI, SI->getName());
+        }
+      }
+  return 0;
+}
+
+Instruction *InstCombiner::visitSDiv(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  // Handle the integer div common cases
+  if (Instruction *Common = commonIDivTransforms(I))
+    return Common;
+
+  if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
+    // sdiv X, -1 == -X
+    if (RHS->isAllOnesValue())
+      return BinaryOperator::CreateNeg(Op0);
+  }
+
+  // If the sign bits of both operands are zero (i.e. we can prove they are
+  // unsigned inputs), turn this into a udiv.
+  if (I.getType()->isInteger()) {
+    APInt Mask(APInt::getSignBit(I.getType()->getPrimitiveSizeInBits()));
+    if (MaskedValueIsZero(Op1, Mask) && MaskedValueIsZero(Op0, Mask)) {
+      // X sdiv Y -> X udiv Y, iff X and Y don't have sign bit set
+      return BinaryOperator::CreateUDiv(Op0, Op1, I.getName());
+    }
+  }      
+  
+  return 0;
+}
+
+Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
+  return commonDivTransforms(I);
+}
+
+/// This function implements the transforms on rem instructions that work
+/// regardless of the kind of rem instruction it is (urem, srem, or frem). It 
+/// is used by the visitors to those instructions.
+/// @brief Transforms common to all three rem instructions
+Instruction *InstCombiner::commonRemTransforms(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (isa<UndefValue>(Op0)) {             // undef % X -> 0
+    if (I.getType()->isFPOrFPVector())
+      return ReplaceInstUsesWith(I, Op0);  // X % undef -> undef (could be SNaN)
+    return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+  }
+  if (isa<UndefValue>(Op1))
+    return ReplaceInstUsesWith(I, Op1);  // X % undef -> undef
+
+  // Handle cases involving: rem X, (select Cond, Y, Z)
+  if (isa<SelectInst>(Op1) && SimplifyDivRemOfSelect(I))
+    return &I;
+
+  return 0;
+}
+
+/// This function implements the transforms common to both integer remainder
+/// instructions (urem and srem). It is called by the visitors to those integer
+/// remainder instructions.
+/// @brief Common integer remainder transforms
+Instruction *InstCombiner::commonIRemTransforms(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (Instruction *common = commonRemTransforms(I))
+    return common;
+
+  // 0 % X == 0 for integer, we don't need to preserve faults!
+  if (Constant *LHS = dyn_cast<Constant>(Op0))
+    if (LHS->isNullValue())
+      return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+
+  if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
+    // X % 0 == undef, we don't need to preserve faults!
+    if (RHS->equalsInt(0))
+      return ReplaceInstUsesWith(I, UndefValue::get(I.getType()));
+    
+    if (RHS->equalsInt(1))  // X % 1 == 0
+      return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+
+    if (Instruction *Op0I = dyn_cast<Instruction>(Op0)) {
+      if (SelectInst *SI = dyn_cast<SelectInst>(Op0I)) {
+        if (Instruction *R = FoldOpIntoSelect(I, SI, this))
+          return R;
+      } else if (isa<PHINode>(Op0I)) {
+        if (Instruction *NV = FoldOpIntoPhi(I))
+          return NV;
+      }
+
+      // See if we can fold away this rem instruction.
+      if (SimplifyDemandedInstructionBits(I))
+        return &I;
+    }
+  }
+
+  return 0;
+}
+
+Instruction *InstCombiner::visitURem(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (Instruction *common = commonIRemTransforms(I))
+    return common;
+  
+  if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
+    // X urem C^2 -> X and C
+    // Check to see if this is an unsigned remainder with an exact power of 2,
+    // if so, convert to a bitwise and.
+    if (ConstantInt *C = dyn_cast<ConstantInt>(RHS))
+      if (C->getValue().isPowerOf2())
+        return BinaryOperator::CreateAnd(Op0, SubOne(C));
+  }
+
+  if (Instruction *RHSI = dyn_cast<Instruction>(I.getOperand(1))) {
+    // Turn A % (C << N), where C is 2^k, into A & ((C << N)-1)  
+    if (RHSI->getOpcode() == Instruction::Shl &&
+        isa<ConstantInt>(RHSI->getOperand(0))) {
+      if (cast<ConstantInt>(RHSI->getOperand(0))->getValue().isPowerOf2()) {
+        Constant *N1 = ConstantInt::getAllOnesValue(I.getType());
+        Value *Add = InsertNewInstBefore(BinaryOperator::CreateAdd(RHSI, N1,
+                                                                   "tmp"), I);
+        return BinaryOperator::CreateAnd(Op0, Add);
+      }
+    }
+  }
+
+  // urem X, (select Cond, 2^C1, 2^C2) --> select Cond, (and X, C1), (and X, C2)
+  // where C1&C2 are powers of two.
+  if (SelectInst *SI = dyn_cast<SelectInst>(Op1)) {
+    if (ConstantInt *STO = dyn_cast<ConstantInt>(SI->getOperand(1)))
+      if (ConstantInt *SFO = dyn_cast<ConstantInt>(SI->getOperand(2))) {
+        // STO == 0 and SFO == 0 handled above.
+        if ((STO->getValue().isPowerOf2()) && 
+            (SFO->getValue().isPowerOf2())) {
+          Value *TrueAnd = InsertNewInstBefore(
+            BinaryOperator::CreateAnd(Op0, SubOne(STO), SI->getName()+".t"), I);
+          Value *FalseAnd = InsertNewInstBefore(
+            BinaryOperator::CreateAnd(Op0, SubOne(SFO), SI->getName()+".f"), I);
+          return SelectInst::Create(SI->getOperand(0), TrueAnd, FalseAnd);
+        }
+      }
+  }
+  
+  return 0;
+}
+
+Instruction *InstCombiner::visitSRem(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  // Handle the integer rem common cases
+  if (Instruction *common = commonIRemTransforms(I))
+    return common;
+  
+  if (Value *RHSNeg = dyn_castNegVal(Op1))
+    if (!isa<Constant>(RHSNeg) ||
+        (isa<ConstantInt>(RHSNeg) &&
+         cast<ConstantInt>(RHSNeg)->getValue().isStrictlyPositive())) {
+      // X % -Y -> X % Y
+      AddUsesToWorkList(I);
+      I.setOperand(1, RHSNeg);
+      return &I;
+    }
+
+  // If the sign bits of both operands are zero (i.e. we can prove they are
+  // unsigned inputs), turn this into a urem.
+  if (I.getType()->isInteger()) {
+    APInt Mask(APInt::getSignBit(I.getType()->getPrimitiveSizeInBits()));
+    if (MaskedValueIsZero(Op1, Mask) && MaskedValueIsZero(Op0, Mask)) {
+      // X srem Y -> X urem Y, iff X and Y don't have sign bit set
+      return BinaryOperator::CreateURem(Op0, Op1, I.getName());
+    }
+  }
+
+  // If it's a constant vector, flip any negative values positive.
+  if (ConstantVector *RHSV = dyn_cast<ConstantVector>(Op1)) {
+    unsigned VWidth = RHSV->getNumOperands();
+
+    bool hasNegative = false;
+    for (unsigned i = 0; !hasNegative && i != VWidth; ++i)
+      if (ConstantInt *RHS = dyn_cast<ConstantInt>(RHSV->getOperand(i)))
+        if (RHS->getValue().isNegative())
+          hasNegative = true;
+
+    if (hasNegative) {
+      std::vector<Constant *> Elts(VWidth);
+      for (unsigned i = 0; i != VWidth; ++i) {
+        if (ConstantInt *RHS = dyn_cast<ConstantInt>(RHSV->getOperand(i))) {
+          if (RHS->getValue().isNegative())
+            Elts[i] = cast<ConstantInt>(ConstantExpr::getNeg(RHS));
+          else
+            Elts[i] = RHS;
+        }
+      }
+
+      Constant *NewRHSV = ConstantVector::get(Elts);
+      if (NewRHSV != RHSV) {
+        AddUsesToWorkList(I);
+        I.setOperand(1, NewRHSV);
+        return &I;
+      }
+    }
+  }
+
+  return 0;
+}
+
+Instruction *InstCombiner::visitFRem(BinaryOperator &I) {
+  return commonRemTransforms(I);
+}
+
+// isOneBitSet - Return true if there is exactly one bit set in the specified
+// constant.
+static bool isOneBitSet(const ConstantInt *CI) {
+  return CI->getValue().isPowerOf2();
+}
+
+// isHighOnes - Return true if the constant is of the form 1+0+.
+// This is the same as lowones(~X).
+static bool isHighOnes(const ConstantInt *CI) {
+  return (~CI->getValue() + 1).isPowerOf2();
+}
+
+/// getICmpCode - Encode a icmp predicate into a three bit mask.  These bits
+/// are carefully arranged to allow folding of expressions such as:
+///
+///      (A < B) | (A > B) --> (A != B)
+///
+/// Note that this is only valid if the first and second predicates have the
+/// same sign. Is illegal to do: (A u< B) | (A s> B) 
+///
+/// Three bits are used to represent the condition, as follows:
+///   0  A > B
+///   1  A == B
+///   2  A < B
+///
+/// <=>  Value  Definition
+/// 000     0   Always false
+/// 001     1   A >  B
+/// 010     2   A == B
+/// 011     3   A >= B
+/// 100     4   A <  B
+/// 101     5   A != B
+/// 110     6   A <= B
+/// 111     7   Always true
+///  
+static unsigned getICmpCode(const ICmpInst *ICI) {
+  switch (ICI->getPredicate()) {
+    // False -> 0
+  case ICmpInst::ICMP_UGT: return 1;  // 001
+  case ICmpInst::ICMP_SGT: return 1;  // 001
+  case ICmpInst::ICMP_EQ:  return 2;  // 010
+  case ICmpInst::ICMP_UGE: return 3;  // 011
+  case ICmpInst::ICMP_SGE: return 3;  // 011
+  case ICmpInst::ICMP_ULT: return 4;  // 100
+  case ICmpInst::ICMP_SLT: return 4;  // 100
+  case ICmpInst::ICMP_NE:  return 5;  // 101
+  case ICmpInst::ICMP_ULE: return 6;  // 110
+  case ICmpInst::ICMP_SLE: return 6;  // 110
+    // True -> 7
+  default:
+    assert(0 && "Invalid ICmp predicate!");
+    return 0;
+  }
+}
+
+/// getFCmpCode - Similar to getICmpCode but for FCmpInst. This encodes a fcmp
+/// predicate into a three bit mask. It also returns whether it is an ordered
+/// predicate by reference.
+static unsigned getFCmpCode(FCmpInst::Predicate CC, bool &isOrdered) {
+  isOrdered = false;
+  switch (CC) {
+  case FCmpInst::FCMP_ORD: isOrdered = true; return 0;  // 000
+  case FCmpInst::FCMP_UNO:                   return 0;  // 000
+  case FCmpInst::FCMP_OGT: isOrdered = true; return 1;  // 001
+  case FCmpInst::FCMP_UGT:                   return 1;  // 001
+  case FCmpInst::FCMP_OEQ: isOrdered = true; return 2;  // 010
+  case FCmpInst::FCMP_UEQ:                   return 2;  // 010
+  case FCmpInst::FCMP_OGE: isOrdered = true; return 3;  // 011
+  case FCmpInst::FCMP_UGE:                   return 3;  // 011
+  case FCmpInst::FCMP_OLT: isOrdered = true; return 4;  // 100
+  case FCmpInst::FCMP_ULT:                   return 4;  // 100
+  case FCmpInst::FCMP_ONE: isOrdered = true; return 5;  // 101
+  case FCmpInst::FCMP_UNE:                   return 5;  // 101
+  case FCmpInst::FCMP_OLE: isOrdered = true; return 6;  // 110
+  case FCmpInst::FCMP_ULE:                   return 6;  // 110
+    // True -> 7
+  default:
+    // Not expecting FCMP_FALSE and FCMP_TRUE;
+    assert(0 && "Unexpected FCmp predicate!");
+    return 0;
+  }
+}
+
+/// getICmpValue - This is the complement of getICmpCode, which turns an
+/// opcode and two operands into either a constant true or false, or a brand 
+/// new ICmp instruction. The sign is passed in to determine which kind
+/// of predicate to use in the new icmp instruction.
+static Value *getICmpValue(bool sign, unsigned code, Value *LHS, Value *RHS) {
+  switch (code) {
+  default: assert(0 && "Illegal ICmp code!");
+  case  0: return ConstantInt::getFalse();
+  case  1: 
+    if (sign)
+      return new ICmpInst(ICmpInst::ICMP_SGT, LHS, RHS);
+    else
+      return new ICmpInst(ICmpInst::ICMP_UGT, LHS, RHS);
+  case  2: return new ICmpInst(ICmpInst::ICMP_EQ,  LHS, RHS);
+  case  3: 
+    if (sign)
+      return new ICmpInst(ICmpInst::ICMP_SGE, LHS, RHS);
+    else
+      return new ICmpInst(ICmpInst::ICMP_UGE, LHS, RHS);
+  case  4: 
+    if (sign)
+      return new ICmpInst(ICmpInst::ICMP_SLT, LHS, RHS);
+    else
+      return new ICmpInst(ICmpInst::ICMP_ULT, LHS, RHS);
+  case  5: return new ICmpInst(ICmpInst::ICMP_NE,  LHS, RHS);
+  case  6: 
+    if (sign)
+      return new ICmpInst(ICmpInst::ICMP_SLE, LHS, RHS);
+    else
+      return new ICmpInst(ICmpInst::ICMP_ULE, LHS, RHS);
+  case  7: return ConstantInt::getTrue();
+  }
+}
+
+/// getFCmpValue - This is the complement of getFCmpCode, which turns an
+/// opcode and two operands into either a FCmp instruction. isordered is passed
+/// in to determine which kind of predicate to use in the new fcmp instruction.
+static Value *getFCmpValue(bool isordered, unsigned code,
+                           Value *LHS, Value *RHS) {
+  switch (code) {
+  default: assert(0 && "Illegal FCmp code!");
+  case  0:
+    if (isordered)
+      return new FCmpInst(FCmpInst::FCMP_ORD, LHS, RHS);
+    else
+      return new FCmpInst(FCmpInst::FCMP_UNO, LHS, RHS);
+  case  1: 
+    if (isordered)
+      return new FCmpInst(FCmpInst::FCMP_OGT, LHS, RHS);
+    else
+      return new FCmpInst(FCmpInst::FCMP_UGT, LHS, RHS);
+  case  2: 
+    if (isordered)
+      return new FCmpInst(FCmpInst::FCMP_OEQ, LHS, RHS);
+    else
+      return new FCmpInst(FCmpInst::FCMP_UEQ, LHS, RHS);
+  case  3: 
+    if (isordered)
+      return new FCmpInst(FCmpInst::FCMP_OGE, LHS, RHS);
+    else
+      return new FCmpInst(FCmpInst::FCMP_UGE, LHS, RHS);
+  case  4: 
+    if (isordered)
+      return new FCmpInst(FCmpInst::FCMP_OLT, LHS, RHS);
+    else
+      return new FCmpInst(FCmpInst::FCMP_ULT, LHS, RHS);
+  case  5: 
+    if (isordered)
+      return new FCmpInst(FCmpInst::FCMP_ONE, LHS, RHS);
+    else
+      return new FCmpInst(FCmpInst::FCMP_UNE, LHS, RHS);
+  case  6: 
+    if (isordered)
+      return new FCmpInst(FCmpInst::FCMP_OLE, LHS, RHS);
+    else
+      return new FCmpInst(FCmpInst::FCMP_ULE, LHS, RHS);
+  case  7: return ConstantInt::getTrue();
+  }
+}
+
+/// PredicatesFoldable - Return true if both predicates match sign or if at
+/// least one of them is an equality comparison (which is signless).
+static bool PredicatesFoldable(ICmpInst::Predicate p1, ICmpInst::Predicate p2) {
+  return (ICmpInst::isSignedPredicate(p1) == ICmpInst::isSignedPredicate(p2)) ||
+         (ICmpInst::isSignedPredicate(p1) && ICmpInst::isEquality(p2)) ||
+         (ICmpInst::isSignedPredicate(p2) && ICmpInst::isEquality(p1));
+}
+
+namespace { 
+// FoldICmpLogical - Implements (icmp1 A, B) & (icmp2 A, B) --> (icmp3 A, B)
+struct FoldICmpLogical {
+  InstCombiner &IC;
+  Value *LHS, *RHS;
+  ICmpInst::Predicate pred;
+  FoldICmpLogical(InstCombiner &ic, ICmpInst *ICI)
+    : IC(ic), LHS(ICI->getOperand(0)), RHS(ICI->getOperand(1)),
+      pred(ICI->getPredicate()) {}
+  bool shouldApply(Value *V) const {
+    if (ICmpInst *ICI = dyn_cast<ICmpInst>(V))
+      if (PredicatesFoldable(pred, ICI->getPredicate()))
+        return ((ICI->getOperand(0) == LHS && ICI->getOperand(1) == RHS) ||
+                (ICI->getOperand(0) == RHS && ICI->getOperand(1) == LHS));
+    return false;
+  }
+  Instruction *apply(Instruction &Log) const {
+    ICmpInst *ICI = cast<ICmpInst>(Log.getOperand(0));
+    if (ICI->getOperand(0) != LHS) {
+      assert(ICI->getOperand(1) == LHS);
+      ICI->swapOperands();  // Swap the LHS and RHS of the ICmp
+    }
+
+    ICmpInst *RHSICI = cast<ICmpInst>(Log.getOperand(1));
+    unsigned LHSCode = getICmpCode(ICI);
+    unsigned RHSCode = getICmpCode(RHSICI);
+    unsigned Code;
+    switch (Log.getOpcode()) {
+    case Instruction::And: Code = LHSCode & RHSCode; break;
+    case Instruction::Or:  Code = LHSCode | RHSCode; break;
+    case Instruction::Xor: Code = LHSCode ^ RHSCode; break;
+    default: assert(0 && "Illegal logical opcode!"); return 0;
+    }
+
+    bool isSigned = ICmpInst::isSignedPredicate(RHSICI->getPredicate()) || 
+                    ICmpInst::isSignedPredicate(ICI->getPredicate());
+      
+    Value *RV = getICmpValue(isSigned, Code, LHS, RHS);
+    if (Instruction *I = dyn_cast<Instruction>(RV))
+      return I;
+    // Otherwise, it's a constant boolean value...
+    return IC.ReplaceInstUsesWith(Log, RV);
+  }
+};
+} // end anonymous namespace
+
+// OptAndOp - This handles expressions of the form ((val OP C1) & C2).  Where
+// the Op parameter is 'OP', OpRHS is 'C1', and AndRHS is 'C2'.  Op is
+// guaranteed to be a binary operator.
+Instruction *InstCombiner::OptAndOp(Instruction *Op,
+                                    ConstantInt *OpRHS,
+                                    ConstantInt *AndRHS,
+                                    BinaryOperator &TheAnd) {
+  Value *X = Op->getOperand(0);
+  Constant *Together = 0;
+  if (!Op->isShift())
+    Together = And(AndRHS, OpRHS);
+
+  switch (Op->getOpcode()) {
+  case Instruction::Xor:
+    if (Op->hasOneUse()) {
+      // (X ^ C1) & C2 --> (X & C2) ^ (C1&C2)
+      Instruction *And = BinaryOperator::CreateAnd(X, AndRHS);
+      InsertNewInstBefore(And, TheAnd);
+      And->takeName(Op);
+      return BinaryOperator::CreateXor(And, Together);
+    }
+    break;
+  case Instruction::Or:
+    if (Together == AndRHS) // (X | C) & C --> C
+      return ReplaceInstUsesWith(TheAnd, AndRHS);
+
+    if (Op->hasOneUse() && Together != OpRHS) {
+      // (X | C1) & C2 --> (X | (C1&C2)) & C2
+      Instruction *Or = BinaryOperator::CreateOr(X, Together);
+      InsertNewInstBefore(Or, TheAnd);
+      Or->takeName(Op);
+      return BinaryOperator::CreateAnd(Or, AndRHS);
+    }
+    break;
+  case Instruction::Add:
+    if (Op->hasOneUse()) {
+      // Adding a one to a single bit bit-field should be turned into an XOR
+      // of the bit.  First thing to check is to see if this AND is with a
+      // single bit constant.
+      const APInt& AndRHSV = cast<ConstantInt>(AndRHS)->getValue();
+
+      // If there is only one bit set...
+      if (isOneBitSet(cast<ConstantInt>(AndRHS))) {
+        // Ok, at this point, we know that we are masking the result of the
+        // ADD down to exactly one bit.  If the constant we are adding has
+        // no bits set below this bit, then we can eliminate the ADD.
+        const APInt& AddRHS = cast<ConstantInt>(OpRHS)->getValue();
+
+        // Check to see if any bits below the one bit set in AndRHSV are set.
+        if ((AddRHS & (AndRHSV-1)) == 0) {
+          // If not, the only thing that can effect the output of the AND is
+          // the bit specified by AndRHSV.  If that bit is set, the effect of
+          // the XOR is to toggle the bit.  If it is clear, then the ADD has
+          // no effect.
+          if ((AddRHS & AndRHSV) == 0) { // Bit is not set, noop
+            TheAnd.setOperand(0, X);
+            return &TheAnd;
+          } else {
+            // Pull the XOR out of the AND.
+            Instruction *NewAnd = BinaryOperator::CreateAnd(X, AndRHS);
+            InsertNewInstBefore(NewAnd, TheAnd);
+            NewAnd->takeName(Op);
+            return BinaryOperator::CreateXor(NewAnd, AndRHS);
+          }
+        }
+      }
+    }
+    break;
+
+  case Instruction::Shl: {
+    // We know that the AND will not produce any of the bits shifted in, so if
+    // the anded constant includes them, clear them now!
+    //
+    uint32_t BitWidth = AndRHS->getType()->getBitWidth();
+    uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth);
+    APInt ShlMask(APInt::getHighBitsSet(BitWidth, BitWidth-OpRHSVal));
+    ConstantInt *CI = ConstantInt::get(AndRHS->getValue() & ShlMask);
+
+    if (CI->getValue() == ShlMask) { 
+    // Masking out bits that the shift already masks
+      return ReplaceInstUsesWith(TheAnd, Op);   // No need for the and.
+    } else if (CI != AndRHS) {                  // Reducing bits set in and.
+      TheAnd.setOperand(1, CI);
+      return &TheAnd;
+    }
+    break;
+  }
+  case Instruction::LShr:
+  {
+    // We know that the AND will not produce any of the bits shifted in, so if
+    // the anded constant includes them, clear them now!  This only applies to
+    // unsigned shifts, because a signed shr may bring in set bits!
+    //
+    uint32_t BitWidth = AndRHS->getType()->getBitWidth();
+    uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth);
+    APInt ShrMask(APInt::getLowBitsSet(BitWidth, BitWidth - OpRHSVal));
+    ConstantInt *CI = ConstantInt::get(AndRHS->getValue() & ShrMask);
+
+    if (CI->getValue() == ShrMask) {   
+    // Masking out bits that the shift already masks.
+      return ReplaceInstUsesWith(TheAnd, Op);
+    } else if (CI != AndRHS) {
+      TheAnd.setOperand(1, CI);  // Reduce bits set in and cst.
+      return &TheAnd;
+    }
+    break;
+  }
+  case Instruction::AShr:
+    // Signed shr.
+    // See if this is shifting in some sign extension, then masking it out
+    // with an and.
+    if (Op->hasOneUse()) {
+      uint32_t BitWidth = AndRHS->getType()->getBitWidth();
+      uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth);
+      APInt ShrMask(APInt::getLowBitsSet(BitWidth, BitWidth - OpRHSVal));
+      Constant *C = ConstantInt::get(AndRHS->getValue() & ShrMask);
+      if (C == AndRHS) {          // Masking out bits shifted in.
+        // (Val ashr C1) & C2 -> (Val lshr C1) & C2
+        // Make the argument unsigned.
+        Value *ShVal = Op->getOperand(0);
+        ShVal = InsertNewInstBefore(
+            BinaryOperator::CreateLShr(ShVal, OpRHS, 
+                                   Op->getName()), TheAnd);
+        return BinaryOperator::CreateAnd(ShVal, AndRHS, TheAnd.getName());
+      }
+    }
+    break;
+  }
+  return 0;
+}
+
+
+/// InsertRangeTest - Emit a computation of: (V >= Lo && V < Hi) if Inside is
+/// true, otherwise (V < Lo || V >= Hi).  In pratice, we emit the more efficient
+/// (V-Lo) <u Hi-Lo.  This method expects that Lo <= Hi. isSigned indicates
+/// whether to treat the V, Lo and HI as signed or not. IB is the location to
+/// insert new instructions.
+Instruction *InstCombiner::InsertRangeTest(Value *V, Constant *Lo, Constant *Hi,
+                                           bool isSigned, bool Inside, 
+                                           Instruction &IB) {
+  assert(cast<ConstantInt>(ConstantExpr::getICmp((isSigned ? 
+            ICmpInst::ICMP_SLE:ICmpInst::ICMP_ULE), Lo, Hi))->getZExtValue() &&
+         "Lo is not <= Hi in range emission code!");
+    
+  if (Inside) {
+    if (Lo == Hi)  // Trivially false.
+      return new ICmpInst(ICmpInst::ICMP_NE, V, V);
+
+    // V >= Min && V < Hi --> V < Hi
+    if (cast<ConstantInt>(Lo)->isMinValue(isSigned)) {
+      ICmpInst::Predicate pred = (isSigned ? 
+        ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT);
+      return new ICmpInst(pred, V, Hi);
+    }
+
+    // Emit V-Lo <u Hi-Lo
+    Constant *NegLo = ConstantExpr::getNeg(Lo);
+    Instruction *Add = BinaryOperator::CreateAdd(V, NegLo, V->getName()+".off");
+    InsertNewInstBefore(Add, IB);
+    Constant *UpperBound = ConstantExpr::getAdd(NegLo, Hi);
+    return new ICmpInst(ICmpInst::ICMP_ULT, Add, UpperBound);
+  }
+
+  if (Lo == Hi)  // Trivially true.
+    return new ICmpInst(ICmpInst::ICMP_EQ, V, V);
+
+  // V < Min || V >= Hi -> V > Hi-1
+  Hi = SubOne(cast<ConstantInt>(Hi));
+  if (cast<ConstantInt>(Lo)->isMinValue(isSigned)) {
+    ICmpInst::Predicate pred = (isSigned ? 
+        ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT);
+    return new ICmpInst(pred, V, Hi);
+  }
+
+  // Emit V-Lo >u Hi-1-Lo
+  // Note that Hi has already had one subtracted from it, above.
+  ConstantInt *NegLo = cast<ConstantInt>(ConstantExpr::getNeg(Lo));
+  Instruction *Add = BinaryOperator::CreateAdd(V, NegLo, V->getName()+".off");
+  InsertNewInstBefore(Add, IB);
+  Constant *LowerBound = ConstantExpr::getAdd(NegLo, Hi);
+  return new ICmpInst(ICmpInst::ICMP_UGT, Add, LowerBound);
+}
+
+// isRunOfOnes - Returns true iff Val consists of one contiguous run of 1s with
+// any number of 0s on either side.  The 1s are allowed to wrap from LSB to
+// MSB, so 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs.  0x0F0F0000 is
+// not, since all 1s are not contiguous.
+static bool isRunOfOnes(ConstantInt *Val, uint32_t &MB, uint32_t &ME) {
+  const APInt& V = Val->getValue();
+  uint32_t BitWidth = Val->getType()->getBitWidth();
+  if (!APIntOps::isShiftedMask(BitWidth, V)) return false;
+
+  // look for the first zero bit after the run of ones
+  MB = BitWidth - ((V - 1) ^ V).countLeadingZeros();
+  // look for the first non-zero bit
+  ME = V.getActiveBits(); 
+  return true;
+}
+
+/// FoldLogicalPlusAnd - This is part of an expression (LHS +/- RHS) & Mask,
+/// where isSub determines whether the operator is a sub.  If we can fold one of
+/// the following xforms:
+/// 
+/// ((A & N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == Mask
+/// ((A | N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == 0
+/// ((A ^ N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == 0
+///
+/// return (A +/- B).
+///
+Value *InstCombiner::FoldLogicalPlusAnd(Value *LHS, Value *RHS,
+                                        ConstantInt *Mask, bool isSub,
+                                        Instruction &I) {
+  Instruction *LHSI = dyn_cast<Instruction>(LHS);
+  if (!LHSI || LHSI->getNumOperands() != 2 ||
+      !isa<ConstantInt>(LHSI->getOperand(1))) return 0;
+
+  ConstantInt *N = cast<ConstantInt>(LHSI->getOperand(1));
+
+  switch (LHSI->getOpcode()) {
+  default: return 0;
+  case Instruction::And:
+    if (And(N, Mask) == Mask) {
+      // If the AndRHS is a power of two minus one (0+1+), this is simple.
+      if ((Mask->getValue().countLeadingZeros() + 
+           Mask->getValue().countPopulation()) == 
+          Mask->getValue().getBitWidth())
+        break;
+
+      // Otherwise, if Mask is 0+1+0+, and if B is known to have the low 0+
+      // part, we don't need any explicit masks to take them out of A.  If that
+      // is all N is, ignore it.
+      uint32_t MB = 0, ME = 0;
+      if (isRunOfOnes(Mask, MB, ME)) {  // begin/end bit of run, inclusive
+        uint32_t BitWidth = cast<IntegerType>(RHS->getType())->getBitWidth();
+        APInt Mask(APInt::getLowBitsSet(BitWidth, MB-1));
+        if (MaskedValueIsZero(RHS, Mask))
+          break;
+      }
+    }
+    return 0;
+  case Instruction::Or:
+  case Instruction::Xor:
+    // If the AndRHS is a power of two minus one (0+1+), and N&Mask == 0
+    if ((Mask->getValue().countLeadingZeros() + 
+         Mask->getValue().countPopulation()) == Mask->getValue().getBitWidth()
+        && And(N, Mask)->isZero())
+      break;
+    return 0;
+  }
+  
+  Instruction *New;
+  if (isSub)
+    New = BinaryOperator::CreateSub(LHSI->getOperand(0), RHS, "fold");
+  else
+    New = BinaryOperator::CreateAdd(LHSI->getOperand(0), RHS, "fold");
+  return InsertNewInstBefore(New, I);
+}
+
+/// FoldAndOfICmps - Fold (icmp)&(icmp) if possible.
+Instruction *InstCombiner::FoldAndOfICmps(Instruction &I,
+                                          ICmpInst *LHS, ICmpInst *RHS) {
+  Value *Val, *Val2;
+  ConstantInt *LHSCst, *RHSCst;
+  ICmpInst::Predicate LHSCC, RHSCC;
+  
+  // This only handles icmp of constants: (icmp1 A, C1) & (icmp2 B, C2).
+  if (!match(LHS, m_ICmp(LHSCC, m_Value(Val), m_ConstantInt(LHSCst))) ||
+      !match(RHS, m_ICmp(RHSCC, m_Value(Val2), m_ConstantInt(RHSCst))))
+    return 0;
+  
+  // (icmp ult A, C) & (icmp ult B, C) --> (icmp ult (A|B), C)
+  // where C is a power of 2
+  if (LHSCst == RHSCst && LHSCC == RHSCC && LHSCC == ICmpInst::ICMP_ULT &&
+      LHSCst->getValue().isPowerOf2()) {
+    Instruction *NewOr = BinaryOperator::CreateOr(Val, Val2);
+    InsertNewInstBefore(NewOr, I);
+    return new ICmpInst(LHSCC, NewOr, LHSCst);
+  }
+  
+  // From here on, we only handle:
+  //    (icmp1 A, C1) & (icmp2 A, C2) --> something simpler.
+  if (Val != Val2) return 0;
+  
+  // ICMP_[US][GL]E X, CST is folded to ICMP_[US][GL]T elsewhere.
+  if (LHSCC == ICmpInst::ICMP_UGE || LHSCC == ICmpInst::ICMP_ULE ||
+      RHSCC == ICmpInst::ICMP_UGE || RHSCC == ICmpInst::ICMP_ULE ||
+      LHSCC == ICmpInst::ICMP_SGE || LHSCC == ICmpInst::ICMP_SLE ||
+      RHSCC == ICmpInst::ICMP_SGE || RHSCC == ICmpInst::ICMP_SLE)
+    return 0;
+  
+  // We can't fold (ugt x, C) & (sgt x, C2).
+  if (!PredicatesFoldable(LHSCC, RHSCC))
+    return 0;
+    
+  // Ensure that the larger constant is on the RHS.
+  bool ShouldSwap;
+  if (ICmpInst::isSignedPredicate(LHSCC) ||
+      (ICmpInst::isEquality(LHSCC) && 
+       ICmpInst::isSignedPredicate(RHSCC)))
+    ShouldSwap = LHSCst->getValue().sgt(RHSCst->getValue());
+  else
+    ShouldSwap = LHSCst->getValue().ugt(RHSCst->getValue());
+    
+  if (ShouldSwap) {
+    std::swap(LHS, RHS);
+    std::swap(LHSCst, RHSCst);
+    std::swap(LHSCC, RHSCC);
+  }
+
+  // At this point, we know we have have two icmp instructions
+  // comparing a value against two constants and and'ing the result
+  // together.  Because of the above check, we know that we only have
+  // icmp eq, icmp ne, icmp [su]lt, and icmp [SU]gt here. We also know 
+  // (from the FoldICmpLogical check above), that the two constants 
+  // are not equal and that the larger constant is on the RHS
+  assert(LHSCst != RHSCst && "Compares not folded above?");
+
+  switch (LHSCC) {
+  default: assert(0 && "Unknown integer condition code!");
+  case ICmpInst::ICMP_EQ:
+    switch (RHSCC) {
+    default: assert(0 && "Unknown integer condition code!");
+    case ICmpInst::ICMP_EQ:         // (X == 13 & X == 15) -> false
+    case ICmpInst::ICMP_UGT:        // (X == 13 & X >  15) -> false
+    case ICmpInst::ICMP_SGT:        // (X == 13 & X >  15) -> false
+      return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+    case ICmpInst::ICMP_NE:         // (X == 13 & X != 15) -> X == 13
+    case ICmpInst::ICMP_ULT:        // (X == 13 & X <  15) -> X == 13
+    case ICmpInst::ICMP_SLT:        // (X == 13 & X <  15) -> X == 13
+      return ReplaceInstUsesWith(I, LHS);
+    }
+  case ICmpInst::ICMP_NE:
+    switch (RHSCC) {
+    default: assert(0 && "Unknown integer condition code!");
+    case ICmpInst::ICMP_ULT:
+      if (LHSCst == SubOne(RHSCst)) // (X != 13 & X u< 14) -> X < 13
+        return new ICmpInst(ICmpInst::ICMP_ULT, Val, LHSCst);
+      break;                        // (X != 13 & X u< 15) -> no change
+    case ICmpInst::ICMP_SLT:
+      if (LHSCst == SubOne(RHSCst)) // (X != 13 & X s< 14) -> X < 13
+        return new ICmpInst(ICmpInst::ICMP_SLT, Val, LHSCst);
+      break;                        // (X != 13 & X s< 15) -> no change
+    case ICmpInst::ICMP_EQ:         // (X != 13 & X == 15) -> X == 15
+    case ICmpInst::ICMP_UGT:        // (X != 13 & X u> 15) -> X u> 15
+    case ICmpInst::ICMP_SGT:        // (X != 13 & X s> 15) -> X s> 15
+      return ReplaceInstUsesWith(I, RHS);
+    case ICmpInst::ICMP_NE:
+      if (LHSCst == SubOne(RHSCst)){// (X != 13 & X != 14) -> X-13 >u 1
+        Constant *AddCST = ConstantExpr::getNeg(LHSCst);
+        Instruction *Add = BinaryOperator::CreateAdd(Val, AddCST,
+                                                     Val->getName()+".off");
+        InsertNewInstBefore(Add, I);
+        return new ICmpInst(ICmpInst::ICMP_UGT, Add,
+                            ConstantInt::get(Add->getType(), 1));
+      }
+      break;                        // (X != 13 & X != 15) -> no change
+    }
+    break;
+  case ICmpInst::ICMP_ULT:
+    switch (RHSCC) {
+    default: assert(0 && "Unknown integer condition code!");
+    case ICmpInst::ICMP_EQ:         // (X u< 13 & X == 15) -> false
+    case ICmpInst::ICMP_UGT:        // (X u< 13 & X u> 15) -> false
+      return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+    case ICmpInst::ICMP_SGT:        // (X u< 13 & X s> 15) -> no change
+      break;
+    case ICmpInst::ICMP_NE:         // (X u< 13 & X != 15) -> X u< 13
+    case ICmpInst::ICMP_ULT:        // (X u< 13 & X u< 15) -> X u< 13
+      return ReplaceInstUsesWith(I, LHS);
+    case ICmpInst::ICMP_SLT:        // (X u< 13 & X s< 15) -> no change
+      break;
+    }
+    break;
+  case ICmpInst::ICMP_SLT:
+    switch (RHSCC) {
+    default: assert(0 && "Unknown integer condition code!");
+    case ICmpInst::ICMP_EQ:         // (X s< 13 & X == 15) -> false
+    case ICmpInst::ICMP_SGT:        // (X s< 13 & X s> 15) -> false
+      return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+    case ICmpInst::ICMP_UGT:        // (X s< 13 & X u> 15) -> no change
+      break;
+    case ICmpInst::ICMP_NE:         // (X s< 13 & X != 15) -> X < 13
+    case ICmpInst::ICMP_SLT:        // (X s< 13 & X s< 15) -> X < 13
+      return ReplaceInstUsesWith(I, LHS);
+    case ICmpInst::ICMP_ULT:        // (X s< 13 & X u< 15) -> no change
+      break;
+    }
+    break;
+  case ICmpInst::ICMP_UGT:
+    switch (RHSCC) {
+    default: assert(0 && "Unknown integer condition code!");
+    case ICmpInst::ICMP_EQ:         // (X u> 13 & X == 15) -> X == 15
+    case ICmpInst::ICMP_UGT:        // (X u> 13 & X u> 15) -> X u> 15
+      return ReplaceInstUsesWith(I, RHS);
+    case ICmpInst::ICMP_SGT:        // (X u> 13 & X s> 15) -> no change
+      break;
+    case ICmpInst::ICMP_NE:
+      if (RHSCst == AddOne(LHSCst)) // (X u> 13 & X != 14) -> X u> 14
+        return new ICmpInst(LHSCC, Val, RHSCst);
+      break;                        // (X u> 13 & X != 15) -> no change
+    case ICmpInst::ICMP_ULT:        // (X u> 13 & X u< 15) -> (X-14) <u 1
+      return InsertRangeTest(Val, AddOne(LHSCst), RHSCst, false, true, I);
+    case ICmpInst::ICMP_SLT:        // (X u> 13 & X s< 15) -> no change
+      break;
+    }
+    break;
+  case ICmpInst::ICMP_SGT:
+    switch (RHSCC) {
+    default: assert(0 && "Unknown integer condition code!");
+    case ICmpInst::ICMP_EQ:         // (X s> 13 & X == 15) -> X == 15
+    case ICmpInst::ICMP_SGT:        // (X s> 13 & X s> 15) -> X s> 15
+      return ReplaceInstUsesWith(I, RHS);
+    case ICmpInst::ICMP_UGT:        // (X s> 13 & X u> 15) -> no change
+      break;
+    case ICmpInst::ICMP_NE:
+      if (RHSCst == AddOne(LHSCst)) // (X s> 13 & X != 14) -> X s> 14
+        return new ICmpInst(LHSCC, Val, RHSCst);
+      break;                        // (X s> 13 & X != 15) -> no change
+    case ICmpInst::ICMP_SLT:        // (X s> 13 & X s< 15) -> (X-14) s< 1
+      return InsertRangeTest(Val, AddOne(LHSCst), RHSCst, true, true, I);
+    case ICmpInst::ICMP_ULT:        // (X s> 13 & X u< 15) -> no change
+      break;
+    }
+    break;
+  }
+ 
+  return 0;
+}
+
+
+Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
+  bool Changed = SimplifyCommutative(I);
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (isa<UndefValue>(Op1))                         // X & undef -> 0
+    return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+
+  // and X, X = X
+  if (Op0 == Op1)
+    return ReplaceInstUsesWith(I, Op1);
+
+  // See if we can simplify any instructions used by the instruction whose sole 
+  // purpose is to compute bits we don't care about.
+  if (!isa<VectorType>(I.getType())) {
+    if (SimplifyDemandedInstructionBits(I))
+      return &I;
+  } else {
+    if (ConstantVector *CP = dyn_cast<ConstantVector>(Op1)) {
+      if (CP->isAllOnesValue())            // X & <-1,-1> -> X
+        return ReplaceInstUsesWith(I, I.getOperand(0));
+    } else if (isa<ConstantAggregateZero>(Op1)) {
+      return ReplaceInstUsesWith(I, Op1);  // X & <0,0> -> <0,0>
+    }
+  }
+  
+  if (ConstantInt *AndRHS = dyn_cast<ConstantInt>(Op1)) {
+    const APInt& AndRHSMask = AndRHS->getValue();
+    APInt NotAndRHS(~AndRHSMask);
+
+    // Optimize a variety of ((val OP C1) & C2) combinations...
+    if (isa<BinaryOperator>(Op0)) {
+      Instruction *Op0I = cast<Instruction>(Op0);
+      Value *Op0LHS = Op0I->getOperand(0);
+      Value *Op0RHS = Op0I->getOperand(1);
+      switch (Op0I->getOpcode()) {
+      case Instruction::Xor:
+      case Instruction::Or:
+        // If the mask is only needed on one incoming arm, push it up.
+        if (Op0I->hasOneUse()) {
+          if (MaskedValueIsZero(Op0LHS, NotAndRHS)) {
+            // Not masking anything out for the LHS, move to RHS.
+            Instruction *NewRHS = BinaryOperator::CreateAnd(Op0RHS, AndRHS,
+                                                   Op0RHS->getName()+".masked");
+            InsertNewInstBefore(NewRHS, I);
+            return BinaryOperator::Create(
+                       cast<BinaryOperator>(Op0I)->getOpcode(), Op0LHS, NewRHS);
+          }
+          if (!isa<Constant>(Op0RHS) &&
+              MaskedValueIsZero(Op0RHS, NotAndRHS)) {
+            // Not masking anything out for the RHS, move to LHS.
+            Instruction *NewLHS = BinaryOperator::CreateAnd(Op0LHS, AndRHS,
+                                                   Op0LHS->getName()+".masked");
+            InsertNewInstBefore(NewLHS, I);
+            return BinaryOperator::Create(
+                       cast<BinaryOperator>(Op0I)->getOpcode(), NewLHS, Op0RHS);
+          }
+        }
+
+        break;
+      case Instruction::Add:
+        // ((A & N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == AndRHS.
+        // ((A | N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == 0
+        // ((A ^ N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == 0
+        if (Value *V = FoldLogicalPlusAnd(Op0LHS, Op0RHS, AndRHS, false, I))
+          return BinaryOperator::CreateAnd(V, AndRHS);
+        if (Value *V = FoldLogicalPlusAnd(Op0RHS, Op0LHS, AndRHS, false, I))
+          return BinaryOperator::CreateAnd(V, AndRHS);  // Add commutes
+        break;
+
+      case Instruction::Sub:
+        // ((A & N) - B) & AndRHS -> (A - B) & AndRHS iff N&AndRHS == AndRHS.
+        // ((A | N) - B) & AndRHS -> (A - B) & AndRHS iff N&AndRHS == 0
+        // ((A ^ N) - B) & AndRHS -> (A - B) & AndRHS iff N&AndRHS == 0
+        if (Value *V = FoldLogicalPlusAnd(Op0LHS, Op0RHS, AndRHS, true, I))
+          return BinaryOperator::CreateAnd(V, AndRHS);
+
+        // (A - N) & AndRHS -> -N & AndRHS iff A&AndRHS==0 and AndRHS
+        // has 1's for all bits that the subtraction with A might affect.
+        if (Op0I->hasOneUse()) {
+          uint32_t BitWidth = AndRHSMask.getBitWidth();
+          uint32_t Zeros = AndRHSMask.countLeadingZeros();
+          APInt Mask = APInt::getLowBitsSet(BitWidth, BitWidth - Zeros);
+
+          ConstantInt *A = dyn_cast<ConstantInt>(Op0LHS);
+          if (!(A && A->isZero()) &&               // avoid infinite recursion.
+              MaskedValueIsZero(Op0LHS, Mask)) {
+            Instruction *NewNeg = BinaryOperator::CreateNeg(Op0RHS);
+            InsertNewInstBefore(NewNeg, I);
+            return BinaryOperator::CreateAnd(NewNeg, AndRHS);
+          }
+        }
+        break;
+
+      case Instruction::Shl:
+      case Instruction::LShr:
+        // (1 << x) & 1 --> zext(x == 0)
+        // (1 >> x) & 1 --> zext(x == 0)
+        if (AndRHSMask == 1 && Op0LHS == AndRHS) {
+          Instruction *NewICmp = new ICmpInst(ICmpInst::ICMP_EQ, Op0RHS,
+                                           Constant::getNullValue(I.getType()));
+          InsertNewInstBefore(NewICmp, I);
+          return new ZExtInst(NewICmp, I.getType());
+        }
+        break;
+      }
+
+      if (ConstantInt *Op0CI = dyn_cast<ConstantInt>(Op0I->getOperand(1)))
+        if (Instruction *Res = OptAndOp(Op0I, Op0CI, AndRHS, I))
+          return Res;
+    } else if (CastInst *CI = dyn_cast<CastInst>(Op0)) {
+      // If this is an integer truncation or change from signed-to-unsigned, and
+      // if the source is an and/or with immediate, transform it.  This
+      // frequently occurs for bitfield accesses.
+      if (Instruction *CastOp = dyn_cast<Instruction>(CI->getOperand(0))) {
+        if ((isa<TruncInst>(CI) || isa<BitCastInst>(CI)) &&
+            CastOp->getNumOperands() == 2)
+          if (ConstantInt *AndCI = dyn_cast<ConstantInt>(CastOp->getOperand(1))) {
+            if (CastOp->getOpcode() == Instruction::And) {
+              // Change: and (cast (and X, C1) to T), C2
+              // into  : and (cast X to T), trunc_or_bitcast(C1)&C2
+              // This will fold the two constants together, which may allow 
+              // other simplifications.
+              Instruction *NewCast = CastInst::CreateTruncOrBitCast(
+                CastOp->getOperand(0), I.getType(), 
+                CastOp->getName()+".shrunk");
+              NewCast = InsertNewInstBefore(NewCast, I);
+              // trunc_or_bitcast(C1)&C2
+              Constant *C3 = ConstantExpr::getTruncOrBitCast(AndCI,I.getType());
+              C3 = ConstantExpr::getAnd(C3, AndRHS);
+              return BinaryOperator::CreateAnd(NewCast, C3);
+            } else if (CastOp->getOpcode() == Instruction::Or) {
+              // Change: and (cast (or X, C1) to T), C2
+              // into  : trunc(C1)&C2 iff trunc(C1)&C2 == C2
+              Constant *C3 = ConstantExpr::getTruncOrBitCast(AndCI,I.getType());
+              if (ConstantExpr::getAnd(C3, AndRHS) == AndRHS)   // trunc(C1)&C2
+                return ReplaceInstUsesWith(I, AndRHS);
+            }
+          }
+      }
+    }
+
+    // Try to fold constant and into select arguments.
+    if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+      if (Instruction *R = FoldOpIntoSelect(I, SI, this))
+        return R;
+    if (isa<PHINode>(Op0))
+      if (Instruction *NV = FoldOpIntoPhi(I))
+        return NV;
+  }
+
+  Value *Op0NotVal = dyn_castNotVal(Op0);
+  Value *Op1NotVal = dyn_castNotVal(Op1);
+
+  if (Op0NotVal == Op1 || Op1NotVal == Op0)  // A & ~A  == ~A & A == 0
+    return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+
+  // (~A & ~B) == (~(A | B)) - De Morgan's Law
+  if (Op0NotVal && Op1NotVal && isOnlyUse(Op0) && isOnlyUse(Op1)) {
+    Instruction *Or = BinaryOperator::CreateOr(Op0NotVal, Op1NotVal,
+                                               I.getName()+".demorgan");
+    InsertNewInstBefore(Or, I);
+    return BinaryOperator::CreateNot(Or);
+  }
+  
+  {
+    Value *A = 0, *B = 0, *C = 0, *D = 0;
+    if (match(Op0, m_Or(m_Value(A), m_Value(B)))) {
+      if (A == Op1 || B == Op1)    // (A | ?) & A  --> A
+        return ReplaceInstUsesWith(I, Op1);
+    
+      // (A|B) & ~(A&B) -> A^B
+      if (match(Op1, m_Not(m_And(m_Value(C), m_Value(D))))) {
+        if ((A == C && B == D) || (A == D && B == C))
+          return BinaryOperator::CreateXor(A, B);
+      }
+    }
+    
+    if (match(Op1, m_Or(m_Value(A), m_Value(B)))) {
+      if (A == Op0 || B == Op0)    // A & (A | ?)  --> A
+        return ReplaceInstUsesWith(I, Op0);
+
+      // ~(A&B) & (A|B) -> A^B
+      if (match(Op0, m_Not(m_And(m_Value(C), m_Value(D))))) {
+        if ((A == C && B == D) || (A == D && B == C))
+          return BinaryOperator::CreateXor(A, B);
+      }
+    }
+    
+    if (Op0->hasOneUse() &&
+        match(Op0, m_Xor(m_Value(A), m_Value(B)))) {
+      if (A == Op1) {                                // (A^B)&A -> A&(A^B)
+        I.swapOperands();     // Simplify below
+        std::swap(Op0, Op1);
+      } else if (B == Op1) {                         // (A^B)&B -> B&(B^A)
+        cast<BinaryOperator>(Op0)->swapOperands();
+        I.swapOperands();     // Simplify below
+        std::swap(Op0, Op1);
+      }
+    }
+
+    if (Op1->hasOneUse() &&
+        match(Op1, m_Xor(m_Value(A), m_Value(B)))) {
+      if (B == Op0) {                                // B&(A^B) -> B&(B^A)
+        cast<BinaryOperator>(Op1)->swapOperands();
+        std::swap(A, B);
+      }
+      if (A == Op0) {                                // A&(A^B) -> A & ~B
+        Instruction *NotB = BinaryOperator::CreateNot(B, "tmp");
+        InsertNewInstBefore(NotB, I);
+        return BinaryOperator::CreateAnd(A, NotB);
+      }
+    }
+
+    // (A&((~A)|B)) -> A&B
+    if (match(Op0, m_Or(m_Not(m_Specific(Op1)), m_Value(A))) ||
+        match(Op0, m_Or(m_Value(A), m_Not(m_Specific(Op1)))))
+      return BinaryOperator::CreateAnd(A, Op1);
+    if (match(Op1, m_Or(m_Not(m_Specific(Op0)), m_Value(A))) ||
+        match(Op1, m_Or(m_Value(A), m_Not(m_Specific(Op0)))))
+      return BinaryOperator::CreateAnd(A, Op0);
+  }
+  
+  if (ICmpInst *RHS = dyn_cast<ICmpInst>(Op1)) {
+    // (icmp1 A, B) & (icmp2 A, B) --> (icmp3 A, B)
+    if (Instruction *R = AssociativeOpt(I, FoldICmpLogical(*this, RHS)))
+      return R;
+
+    if (ICmpInst *LHS = dyn_cast<ICmpInst>(Op0))
+      if (Instruction *Res = FoldAndOfICmps(I, LHS, RHS))
+        return Res;
+  }
+
+  // fold (and (cast A), (cast B)) -> (cast (and A, B))
+  if (CastInst *Op0C = dyn_cast<CastInst>(Op0))
+    if (CastInst *Op1C = dyn_cast<CastInst>(Op1))
+      if (Op0C->getOpcode() == Op1C->getOpcode()) { // same cast kind ?
+        const Type *SrcTy = Op0C->getOperand(0)->getType();
+        if (SrcTy == Op1C->getOperand(0)->getType() && SrcTy->isInteger() &&
+            // Only do this if the casts both really cause code to be generated.
+            ValueRequiresCast(Op0C->getOpcode(), Op0C->getOperand(0), 
+                              I.getType(), TD) &&
+            ValueRequiresCast(Op1C->getOpcode(), Op1C->getOperand(0), 
+                              I.getType(), TD)) {
+          Instruction *NewOp = BinaryOperator::CreateAnd(Op0C->getOperand(0),
+                                                         Op1C->getOperand(0),
+                                                         I.getName());
+          InsertNewInstBefore(NewOp, I);
+          return CastInst::Create(Op0C->getOpcode(), NewOp, I.getType());
+        }
+      }
+    
+  // (X >> Z) & (Y >> Z)  -> (X&Y) >> Z  for all shifts.
+  if (BinaryOperator *SI1 = dyn_cast<BinaryOperator>(Op1)) {
+    if (BinaryOperator *SI0 = dyn_cast<BinaryOperator>(Op0))
+      if (SI0->isShift() && SI0->getOpcode() == SI1->getOpcode() && 
+          SI0->getOperand(1) == SI1->getOperand(1) &&
+          (SI0->hasOneUse() || SI1->hasOneUse())) {
+        Instruction *NewOp =
+          InsertNewInstBefore(BinaryOperator::CreateAnd(SI0->getOperand(0),
+                                                        SI1->getOperand(0),
+                                                        SI0->getName()), I);
+        return BinaryOperator::Create(SI1->getOpcode(), NewOp, 
+                                      SI1->getOperand(1));
+      }
+  }
+
+  // If and'ing two fcmp, try combine them into one.
+  if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0))) {
+    if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1))) {
+      if (LHS->getPredicate() == FCmpInst::FCMP_ORD &&
+          RHS->getPredicate() == FCmpInst::FCMP_ORD) {
+        // (fcmp ord x, c) & (fcmp ord y, c)  -> (fcmp ord x, y)
+        if (ConstantFP *LHSC = dyn_cast<ConstantFP>(LHS->getOperand(1)))
+          if (ConstantFP *RHSC = dyn_cast<ConstantFP>(RHS->getOperand(1))) {
+            // If either of the constants are nans, then the whole thing returns
+            // false.
+            if (LHSC->getValueAPF().isNaN() || RHSC->getValueAPF().isNaN())
+              return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+            return new FCmpInst(FCmpInst::FCMP_ORD, LHS->getOperand(0),
+                                RHS->getOperand(0));
+          }
+      } else {
+        Value *Op0LHS, *Op0RHS, *Op1LHS, *Op1RHS;
+        FCmpInst::Predicate Op0CC, Op1CC;
+        if (match(Op0, m_FCmp(Op0CC, m_Value(Op0LHS), m_Value(Op0RHS))) &&
+            match(Op1, m_FCmp(Op1CC, m_Value(Op1LHS), m_Value(Op1RHS)))) {
+          if (Op0LHS == Op1RHS && Op0RHS == Op1LHS) {
+            // Swap RHS operands to match LHS.
+            Op1CC = FCmpInst::getSwappedPredicate(Op1CC);
+            std::swap(Op1LHS, Op1RHS);
+          }
+          if (Op0LHS == Op1LHS && Op0RHS == Op1RHS) {
+            // Simplify (fcmp cc0 x, y) & (fcmp cc1 x, y).
+            if (Op0CC == Op1CC)
+              return new FCmpInst((FCmpInst::Predicate)Op0CC, Op0LHS, Op0RHS);
+            else if (Op0CC == FCmpInst::FCMP_FALSE ||
+                     Op1CC == FCmpInst::FCMP_FALSE)
+              return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+            else if (Op0CC == FCmpInst::FCMP_TRUE)
+              return ReplaceInstUsesWith(I, Op1);
+            else if (Op1CC == FCmpInst::FCMP_TRUE)
+              return ReplaceInstUsesWith(I, Op0);
+            bool Op0Ordered;
+            bool Op1Ordered;
+            unsigned Op0Pred = getFCmpCode(Op0CC, Op0Ordered);
+            unsigned Op1Pred = getFCmpCode(Op1CC, Op1Ordered);
+            if (Op1Pred == 0) {
+              std::swap(Op0, Op1);
+              std::swap(Op0Pred, Op1Pred);
+              std::swap(Op0Ordered, Op1Ordered);
+            }
+            if (Op0Pred == 0) {
+              // uno && ueq -> uno && (uno || eq) -> ueq
+              // ord && olt -> ord && (ord && lt) -> olt
+              if (Op0Ordered == Op1Ordered)
+                return ReplaceInstUsesWith(I, Op1);
+              // uno && oeq -> uno && (ord && eq) -> false
+              // uno && ord -> false
+              if (!Op0Ordered)
+                return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+              // ord && ueq -> ord && (uno || eq) -> oeq
+              return cast<Instruction>(getFCmpValue(true, Op1Pred,
+                                                    Op0LHS, Op0RHS));
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return Changed ? &I : 0;
+}
+
+/// CollectBSwapParts - Analyze the specified subexpression and see if it is
+/// capable of providing pieces of a bswap.  The subexpression provides pieces
+/// of a bswap if it is proven that each of the non-zero bytes in the output of
+/// the expression came from the corresponding "byte swapped" byte in some other
+/// value.  For example, if the current subexpression is "(shl i32 %X, 24)" then
+/// we know that the expression deposits the low byte of %X into the high byte
+/// of the bswap result and that all other bytes are zero.  This expression is
+/// accepted, the high byte of ByteValues is set to X to indicate a correct
+/// match.
+///
+/// This function returns true if the match was unsuccessful and false if so.
+/// On entry to the function the "OverallLeftShift" is a signed integer value
+/// indicating the number of bytes that the subexpression is later shifted.  For
+/// example, if the expression is later right shifted by 16 bits, the
+/// OverallLeftShift value would be -2 on entry.  This is used to specify which
+/// byte of ByteValues is actually being set.
+///
+/// Similarly, ByteMask is a bitmask where a bit is clear if its corresponding
+/// byte is masked to zero by a user.  For example, in (X & 255), X will be
+/// processed with a bytemask of 1.  Because bytemask is 32-bits, this limits
+/// this function to working on up to 32-byte (256 bit) values.  ByteMask is
+/// always in the local (OverallLeftShift) coordinate space.
+///
+static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask,
+                              SmallVector<Value*, 8> &ByteValues) {
+  if (Instruction *I = dyn_cast<Instruction>(V)) {
+    // If this is an or instruction, it may be an inner node of the bswap.
+    if (I->getOpcode() == Instruction::Or) {
+      return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask,
+                               ByteValues) ||
+             CollectBSwapParts(I->getOperand(1), OverallLeftShift, ByteMask,
+                               ByteValues);
+    }
+  
+    // If this is a logical shift by a constant multiple of 8, recurse with
+    // OverallLeftShift and ByteMask adjusted.
+    if (I->isLogicalShift() && isa<ConstantInt>(I->getOperand(1))) {
+      unsigned ShAmt = 
+        cast<ConstantInt>(I->getOperand(1))->getLimitedValue(~0U);
+      // Ensure the shift amount is defined and of a byte value.
+      if ((ShAmt & 7) || (ShAmt > 8*ByteValues.size()))
+        return true;
+
+      unsigned ByteShift = ShAmt >> 3;
+      if (I->getOpcode() == Instruction::Shl) {
+        // X << 2 -> collect(X, +2)
+        OverallLeftShift += ByteShift;
+        ByteMask >>= ByteShift;
+      } else {
+        // X >>u 2 -> collect(X, -2)
+        OverallLeftShift -= ByteShift;
+        ByteMask <<= ByteShift;
+        ByteMask &= (~0U >> (32-ByteValues.size()));
+      }
+
+      if (OverallLeftShift >= (int)ByteValues.size()) return true;
+      if (OverallLeftShift <= -(int)ByteValues.size()) return true;
+
+      return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask, 
+                               ByteValues);
+    }
+
+    // If this is a logical 'and' with a mask that clears bytes, clear the
+    // corresponding bytes in ByteMask.
+    if (I->getOpcode() == Instruction::And &&
+        isa<ConstantInt>(I->getOperand(1))) {
+      // Scan every byte of the and mask, seeing if the byte is either 0 or 255.
+      unsigned NumBytes = ByteValues.size();
+      APInt Byte(I->getType()->getPrimitiveSizeInBits(), 255);
+      const APInt &AndMask = cast<ConstantInt>(I->getOperand(1))->getValue();
+      
+      for (unsigned i = 0; i != NumBytes; ++i, Byte <<= 8) {
+        // If this byte is masked out by a later operation, we don't care what
+        // the and mask is.
+        if ((ByteMask & (1 << i)) == 0)
+          continue;
+        
+        // If the AndMask is all zeros for this byte, clear the bit.
+        APInt MaskB = AndMask & Byte;
+        if (MaskB == 0) {
+          ByteMask &= ~(1U << i);
+          continue;
+        }
+        
+        // If the AndMask is not all ones for this byte, it's not a bytezap.
+        if (MaskB != Byte)
+          return true;
+
+        // Otherwise, this byte is kept.
+      }
+
+      return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask, 
+                               ByteValues);
+    }
+  }
+  
+  // Okay, we got to something that isn't a shift, 'or' or 'and'.  This must be
+  // the input value to the bswap.  Some observations: 1) if more than one byte
+  // is demanded from this input, then it could not be successfully assembled
+  // into a byteswap.  At least one of the two bytes would not be aligned with
+  // their ultimate destination.
+  if (!isPowerOf2_32(ByteMask)) return true;
+  unsigned InputByteNo = CountTrailingZeros_32(ByteMask);
+  
+  // 2) The input and ultimate destinations must line up: if byte 3 of an i32
+  // is demanded, it needs to go into byte 0 of the result.  This means that the
+  // byte needs to be shifted until it lands in the right byte bucket.  The
+  // shift amount depends on the position: if the byte is coming from the high
+  // part of the value (e.g. byte 3) then it must be shifted right.  If from the
+  // low part, it must be shifted left.
+  unsigned DestByteNo = InputByteNo + OverallLeftShift;
+  if (InputByteNo < ByteValues.size()/2) {
+    if (ByteValues.size()-1-DestByteNo != InputByteNo)
+      return true;
+  } else {
+    if (ByteValues.size()-1-DestByteNo != InputByteNo)
+      return true;
+  }
+  
+  // If the destination byte value is already defined, the values are or'd
+  // together, which isn't a bswap (unless it's an or of the same bits).
+  if (ByteValues[DestByteNo] && ByteValues[DestByteNo] != V)
+    return true;
+  ByteValues[DestByteNo] = V;
+  return false;
+}
+
+/// MatchBSwap - Given an OR instruction, check to see if this is a bswap idiom.
+/// If so, insert the new bswap intrinsic and return it.
+Instruction *InstCombiner::MatchBSwap(BinaryOperator &I) {
+  const IntegerType *ITy = dyn_cast<IntegerType>(I.getType());
+  if (!ITy || ITy->getBitWidth() % 16 || 
+      // ByteMask only allows up to 32-byte values.
+      ITy->getBitWidth() > 32*8) 
+    return 0;   // Can only bswap pairs of bytes.  Can't do vectors.
+  
+  /// ByteValues - For each byte of the result, we keep track of which value
+  /// defines each byte.
+  SmallVector<Value*, 8> ByteValues;
+  ByteValues.resize(ITy->getBitWidth()/8);
+    
+  // Try to find all the pieces corresponding to the bswap.
+  uint32_t ByteMask = ~0U >> (32-ByteValues.size());
+  if (CollectBSwapParts(&I, 0, ByteMask, ByteValues))
+    return 0;
+  
+  // Check to see if all of the bytes come from the same value.
+  Value *V = ByteValues[0];
+  if (V == 0) return 0;  // Didn't find a byte?  Must be zero.
+  
+  // Check to make sure that all of the bytes come from the same value.
+  for (unsigned i = 1, e = ByteValues.size(); i != e; ++i)
+    if (ByteValues[i] != V)
+      return 0;
+  const Type *Tys[] = { ITy };
+  Module *M = I.getParent()->getParent()->getParent();
+  Function *F = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1);
+  return CallInst::Create(F, V);
+}
+
+/// MatchSelectFromAndOr - We have an expression of the form (A&C)|(B&D).  Check
+/// If A is (cond?-1:0) and either B or D is ~(cond?-1,0) or (cond?0,-1), then
+/// we can simplify this expression to "cond ? C : D or B".
+static Instruction *MatchSelectFromAndOr(Value *A, Value *B,
+                                         Value *C, Value *D) {
+  // If A is not a select of -1/0, this cannot match.
+  Value *Cond = 0;
+  if (!match(A, m_SelectCst<-1, 0>(m_Value(Cond))))
+    return 0;
+
+  // ((cond?-1:0)&C) | (B&(cond?0:-1)) -> cond ? C : B.
+  if (match(D, m_SelectCst<0, -1>(m_Specific(Cond))))
+    return SelectInst::Create(Cond, C, B);
+  if (match(D, m_Not(m_SelectCst<-1, 0>(m_Specific(Cond)))))
+    return SelectInst::Create(Cond, C, B);
+  // ((cond?-1:0)&C) | ((cond?0:-1)&D) -> cond ? C : D.
+  if (match(B, m_SelectCst<0, -1>(m_Specific(Cond))))
+    return SelectInst::Create(Cond, C, D);
+  if (match(B, m_Not(m_SelectCst<-1, 0>(m_Specific(Cond)))))
+    return SelectInst::Create(Cond, C, D);
+  return 0;
+}
+
+/// FoldOrOfICmps - Fold (icmp)|(icmp) if possible.
+Instruction *InstCombiner::FoldOrOfICmps(Instruction &I,
+                                         ICmpInst *LHS, ICmpInst *RHS) {
+  Value *Val, *Val2;
+  ConstantInt *LHSCst, *RHSCst;
+  ICmpInst::Predicate LHSCC, RHSCC;
+  
+  // This only handles icmp of constants: (icmp1 A, C1) | (icmp2 B, C2).
+  if (!match(LHS, m_ICmp(LHSCC, m_Value(Val), m_ConstantInt(LHSCst))) ||
+      !match(RHS, m_ICmp(RHSCC, m_Value(Val2), m_ConstantInt(RHSCst))))
+    return 0;
+  
+  // From here on, we only handle:
+  //    (icmp1 A, C1) | (icmp2 A, C2) --> something simpler.
+  if (Val != Val2) return 0;
+  
+  // ICMP_[US][GL]E X, CST is folded to ICMP_[US][GL]T elsewhere.
+  if (LHSCC == ICmpInst::ICMP_UGE || LHSCC == ICmpInst::ICMP_ULE ||
+      RHSCC == ICmpInst::ICMP_UGE || RHSCC == ICmpInst::ICMP_ULE ||
+      LHSCC == ICmpInst::ICMP_SGE || LHSCC == ICmpInst::ICMP_SLE ||
+      RHSCC == ICmpInst::ICMP_SGE || RHSCC == ICmpInst::ICMP_SLE)
+    return 0;
+  
+  // We can't fold (ugt x, C) | (sgt x, C2).
+  if (!PredicatesFoldable(LHSCC, RHSCC))
+    return 0;
+  
+  // Ensure that the larger constant is on the RHS.
+  bool ShouldSwap;
+  if (ICmpInst::isSignedPredicate(LHSCC) ||
+      (ICmpInst::isEquality(LHSCC) && 
+       ICmpInst::isSignedPredicate(RHSCC)))
+    ShouldSwap = LHSCst->getValue().sgt(RHSCst->getValue());
+  else
+    ShouldSwap = LHSCst->getValue().ugt(RHSCst->getValue());
+  
+  if (ShouldSwap) {
+    std::swap(LHS, RHS);
+    std::swap(LHSCst, RHSCst);
+    std::swap(LHSCC, RHSCC);
+  }
+  
+  // At this point, we know we have have two icmp instructions
+  // comparing a value against two constants and or'ing the result
+  // together.  Because of the above check, we know that we only have
+  // ICMP_EQ, ICMP_NE, ICMP_LT, and ICMP_GT here. We also know (from the
+  // FoldICmpLogical check above), that the two constants are not
+  // equal.
+  assert(LHSCst != RHSCst && "Compares not folded above?");
+
+  switch (LHSCC) {
+  default: assert(0 && "Unknown integer condition code!");
+  case ICmpInst::ICMP_EQ:
+    switch (RHSCC) {
+    default: assert(0 && "Unknown integer condition code!");
+    case ICmpInst::ICMP_EQ:
+      if (LHSCst == SubOne(RHSCst)) { // (X == 13 | X == 14) -> X-13 <u 2
+        Constant *AddCST = ConstantExpr::getNeg(LHSCst);
+        Instruction *Add = BinaryOperator::CreateAdd(Val, AddCST,
+                                                     Val->getName()+".off");
+        InsertNewInstBefore(Add, I);
+        AddCST = Subtract(AddOne(RHSCst), LHSCst);
+        return new ICmpInst(ICmpInst::ICMP_ULT, Add, AddCST);
+      }
+      break;                         // (X == 13 | X == 15) -> no change
+    case ICmpInst::ICMP_UGT:         // (X == 13 | X u> 14) -> no change
+    case ICmpInst::ICMP_SGT:         // (X == 13 | X s> 14) -> no change
+      break;
+    case ICmpInst::ICMP_NE:          // (X == 13 | X != 15) -> X != 15
+    case ICmpInst::ICMP_ULT:         // (X == 13 | X u< 15) -> X u< 15
+    case ICmpInst::ICMP_SLT:         // (X == 13 | X s< 15) -> X s< 15
+      return ReplaceInstUsesWith(I, RHS);
+    }
+    break;
+  case ICmpInst::ICMP_NE:
+    switch (RHSCC) {
+    default: assert(0 && "Unknown integer condition code!");
+    case ICmpInst::ICMP_EQ:          // (X != 13 | X == 15) -> X != 13
+    case ICmpInst::ICMP_UGT:         // (X != 13 | X u> 15) -> X != 13
+    case ICmpInst::ICMP_SGT:         // (X != 13 | X s> 15) -> X != 13
+      return ReplaceInstUsesWith(I, LHS);
+    case ICmpInst::ICMP_NE:          // (X != 13 | X != 15) -> true
+    case ICmpInst::ICMP_ULT:         // (X != 13 | X u< 15) -> true
+    case ICmpInst::ICMP_SLT:         // (X != 13 | X s< 15) -> true
+      return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+    }
+    break;
+  case ICmpInst::ICMP_ULT:
+    switch (RHSCC) {
+    default: assert(0 && "Unknown integer condition code!");
+    case ICmpInst::ICMP_EQ:         // (X u< 13 | X == 14) -> no change
+      break;
+    case ICmpInst::ICMP_UGT:        // (X u< 13 | X u> 15) -> (X-13) u> 2
+      // If RHSCst is [us]MAXINT, it is always false.  Not handling
+      // this can cause overflow.
+      if (RHSCst->isMaxValue(false))
+        return ReplaceInstUsesWith(I, LHS);
+      return InsertRangeTest(Val, LHSCst, AddOne(RHSCst), false, false, I);
+    case ICmpInst::ICMP_SGT:        // (X u< 13 | X s> 15) -> no change
+      break;
+    case ICmpInst::ICMP_NE:         // (X u< 13 | X != 15) -> X != 15
+    case ICmpInst::ICMP_ULT:        // (X u< 13 | X u< 15) -> X u< 15
+      return ReplaceInstUsesWith(I, RHS);
+    case ICmpInst::ICMP_SLT:        // (X u< 13 | X s< 15) -> no change
+      break;
+    }
+    break;
+  case ICmpInst::ICMP_SLT:
+    switch (RHSCC) {
+    default: assert(0 && "Unknown integer condition code!");
+    case ICmpInst::ICMP_EQ:         // (X s< 13 | X == 14) -> no change
+      break;
+    case ICmpInst::ICMP_SGT:        // (X s< 13 | X s> 15) -> (X-13) s> 2
+      // If RHSCst is [us]MAXINT, it is always false.  Not handling
+      // this can cause overflow.
+      if (RHSCst->isMaxValue(true))
+        return ReplaceInstUsesWith(I, LHS);
+      return InsertRangeTest(Val, LHSCst, AddOne(RHSCst), true, false, I);
+    case ICmpInst::ICMP_UGT:        // (X s< 13 | X u> 15) -> no change
+      break;
+    case ICmpInst::ICMP_NE:         // (X s< 13 | X != 15) -> X != 15
+    case ICmpInst::ICMP_SLT:        // (X s< 13 | X s< 15) -> X s< 15
+      return ReplaceInstUsesWith(I, RHS);
+    case ICmpInst::ICMP_ULT:        // (X s< 13 | X u< 15) -> no change
+      break;
+    }
+    break;
+  case ICmpInst::ICMP_UGT:
+    switch (RHSCC) {
+    default: assert(0 && "Unknown integer condition code!");
+    case ICmpInst::ICMP_EQ:         // (X u> 13 | X == 15) -> X u> 13
+    case ICmpInst::ICMP_UGT:        // (X u> 13 | X u> 15) -> X u> 13
+      return ReplaceInstUsesWith(I, LHS);
+    case ICmpInst::ICMP_SGT:        // (X u> 13 | X s> 15) -> no change
+      break;
+    case ICmpInst::ICMP_NE:         // (X u> 13 | X != 15) -> true
+    case ICmpInst::ICMP_ULT:        // (X u> 13 | X u< 15) -> true
+      return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+    case ICmpInst::ICMP_SLT:        // (X u> 13 | X s< 15) -> no change
+      break;
+    }
+    break;
+  case ICmpInst::ICMP_SGT:
+    switch (RHSCC) {
+    default: assert(0 && "Unknown integer condition code!");
+    case ICmpInst::ICMP_EQ:         // (X s> 13 | X == 15) -> X > 13
+    case ICmpInst::ICMP_SGT:        // (X s> 13 | X s> 15) -> X > 13
+      return ReplaceInstUsesWith(I, LHS);
+    case ICmpInst::ICMP_UGT:        // (X s> 13 | X u> 15) -> no change
+      break;
+    case ICmpInst::ICMP_NE:         // (X s> 13 | X != 15) -> true
+    case ICmpInst::ICMP_SLT:        // (X s> 13 | X s< 15) -> true
+      return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+    case ICmpInst::ICMP_ULT:        // (X s> 13 | X u< 15) -> no change
+      break;
+    }
+    break;
+  }
+  return 0;
+}
+
+/// FoldOrWithConstants - This helper function folds:
+///
+///     ((A | B) & C1) | (B & C2)
+///
+/// into:
+/// 
+///     (A & C1) | B
+///
+/// when the XOR of the two constants is "all ones" (-1).
+Instruction *InstCombiner::FoldOrWithConstants(BinaryOperator &I, Value *Op,
+                                               Value *A, Value *B, Value *C) {
+  ConstantInt *CI1 = dyn_cast<ConstantInt>(C);
+  if (!CI1) return 0;
+
+  Value *V1 = 0;
+  ConstantInt *CI2 = 0;
+  if (!match(Op, m_And(m_Value(V1), m_ConstantInt(CI2)))) return 0;
+
+  APInt Xor = CI1->getValue() ^ CI2->getValue();
+  if (!Xor.isAllOnesValue()) return 0;
+
+  if (V1 == A || V1 == B) {
+    Instruction *NewOp =
+      InsertNewInstBefore(BinaryOperator::CreateAnd((V1 == A) ? B : A, CI1), I);
+    return BinaryOperator::CreateOr(NewOp, V1);
+  }
+
+  return 0;
+}
+
+Instruction *InstCombiner::visitOr(BinaryOperator &I) {
+  bool Changed = SimplifyCommutative(I);
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (isa<UndefValue>(Op1))                       // X | undef -> -1
+    return ReplaceInstUsesWith(I, Constant::getAllOnesValue(I.getType()));
+
+  // or X, X = X
+  if (Op0 == Op1)
+    return ReplaceInstUsesWith(I, Op0);
+
+  // See if we can simplify any instructions used by the instruction whose sole 
+  // purpose is to compute bits we don't care about.
+  if (!isa<VectorType>(I.getType())) {
+    if (SimplifyDemandedInstructionBits(I))
+      return &I;
+  } else if (isa<ConstantAggregateZero>(Op1)) {
+    return ReplaceInstUsesWith(I, Op0);  // X | <0,0> -> X
+  } else if (ConstantVector *CP = dyn_cast<ConstantVector>(Op1)) {
+    if (CP->isAllOnesValue())            // X | <-1,-1> -> <-1,-1>
+      return ReplaceInstUsesWith(I, I.getOperand(1));
+  }
+    
+
+  
+  // or X, -1 == -1
+  if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
+    ConstantInt *C1 = 0; Value *X = 0;
+    // (X & C1) | C2 --> (X | C2) & (C1|C2)
+    if (match(Op0, m_And(m_Value(X), m_ConstantInt(C1))) && isOnlyUse(Op0)) {
+      Instruction *Or = BinaryOperator::CreateOr(X, RHS);
+      InsertNewInstBefore(Or, I);
+      Or->takeName(Op0);
+      return BinaryOperator::CreateAnd(Or, 
+               ConstantInt::get(RHS->getValue() | C1->getValue()));
+    }
+
+    // (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2)
+    if (match(Op0, m_Xor(m_Value(X), m_ConstantInt(C1))) && isOnlyUse(Op0)) {
+      Instruction *Or = BinaryOperator::CreateOr(X, RHS);
+      InsertNewInstBefore(Or, I);
+      Or->takeName(Op0);
+      return BinaryOperator::CreateXor(Or,
+                 ConstantInt::get(C1->getValue() & ~RHS->getValue()));
+    }
+
+    // Try to fold constant and into select arguments.
+    if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+      if (Instruction *R = FoldOpIntoSelect(I, SI, this))
+        return R;
+    if (isa<PHINode>(Op0))
+      if (Instruction *NV = FoldOpIntoPhi(I))
+        return NV;
+  }
+
+  Value *A = 0, *B = 0;
+  ConstantInt *C1 = 0, *C2 = 0;
+
+  if (match(Op0, m_And(m_Value(A), m_Value(B))))
+    if (A == Op1 || B == Op1)    // (A & ?) | A  --> A
+      return ReplaceInstUsesWith(I, Op1);
+  if (match(Op1, m_And(m_Value(A), m_Value(B))))
+    if (A == Op0 || B == Op0)    // A | (A & ?)  --> A
+      return ReplaceInstUsesWith(I, Op0);
+
+  // (A | B) | C  and  A | (B | C)                  -> bswap if possible.
+  // (A >> B) | (C << D)  and  (A << B) | (B >> C)  -> bswap if possible.
+  if (match(Op0, m_Or(m_Value(), m_Value())) ||
+      match(Op1, m_Or(m_Value(), m_Value())) ||
+      (match(Op0, m_Shift(m_Value(), m_Value())) &&
+       match(Op1, m_Shift(m_Value(), m_Value())))) {
+    if (Instruction *BSwap = MatchBSwap(I))
+      return BSwap;
+  }
+  
+  // (X^C)|Y -> (X|Y)^C iff Y&C == 0
+  if (Op0->hasOneUse() && match(Op0, m_Xor(m_Value(A), m_ConstantInt(C1))) &&
+      MaskedValueIsZero(Op1, C1->getValue())) {
+    Instruction *NOr = BinaryOperator::CreateOr(A, Op1);
+    InsertNewInstBefore(NOr, I);
+    NOr->takeName(Op0);
+    return BinaryOperator::CreateXor(NOr, C1);
+  }
+
+  // Y|(X^C) -> (X|Y)^C iff Y&C == 0
+  if (Op1->hasOneUse() && match(Op1, m_Xor(m_Value(A), m_ConstantInt(C1))) &&
+      MaskedValueIsZero(Op0, C1->getValue())) {
+    Instruction *NOr = BinaryOperator::CreateOr(A, Op0);
+    InsertNewInstBefore(NOr, I);
+    NOr->takeName(Op0);
+    return BinaryOperator::CreateXor(NOr, C1);
+  }
+
+  // (A & C)|(B & D)
+  Value *C = 0, *D = 0;
+  if (match(Op0, m_And(m_Value(A), m_Value(C))) &&
+      match(Op1, m_And(m_Value(B), m_Value(D)))) {
+    Value *V1 = 0, *V2 = 0, *V3 = 0;
+    C1 = dyn_cast<ConstantInt>(C);
+    C2 = dyn_cast<ConstantInt>(D);
+    if (C1 && C2) {  // (A & C1)|(B & C2)
+      // If we have: ((V + N) & C1) | (V & C2)
+      // .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0
+      // replace with V+N.
+      if (C1->getValue() == ~C2->getValue()) {
+        if ((C2->getValue() & (C2->getValue()+1)) == 0 && // C2 == 0+1+
+            match(A, m_Add(m_Value(V1), m_Value(V2)))) {
+          // Add commutes, try both ways.
+          if (V1 == B && MaskedValueIsZero(V2, C2->getValue()))
+            return ReplaceInstUsesWith(I, A);
+          if (V2 == B && MaskedValueIsZero(V1, C2->getValue()))
+            return ReplaceInstUsesWith(I, A);
+        }
+        // Or commutes, try both ways.
+        if ((C1->getValue() & (C1->getValue()+1)) == 0 &&
+            match(B, m_Add(m_Value(V1), m_Value(V2)))) {
+          // Add commutes, try both ways.
+          if (V1 == A && MaskedValueIsZero(V2, C1->getValue()))
+            return ReplaceInstUsesWith(I, B);
+          if (V2 == A && MaskedValueIsZero(V1, C1->getValue()))
+            return ReplaceInstUsesWith(I, B);
+        }
+      }
+      V1 = 0; V2 = 0; V3 = 0;
+    }
+    
+    // Check to see if we have any common things being and'ed.  If so, find the
+    // terms for V1 & (V2|V3).
+    if (isOnlyUse(Op0) || isOnlyUse(Op1)) {
+      if (A == B)      // (A & C)|(A & D) == A & (C|D)
+        V1 = A, V2 = C, V3 = D;
+      else if (A == D) // (A & C)|(B & A) == A & (B|C)
+        V1 = A, V2 = B, V3 = C;
+      else if (C == B) // (A & C)|(C & D) == C & (A|D)
+        V1 = C, V2 = A, V3 = D;
+      else if (C == D) // (A & C)|(B & C) == C & (A|B)
+        V1 = C, V2 = A, V3 = B;
+      
+      if (V1) {
+        Value *Or =
+          InsertNewInstBefore(BinaryOperator::CreateOr(V2, V3, "tmp"), I);
+        return BinaryOperator::CreateAnd(V1, Or);
+      }
+    }
+
+    // (A & (C0?-1:0)) | (B & ~(C0?-1:0)) ->  C0 ? A : B, and commuted variants
+    if (Instruction *Match = MatchSelectFromAndOr(A, B, C, D))
+      return Match;
+    if (Instruction *Match = MatchSelectFromAndOr(B, A, D, C))
+      return Match;
+    if (Instruction *Match = MatchSelectFromAndOr(C, B, A, D))
+      return Match;
+    if (Instruction *Match = MatchSelectFromAndOr(D, A, B, C))
+      return Match;
+
+    // ((A&~B)|(~A&B)) -> A^B
+    if ((match(C, m_Not(m_Specific(D))) &&
+         match(B, m_Not(m_Specific(A)))))
+      return BinaryOperator::CreateXor(A, D);
+    // ((~B&A)|(~A&B)) -> A^B
+    if ((match(A, m_Not(m_Specific(D))) &&
+         match(B, m_Not(m_Specific(C)))))
+      return BinaryOperator::CreateXor(C, D);
+    // ((A&~B)|(B&~A)) -> A^B
+    if ((match(C, m_Not(m_Specific(B))) &&
+         match(D, m_Not(m_Specific(A)))))
+      return BinaryOperator::CreateXor(A, B);
+    // ((~B&A)|(B&~A)) -> A^B
+    if ((match(A, m_Not(m_Specific(B))) &&
+         match(D, m_Not(m_Specific(C)))))
+      return BinaryOperator::CreateXor(C, B);
+  }
+  
+  // (X >> Z) | (Y >> Z)  -> (X|Y) >> Z  for all shifts.
+  if (BinaryOperator *SI1 = dyn_cast<BinaryOperator>(Op1)) {
+    if (BinaryOperator *SI0 = dyn_cast<BinaryOperator>(Op0))
+      if (SI0->isShift() && SI0->getOpcode() == SI1->getOpcode() && 
+          SI0->getOperand(1) == SI1->getOperand(1) &&
+          (SI0->hasOneUse() || SI1->hasOneUse())) {
+        Instruction *NewOp =
+        InsertNewInstBefore(BinaryOperator::CreateOr(SI0->getOperand(0),
+                                                     SI1->getOperand(0),
+                                                     SI0->getName()), I);
+        return BinaryOperator::Create(SI1->getOpcode(), NewOp, 
+                                      SI1->getOperand(1));
+      }
+  }
+
+  // ((A|B)&1)|(B&-2) -> (A&1) | B
+  if (match(Op0, m_And(m_Or(m_Value(A), m_Value(B)), m_Value(C))) ||
+      match(Op0, m_And(m_Value(C), m_Or(m_Value(A), m_Value(B))))) {
+    Instruction *Ret = FoldOrWithConstants(I, Op1, A, B, C);
+    if (Ret) return Ret;
+  }
+  // (B&-2)|((A|B)&1) -> (A&1) | B
+  if (match(Op1, m_And(m_Or(m_Value(A), m_Value(B)), m_Value(C))) ||
+      match(Op1, m_And(m_Value(C), m_Or(m_Value(A), m_Value(B))))) {
+    Instruction *Ret = FoldOrWithConstants(I, Op0, A, B, C);
+    if (Ret) return Ret;
+  }
+
+  if (match(Op0, m_Not(m_Value(A)))) {   // ~A | Op1
+    if (A == Op1)   // ~A | A == -1
+      return ReplaceInstUsesWith(I, Constant::getAllOnesValue(I.getType()));
+  } else {
+    A = 0;
+  }
+  // Note, A is still live here!
+  if (match(Op1, m_Not(m_Value(B)))) {   // Op0 | ~B
+    if (Op0 == B)
+      return ReplaceInstUsesWith(I, Constant::getAllOnesValue(I.getType()));
+
+    // (~A | ~B) == (~(A & B)) - De Morgan's Law
+    if (A && isOnlyUse(Op0) && isOnlyUse(Op1)) {
+      Value *And = InsertNewInstBefore(BinaryOperator::CreateAnd(A, B,
+                                              I.getName()+".demorgan"), I);
+      return BinaryOperator::CreateNot(And);
+    }
+  }
+
+  // (icmp1 A, B) | (icmp2 A, B) --> (icmp3 A, B)
+  if (ICmpInst *RHS = dyn_cast<ICmpInst>(I.getOperand(1))) {
+    if (Instruction *R = AssociativeOpt(I, FoldICmpLogical(*this, RHS)))
+      return R;
+
+    if (ICmpInst *LHS = dyn_cast<ICmpInst>(I.getOperand(0)))
+      if (Instruction *Res = FoldOrOfICmps(I, LHS, RHS))
+        return Res;
+  }
+    
+  // fold (or (cast A), (cast B)) -> (cast (or A, B))
+  if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) {
+    if (CastInst *Op1C = dyn_cast<CastInst>(Op1))
+      if (Op0C->getOpcode() == Op1C->getOpcode()) {// same cast kind ?
+        if (!isa<ICmpInst>(Op0C->getOperand(0)) ||
+            !isa<ICmpInst>(Op1C->getOperand(0))) {
+          const Type *SrcTy = Op0C->getOperand(0)->getType();
+          if (SrcTy == Op1C->getOperand(0)->getType() && SrcTy->isInteger() &&
+              // Only do this if the casts both really cause code to be
+              // generated.
+              ValueRequiresCast(Op0C->getOpcode(), Op0C->getOperand(0), 
+                                I.getType(), TD) &&
+              ValueRequiresCast(Op1C->getOpcode(), Op1C->getOperand(0), 
+                                I.getType(), TD)) {
+            Instruction *NewOp = BinaryOperator::CreateOr(Op0C->getOperand(0),
+                                                          Op1C->getOperand(0),
+                                                          I.getName());
+            InsertNewInstBefore(NewOp, I);
+            return CastInst::Create(Op0C->getOpcode(), NewOp, I.getType());
+          }
+        }
+      }
+  }
+  
+    
+  // (fcmp uno x, c) | (fcmp uno y, c)  -> (fcmp uno x, y)
+  if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0))) {
+    if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1))) {
+      if (LHS->getPredicate() == FCmpInst::FCMP_UNO &&
+          RHS->getPredicate() == FCmpInst::FCMP_UNO && 
+          LHS->getOperand(0)->getType() == RHS->getOperand(0)->getType()) {
+        if (ConstantFP *LHSC = dyn_cast<ConstantFP>(LHS->getOperand(1)))
+          if (ConstantFP *RHSC = dyn_cast<ConstantFP>(RHS->getOperand(1))) {
+            // If either of the constants are nans, then the whole thing returns
+            // true.
+            if (LHSC->getValueAPF().isNaN() || RHSC->getValueAPF().isNaN())
+              return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+            
+            // Otherwise, no need to compare the two constants, compare the
+            // rest.
+            return new FCmpInst(FCmpInst::FCMP_UNO, LHS->getOperand(0),
+                                RHS->getOperand(0));
+          }
+      } else {
+        Value *Op0LHS, *Op0RHS, *Op1LHS, *Op1RHS;
+        FCmpInst::Predicate Op0CC, Op1CC;
+        if (match(Op0, m_FCmp(Op0CC, m_Value(Op0LHS), m_Value(Op0RHS))) &&
+            match(Op1, m_FCmp(Op1CC, m_Value(Op1LHS), m_Value(Op1RHS)))) {
+          if (Op0LHS == Op1RHS && Op0RHS == Op1LHS) {
+            // Swap RHS operands to match LHS.
+            Op1CC = FCmpInst::getSwappedPredicate(Op1CC);
+            std::swap(Op1LHS, Op1RHS);
+          }
+          if (Op0LHS == Op1LHS && Op0RHS == Op1RHS) {
+            // Simplify (fcmp cc0 x, y) | (fcmp cc1 x, y).
+            if (Op0CC == Op1CC)
+              return new FCmpInst((FCmpInst::Predicate)Op0CC, Op0LHS, Op0RHS);
+            else if (Op0CC == FCmpInst::FCMP_TRUE ||
+                     Op1CC == FCmpInst::FCMP_TRUE)
+              return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+            else if (Op0CC == FCmpInst::FCMP_FALSE)
+              return ReplaceInstUsesWith(I, Op1);
+            else if (Op1CC == FCmpInst::FCMP_FALSE)
+              return ReplaceInstUsesWith(I, Op0);
+            bool Op0Ordered;
+            bool Op1Ordered;
+            unsigned Op0Pred = getFCmpCode(Op0CC, Op0Ordered);
+            unsigned Op1Pred = getFCmpCode(Op1CC, Op1Ordered);
+            if (Op0Ordered == Op1Ordered) {
+              // If both are ordered or unordered, return a new fcmp with
+              // or'ed predicates.
+              Value *RV = getFCmpValue(Op0Ordered, Op0Pred|Op1Pred,
+                                       Op0LHS, Op0RHS);
+              if (Instruction *I = dyn_cast<Instruction>(RV))
+                return I;
+              // Otherwise, it's a constant boolean value...
+              return ReplaceInstUsesWith(I, RV);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return Changed ? &I : 0;
+}
+
+namespace {
+
+// XorSelf - Implements: X ^ X --> 0
+struct XorSelf {
+  Value *RHS;
+  XorSelf(Value *rhs) : RHS(rhs) {}
+  bool shouldApply(Value *LHS) const { return LHS == RHS; }
+  Instruction *apply(BinaryOperator &Xor) const {
+    return &Xor;
+  }
+};
+
+}
+
+Instruction *InstCombiner::visitXor(BinaryOperator &I) {
+  bool Changed = SimplifyCommutative(I);
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (isa<UndefValue>(Op1)) {
+    if (isa<UndefValue>(Op0))
+      // Handle undef ^ undef -> 0 special case. This is a common
+      // idiom (misuse).
+      return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+    return ReplaceInstUsesWith(I, Op1);  // X ^ undef -> undef
+  }
+
+  // xor X, X = 0, even if X is nested in a sequence of Xor's.
+  if (Instruction *Result = AssociativeOpt(I, XorSelf(Op1))) {
+    assert(Result == &I && "AssociativeOpt didn't work?"); Result=Result;
+    return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+  }
+  
+  // See if we can simplify any instructions used by the instruction whose sole 
+  // purpose is to compute bits we don't care about.
+  if (!isa<VectorType>(I.getType())) {
+    if (SimplifyDemandedInstructionBits(I))
+      return &I;
+  } else if (isa<ConstantAggregateZero>(Op1)) {
+    return ReplaceInstUsesWith(I, Op0);  // X ^ <0,0> -> X
+  }
+
+  // Is this a ~ operation?
+  if (Value *NotOp = dyn_castNotVal(&I)) {
+    // ~(~X & Y) --> (X | ~Y) - De Morgan's Law
+    // ~(~X | Y) === (X & ~Y) - De Morgan's Law
+    if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(NotOp)) {
+      if (Op0I->getOpcode() == Instruction::And || 
+          Op0I->getOpcode() == Instruction::Or) {
+        if (dyn_castNotVal(Op0I->getOperand(1))) Op0I->swapOperands();
+        if (Value *Op0NotVal = dyn_castNotVal(Op0I->getOperand(0))) {
+          Instruction *NotY =
+            BinaryOperator::CreateNot(Op0I->getOperand(1),
+                                      Op0I->getOperand(1)->getName()+".not");
+          InsertNewInstBefore(NotY, I);
+          if (Op0I->getOpcode() == Instruction::And)
+            return BinaryOperator::CreateOr(Op0NotVal, NotY);
+          else
+            return BinaryOperator::CreateAnd(Op0NotVal, NotY);
+        }
+      }
+    }
+  }
+  
+  
+  if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
+    if (RHS == ConstantInt::getTrue() && Op0->hasOneUse()) {
+      // xor (cmp A, B), true = not (cmp A, B) = !cmp A, B
+      if (ICmpInst *ICI = dyn_cast<ICmpInst>(Op0))
+        return new ICmpInst(ICI->getInversePredicate(),
+                            ICI->getOperand(0), ICI->getOperand(1));
+
+      if (FCmpInst *FCI = dyn_cast<FCmpInst>(Op0))
+        return new FCmpInst(FCI->getInversePredicate(),
+                            FCI->getOperand(0), FCI->getOperand(1));
+    }
+
+    // fold (xor(zext(cmp)), 1) and (xor(sext(cmp)), -1) to ext(!cmp).
+    if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) {
+      if (CmpInst *CI = dyn_cast<CmpInst>(Op0C->getOperand(0))) {
+        if (CI->hasOneUse() && Op0C->hasOneUse()) {
+          Instruction::CastOps Opcode = Op0C->getOpcode();
+          if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
+            if (RHS == ConstantExpr::getCast(Opcode, ConstantInt::getTrue(),
+                                             Op0C->getDestTy())) {
+              Instruction *NewCI = InsertNewInstBefore(CmpInst::Create(
+                                     CI->getOpcode(), CI->getInversePredicate(),
+                                     CI->getOperand(0), CI->getOperand(1)), I);
+              NewCI->takeName(CI);
+              return CastInst::Create(Opcode, NewCI, Op0C->getType());
+            }
+          }
+        }
+      }
+    }
+
+    if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0)) {
+      // ~(c-X) == X-c-1 == X+(-c-1)
+      if (Op0I->getOpcode() == Instruction::Sub && RHS->isAllOnesValue())
+        if (Constant *Op0I0C = dyn_cast<Constant>(Op0I->getOperand(0))) {
+          Constant *NegOp0I0C = ConstantExpr::getNeg(Op0I0C);
+          Constant *ConstantRHS = ConstantExpr::getSub(NegOp0I0C,
+                                              ConstantInt::get(I.getType(), 1));
+          return BinaryOperator::CreateAdd(Op0I->getOperand(1), ConstantRHS);
+        }
+          
+      if (ConstantInt *Op0CI = dyn_cast<ConstantInt>(Op0I->getOperand(1))) {
+        if (Op0I->getOpcode() == Instruction::Add) {
+          // ~(X-c) --> (-c-1)-X
+          if (RHS->isAllOnesValue()) {
+            Constant *NegOp0CI = ConstantExpr::getNeg(Op0CI);
+            return BinaryOperator::CreateSub(
+                           ConstantExpr::getSub(NegOp0CI,
+                                             ConstantInt::get(I.getType(), 1)),
+                                          Op0I->getOperand(0));
+          } else if (RHS->getValue().isSignBit()) {
+            // (X + C) ^ signbit -> (X + C + signbit)
+            Constant *C = ConstantInt::get(RHS->getValue() + Op0CI->getValue());
+            return BinaryOperator::CreateAdd(Op0I->getOperand(0), C);
+
+          }
+        } else if (Op0I->getOpcode() == Instruction::Or) {
+          // (X|C1)^C2 -> X^(C1|C2) iff X&~C1 == 0
+          if (MaskedValueIsZero(Op0I->getOperand(0), Op0CI->getValue())) {
+            Constant *NewRHS = ConstantExpr::getOr(Op0CI, RHS);
+            // Anything in both C1 and C2 is known to be zero, remove it from
+            // NewRHS.
+            Constant *CommonBits = And(Op0CI, RHS);
+            NewRHS = ConstantExpr::getAnd(NewRHS, 
+                                          ConstantExpr::getNot(CommonBits));
+            AddToWorkList(Op0I);
+            I.setOperand(0, Op0I->getOperand(0));
+            I.setOperand(1, NewRHS);
+            return &I;
+          }
+        }
+      }
+    }
+
+    // Try to fold constant and into select arguments.
+    if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+      if (Instruction *R = FoldOpIntoSelect(I, SI, this))
+        return R;
+    if (isa<PHINode>(Op0))
+      if (Instruction *NV = FoldOpIntoPhi(I))
+        return NV;
+  }
+
+  if (Value *X = dyn_castNotVal(Op0))   // ~A ^ A == -1
+    if (X == Op1)
+      return ReplaceInstUsesWith(I, Constant::getAllOnesValue(I.getType()));
+
+  if (Value *X = dyn_castNotVal(Op1))   // A ^ ~A == -1
+    if (X == Op0)
+      return ReplaceInstUsesWith(I, Constant::getAllOnesValue(I.getType()));
+
+  
+  BinaryOperator *Op1I = dyn_cast<BinaryOperator>(Op1);
+  if (Op1I) {
+    Value *A, *B;
+    if (match(Op1I, m_Or(m_Value(A), m_Value(B)))) {
+      if (A == Op0) {              // B^(B|A) == (A|B)^B
+        Op1I->swapOperands();
+        I.swapOperands();
+        std::swap(Op0, Op1);
+      } else if (B == Op0) {       // B^(A|B) == (A|B)^B
+        I.swapOperands();     // Simplified below.
+        std::swap(Op0, Op1);
+      }
+    } else if (match(Op1I, m_Xor(m_Specific(Op0), m_Value(B)))) {
+      return ReplaceInstUsesWith(I, B);                      // A^(A^B) == B
+    } else if (match(Op1I, m_Xor(m_Value(A), m_Specific(Op0)))) {
+      return ReplaceInstUsesWith(I, A);                      // A^(B^A) == B
+    } else if (match(Op1I, m_And(m_Value(A), m_Value(B))) && Op1I->hasOneUse()){
+      if (A == Op0) {                                      // A^(A&B) -> A^(B&A)
+        Op1I->swapOperands();
+        std::swap(A, B);
+      }
+      if (B == Op0) {                                      // A^(B&A) -> (B&A)^A
+        I.swapOperands();     // Simplified below.
+        std::swap(Op0, Op1);
+      }
+    }
+  }
+  
+  BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0);
+  if (Op0I) {
+    Value *A, *B;
+    if (match(Op0I, m_Or(m_Value(A), m_Value(B))) && Op0I->hasOneUse()) {
+      if (A == Op1)                                  // (B|A)^B == (A|B)^B
+        std::swap(A, B);
+      if (B == Op1) {                                // (A|B)^B == A & ~B
+        Instruction *NotB =
+          InsertNewInstBefore(BinaryOperator::CreateNot(Op1, "tmp"), I);
+        return BinaryOperator::CreateAnd(A, NotB);
+      }
+    } else if (match(Op0I, m_Xor(m_Specific(Op1), m_Value(B)))) {
+      return ReplaceInstUsesWith(I, B);                      // (A^B)^A == B
+    } else if (match(Op0I, m_Xor(m_Value(A), m_Specific(Op1)))) {
+      return ReplaceInstUsesWith(I, A);                      // (B^A)^A == B
+    } else if (match(Op0I, m_And(m_Value(A), m_Value(B))) && Op0I->hasOneUse()){
+      if (A == Op1)                                        // (A&B)^A -> (B&A)^A
+        std::swap(A, B);
+      if (B == Op1 &&                                      // (B&A)^A == ~B & A
+          !isa<ConstantInt>(Op1)) {  // Canonical form is (B&C)^C
+        Instruction *N =
+          InsertNewInstBefore(BinaryOperator::CreateNot(A, "tmp"), I);
+        return BinaryOperator::CreateAnd(N, Op1);
+      }
+    }
+  }
+  
+  // (X >> Z) ^ (Y >> Z)  -> (X^Y) >> Z  for all shifts.
+  if (Op0I && Op1I && Op0I->isShift() && 
+      Op0I->getOpcode() == Op1I->getOpcode() && 
+      Op0I->getOperand(1) == Op1I->getOperand(1) &&
+      (Op1I->hasOneUse() || Op1I->hasOneUse())) {
+    Instruction *NewOp =
+      InsertNewInstBefore(BinaryOperator::CreateXor(Op0I->getOperand(0),
+                                                    Op1I->getOperand(0),
+                                                    Op0I->getName()), I);
+    return BinaryOperator::Create(Op1I->getOpcode(), NewOp, 
+                                  Op1I->getOperand(1));
+  }
+    
+  if (Op0I && Op1I) {
+    Value *A, *B, *C, *D;
+    // (A & B)^(A | B) -> A ^ B
+    if (match(Op0I, m_And(m_Value(A), m_Value(B))) &&
+        match(Op1I, m_Or(m_Value(C), m_Value(D)))) {
+      if ((A == C && B == D) || (A == D && B == C)) 
+        return BinaryOperator::CreateXor(A, B);
+    }
+    // (A | B)^(A & B) -> A ^ B
+    if (match(Op0I, m_Or(m_Value(A), m_Value(B))) &&
+        match(Op1I, m_And(m_Value(C), m_Value(D)))) {
+      if ((A == C && B == D) || (A == D && B == C)) 
+        return BinaryOperator::CreateXor(A, B);
+    }
+    
+    // (A & B)^(C & D)
+    if ((Op0I->hasOneUse() || Op1I->hasOneUse()) &&
+        match(Op0I, m_And(m_Value(A), m_Value(B))) &&
+        match(Op1I, m_And(m_Value(C), m_Value(D)))) {
+      // (X & Y)^(X & Y) -> (Y^Z) & X
+      Value *X = 0, *Y = 0, *Z = 0;
+      if (A == C)
+        X = A, Y = B, Z = D;
+      else if (A == D)
+        X = A, Y = B, Z = C;
+      else if (B == C)
+        X = B, Y = A, Z = D;
+      else if (B == D)
+        X = B, Y = A, Z = C;
+      
+      if (X) {
+        Instruction *NewOp =
+        InsertNewInstBefore(BinaryOperator::CreateXor(Y, Z, Op0->getName()), I);
+        return BinaryOperator::CreateAnd(NewOp, X);
+      }
+    }
+  }
+    
+  // (icmp1 A, B) ^ (icmp2 A, B) --> (icmp3 A, B)
+  if (ICmpInst *RHS = dyn_cast<ICmpInst>(I.getOperand(1)))
+    if (Instruction *R = AssociativeOpt(I, FoldICmpLogical(*this, RHS)))
+      return R;
+
+  // fold (xor (cast A), (cast B)) -> (cast (xor A, B))
+  if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) {
+    if (CastInst *Op1C = dyn_cast<CastInst>(Op1))
+      if (Op0C->getOpcode() == Op1C->getOpcode()) { // same cast kind?
+        const Type *SrcTy = Op0C->getOperand(0)->getType();
+        if (SrcTy == Op1C->getOperand(0)->getType() && SrcTy->isInteger() &&
+            // Only do this if the casts both really cause code to be generated.
+            ValueRequiresCast(Op0C->getOpcode(), Op0C->getOperand(0), 
+                              I.getType(), TD) &&
+            ValueRequiresCast(Op1C->getOpcode(), Op1C->getOperand(0), 
+                              I.getType(), TD)) {
+          Instruction *NewOp = BinaryOperator::CreateXor(Op0C->getOperand(0),
+                                                         Op1C->getOperand(0),
+                                                         I.getName());
+          InsertNewInstBefore(NewOp, I);
+          return CastInst::Create(Op0C->getOpcode(), NewOp, I.getType());
+        }
+      }
+  }
+
+  return Changed ? &I : 0;
+}
+
+/// AddWithOverflow - Compute Result = In1+In2, returning true if the result
+/// overflowed for this type.
+static bool AddWithOverflow(ConstantInt *&Result, ConstantInt *In1,
+                            ConstantInt *In2, bool IsSigned = false) {
+  Result = cast<ConstantInt>(Add(In1, In2));
+
+  if (IsSigned)
+    if (In2->getValue().isNegative())
+      return Result->getValue().sgt(In1->getValue());
+    else
+      return Result->getValue().slt(In1->getValue());
+  else
+    return Result->getValue().ult(In1->getValue());
+}
+
+/// SubWithOverflow - Compute Result = In1-In2, returning true if the result
+/// overflowed for this type.
+static bool SubWithOverflow(ConstantInt *&Result, ConstantInt *In1,
+                            ConstantInt *In2, bool IsSigned = false) {
+  Result = cast<ConstantInt>(Subtract(In1, In2));
+
+  if (IsSigned)
+    if (In2->getValue().isNegative())
+      return Result->getValue().slt(In1->getValue());
+    else
+      return Result->getValue().sgt(In1->getValue());
+  else
+    return Result->getValue().ugt(In1->getValue());
+}
+
+/// EmitGEPOffset - Given a getelementptr instruction/constantexpr, emit the
+/// code necessary to compute the offset from the base pointer (without adding
+/// in the base pointer).  Return the result as a signed integer of intptr size.
+static Value *EmitGEPOffset(User *GEP, Instruction &I, InstCombiner &IC) {
+  TargetData &TD = IC.getTargetData();
+  gep_type_iterator GTI = gep_type_begin(GEP);
+  const Type *IntPtrTy = TD.getIntPtrType();
+  Value *Result = Constant::getNullValue(IntPtrTy);
+
+  // Build a mask for high order bits.
+  unsigned IntPtrWidth = TD.getPointerSizeInBits();
+  uint64_t PtrSizeMask = ~0ULL >> (64-IntPtrWidth);
+
+  for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end(); i != e;
+       ++i, ++GTI) {
+    Value *Op = *i;
+    uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType()) & PtrSizeMask;
+    if (ConstantInt *OpC = dyn_cast<ConstantInt>(Op)) {
+      if (OpC->isZero()) continue;
+      
+      // Handle a struct index, which adds its field offset to the pointer.
+      if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+        Size = TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue());
+        
+        if (ConstantInt *RC = dyn_cast<ConstantInt>(Result))
+          Result = ConstantInt::get(RC->getValue() + APInt(IntPtrWidth, Size));
+        else
+          Result = IC.InsertNewInstBefore(
+                   BinaryOperator::CreateAdd(Result,
+                                             ConstantInt::get(IntPtrTy, Size),
+                                             GEP->getName()+".offs"), I);
+        continue;
+      }
+      
+      Constant *Scale = ConstantInt::get(IntPtrTy, Size);
+      Constant *OC = ConstantExpr::getIntegerCast(OpC, IntPtrTy, true /*SExt*/);
+      Scale = ConstantExpr::getMul(OC, Scale);
+      if (Constant *RC = dyn_cast<Constant>(Result))
+        Result = ConstantExpr::getAdd(RC, Scale);
+      else {
+        // Emit an add instruction.
+        Result = IC.InsertNewInstBefore(
+           BinaryOperator::CreateAdd(Result, Scale,
+                                     GEP->getName()+".offs"), I);
+      }
+      continue;
+    }
+    // Convert to correct type.
+    if (Op->getType() != IntPtrTy) {
+      if (Constant *OpC = dyn_cast<Constant>(Op))
+        Op = ConstantExpr::getIntegerCast(OpC, IntPtrTy, true);
+      else
+        Op = IC.InsertNewInstBefore(CastInst::CreateIntegerCast(Op, IntPtrTy,
+                                                                true,
+                                                      Op->getName()+".c"), I);
+    }
+    if (Size != 1) {
+      Constant *Scale = ConstantInt::get(IntPtrTy, Size);
+      if (Constant *OpC = dyn_cast<Constant>(Op))
+        Op = ConstantExpr::getMul(OpC, Scale);
+      else    // We'll let instcombine(mul) convert this to a shl if possible.
+        Op = IC.InsertNewInstBefore(BinaryOperator::CreateMul(Op, Scale,
+                                                  GEP->getName()+".idx"), I);
+    }
+
+    // Emit an add instruction.
+    if (isa<Constant>(Op) && isa<Constant>(Result))
+      Result = ConstantExpr::getAdd(cast<Constant>(Op),
+                                    cast<Constant>(Result));
+    else
+      Result = IC.InsertNewInstBefore(BinaryOperator::CreateAdd(Op, Result,
+                                                  GEP->getName()+".offs"), I);
+  }
+  return Result;
+}
+
+
+/// EvaluateGEPOffsetExpression - Return an value that can be used to compare of
+/// the *offset* implied by GEP to zero.  For example, if we have &A[i], we want
+/// to return 'i' for "icmp ne i, 0".  Note that, in general, indices can be
+/// complex, and scales are involved.  The above expression would also be legal
+/// to codegen as "icmp ne (i*4), 0" (assuming A is a pointer to i32).  This
+/// later form is less amenable to optimization though, and we are allowed to
+/// generate the first by knowing that pointer arithmetic doesn't overflow.
+///
+/// If we can't emit an optimized form for this expression, this returns null.
+/// 
+static Value *EvaluateGEPOffsetExpression(User *GEP, Instruction &I,
+                                          InstCombiner &IC) {
+  TargetData &TD = IC.getTargetData();
+  gep_type_iterator GTI = gep_type_begin(GEP);
+
+  // Check to see if this gep only has a single variable index.  If so, and if
+  // any constant indices are a multiple of its scale, then we can compute this
+  // in terms of the scale of the variable index.  For example, if the GEP
+  // implies an offset of "12 + i*4", then we can codegen this as "3 + i",
+  // because the expression will cross zero at the same point.
+  unsigned i, e = GEP->getNumOperands();
+  int64_t Offset = 0;
+  for (i = 1; i != e; ++i, ++GTI) {
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i))) {
+      // Compute the aggregate offset of constant indices.
+      if (CI->isZero()) continue;
+
+      // Handle a struct index, which adds its field offset to the pointer.
+      if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+        Offset += TD.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
+      } else {
+        uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType());
+        Offset += Size*CI->getSExtValue();
+      }
+    } else {
+      // Found our variable index.
+      break;
+    }
+  }
+  
+  // If there are no variable indices, we must have a constant offset, just
+  // evaluate it the general way.
+  if (i == e) return 0;
+  
+  Value *VariableIdx = GEP->getOperand(i);
+  // Determine the scale factor of the variable element.  For example, this is
+  // 4 if the variable index is into an array of i32.
+  uint64_t VariableScale = TD.getTypeAllocSize(GTI.getIndexedType());
+  
+  // Verify that there are no other variable indices.  If so, emit the hard way.
+  for (++i, ++GTI; i != e; ++i, ++GTI) {
+    ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i));
+    if (!CI) return 0;
+   
+    // Compute the aggregate offset of constant indices.
+    if (CI->isZero()) continue;
+    
+    // Handle a struct index, which adds its field offset to the pointer.
+    if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+      Offset += TD.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
+    } else {
+      uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType());
+      Offset += Size*CI->getSExtValue();
+    }
+  }
+  
+  // Okay, we know we have a single variable index, which must be a
+  // pointer/array/vector index.  If there is no offset, life is simple, return
+  // the index.
+  unsigned IntPtrWidth = TD.getPointerSizeInBits();
+  if (Offset == 0) {
+    // Cast to intptrty in case a truncation occurs.  If an extension is needed,
+    // we don't need to bother extending: the extension won't affect where the
+    // computation crosses zero.
+    if (VariableIdx->getType()->getPrimitiveSizeInBits() > IntPtrWidth)
+      VariableIdx = new TruncInst(VariableIdx, TD.getIntPtrType(),
+                                  VariableIdx->getNameStart(), &I);
+    return VariableIdx;
+  }
+  
+  // Otherwise, there is an index.  The computation we will do will be modulo
+  // the pointer size, so get it.
+  uint64_t PtrSizeMask = ~0ULL >> (64-IntPtrWidth);
+  
+  Offset &= PtrSizeMask;
+  VariableScale &= PtrSizeMask;
+
+  // To do this transformation, any constant index must be a multiple of the
+  // variable scale factor.  For example, we can evaluate "12 + 4*i" as "3 + i",
+  // but we can't evaluate "10 + 3*i" in terms of i.  Check that the offset is a
+  // multiple of the variable scale.
+  int64_t NewOffs = Offset / (int64_t)VariableScale;
+  if (Offset != NewOffs*(int64_t)VariableScale)
+    return 0;
+
+  // Okay, we can do this evaluation.  Start by converting the index to intptr.
+  const Type *IntPtrTy = TD.getIntPtrType();
+  if (VariableIdx->getType() != IntPtrTy)
+    VariableIdx = CastInst::CreateIntegerCast(VariableIdx, IntPtrTy,
+                                              true /*SExt*/, 
+                                              VariableIdx->getNameStart(), &I);
+  Constant *OffsetVal = ConstantInt::get(IntPtrTy, NewOffs);
+  return BinaryOperator::CreateAdd(VariableIdx, OffsetVal, "offset", &I);
+}
+
+
+/// FoldGEPICmp - Fold comparisons between a GEP instruction and something
+/// else.  At this point we know that the GEP is on the LHS of the comparison.
+Instruction *InstCombiner::FoldGEPICmp(User *GEPLHS, Value *RHS,
+                                       ICmpInst::Predicate Cond,
+                                       Instruction &I) {
+  assert(dyn_castGetElementPtr(GEPLHS) && "LHS is not a getelementptr!");
+
+  // Look through bitcasts.
+  if (BitCastInst *BCI = dyn_cast<BitCastInst>(RHS))
+    RHS = BCI->getOperand(0);
+
+  Value *PtrBase = GEPLHS->getOperand(0);
+  if (PtrBase == RHS) {
+    // ((gep Ptr, OFFSET) cmp Ptr)   ---> (OFFSET cmp 0).
+    // This transformation (ignoring the base and scales) is valid because we
+    // know pointers can't overflow.  See if we can output an optimized form.
+    Value *Offset = EvaluateGEPOffsetExpression(GEPLHS, I, *this);
+    
+    // If not, synthesize the offset the hard way.
+    if (Offset == 0)
+      Offset = EmitGEPOffset(GEPLHS, I, *this);
+    return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Offset,
+                        Constant::getNullValue(Offset->getType()));
+  } else if (User *GEPRHS = dyn_castGetElementPtr(RHS)) {
+    // If the base pointers are different, but the indices are the same, just
+    // compare the base pointer.
+    if (PtrBase != GEPRHS->getOperand(0)) {
+      bool IndicesTheSame = GEPLHS->getNumOperands()==GEPRHS->getNumOperands();
+      IndicesTheSame &= GEPLHS->getOperand(0)->getType() ==
+                        GEPRHS->getOperand(0)->getType();
+      if (IndicesTheSame)
+        for (unsigned i = 1, e = GEPLHS->getNumOperands(); i != e; ++i)
+          if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) {
+            IndicesTheSame = false;
+            break;
+          }
+
+      // If all indices are the same, just compare the base pointers.
+      if (IndicesTheSame)
+        return new ICmpInst(ICmpInst::getSignedPredicate(Cond), 
+                            GEPLHS->getOperand(0), GEPRHS->getOperand(0));
+
+      // Otherwise, the base pointers are different and the indices are
+      // different, bail out.
+      return 0;
+    }
+
+    // If one of the GEPs has all zero indices, recurse.
+    bool AllZeros = true;
+    for (unsigned i = 1, e = GEPLHS->getNumOperands(); i != e; ++i)
+      if (!isa<Constant>(GEPLHS->getOperand(i)) ||
+          !cast<Constant>(GEPLHS->getOperand(i))->isNullValue()) {
+        AllZeros = false;
+        break;
+      }
+    if (AllZeros)
+      return FoldGEPICmp(GEPRHS, GEPLHS->getOperand(0),
+                          ICmpInst::getSwappedPredicate(Cond), I);
+
+    // If the other GEP has all zero indices, recurse.
+    AllZeros = true;
+    for (unsigned i = 1, e = GEPRHS->getNumOperands(); i != e; ++i)
+      if (!isa<Constant>(GEPRHS->getOperand(i)) ||
+          !cast<Constant>(GEPRHS->getOperand(i))->isNullValue()) {
+        AllZeros = false;
+        break;
+      }
+    if (AllZeros)
+      return FoldGEPICmp(GEPLHS, GEPRHS->getOperand(0), Cond, I);
+
+    if (GEPLHS->getNumOperands() == GEPRHS->getNumOperands()) {
+      // If the GEPs only differ by one index, compare it.
+      unsigned NumDifferences = 0;  // Keep track of # differences.
+      unsigned DiffOperand = 0;     // The operand that differs.
+      for (unsigned i = 1, e = GEPRHS->getNumOperands(); i != e; ++i)
+        if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) {
+          if (GEPLHS->getOperand(i)->getType()->getPrimitiveSizeInBits() !=
+                   GEPRHS->getOperand(i)->getType()->getPrimitiveSizeInBits()) {
+            // Irreconcilable differences.
+            NumDifferences = 2;
+            break;
+          } else {
+            if (NumDifferences++) break;
+            DiffOperand = i;
+          }
+        }
+
+      if (NumDifferences == 0)   // SAME GEP?
+        return ReplaceInstUsesWith(I, // No comparison is needed here.
+                                   ConstantInt::get(Type::Int1Ty,
+                                             ICmpInst::isTrueWhenEqual(Cond)));
+
+      else if (NumDifferences == 1) {
+        Value *LHSV = GEPLHS->getOperand(DiffOperand);
+        Value *RHSV = GEPRHS->getOperand(DiffOperand);
+        // Make sure we do a signed comparison here.
+        return new ICmpInst(ICmpInst::getSignedPredicate(Cond), LHSV, RHSV);
+      }
+    }
+
+    // Only lower this if the icmp is the only user of the GEP or if we expect
+    // the result to fold to a constant!
+    if ((isa<ConstantExpr>(GEPLHS) || GEPLHS->hasOneUse()) &&
+        (isa<ConstantExpr>(GEPRHS) || GEPRHS->hasOneUse())) {
+      // ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2)  --->  (OFFSET1 cmp OFFSET2)
+      Value *L = EmitGEPOffset(GEPLHS, I, *this);
+      Value *R = EmitGEPOffset(GEPRHS, I, *this);
+      return new ICmpInst(ICmpInst::getSignedPredicate(Cond), L, R);
+    }
+  }
+  return 0;
+}
+
+/// FoldFCmp_IntToFP_Cst - Fold fcmp ([us]itofp x, cst) if possible.
+///
+Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
+                                                Instruction *LHSI,
+                                                Constant *RHSC) {
+  if (!isa<ConstantFP>(RHSC)) return 0;
+  const APFloat &RHS = cast<ConstantFP>(RHSC)->getValueAPF();
+  
+  // Get the width of the mantissa.  We don't want to hack on conversions that
+  // might lose information from the integer, e.g. "i64 -> float"
+  int MantissaWidth = LHSI->getType()->getFPMantissaWidth();
+  if (MantissaWidth == -1) return 0;  // Unknown.
+  
+  // Check to see that the input is converted from an integer type that is small
+  // enough that preserves all bits.  TODO: check here for "known" sign bits.
+  // This would allow us to handle (fptosi (x >>s 62) to float) if x is i64 f.e.
+  unsigned InputSize = LHSI->getOperand(0)->getType()->getPrimitiveSizeInBits();
+  
+  // If this is a uitofp instruction, we need an extra bit to hold the sign.
+  bool LHSUnsigned = isa<UIToFPInst>(LHSI);
+  if (LHSUnsigned)
+    ++InputSize;
+  
+  // If the conversion would lose info, don't hack on this.
+  if ((int)InputSize > MantissaWidth)
+    return 0;
+  
+  // Otherwise, we can potentially simplify the comparison.  We know that it
+  // will always come through as an integer value and we know the constant is
+  // not a NAN (it would have been previously simplified).
+  assert(!RHS.isNaN() && "NaN comparison not already folded!");
+  
+  ICmpInst::Predicate Pred;
+  switch (I.getPredicate()) {
+  default: assert(0 && "Unexpected predicate!");
+  case FCmpInst::FCMP_UEQ:
+  case FCmpInst::FCMP_OEQ:
+    Pred = ICmpInst::ICMP_EQ;
+    break;
+  case FCmpInst::FCMP_UGT:
+  case FCmpInst::FCMP_OGT:
+    Pred = LHSUnsigned ? ICmpInst::ICMP_UGT : ICmpInst::ICMP_SGT;
+    break;
+  case FCmpInst::FCMP_UGE:
+  case FCmpInst::FCMP_OGE:
+    Pred = LHSUnsigned ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_SGE;
+    break;
+  case FCmpInst::FCMP_ULT:
+  case FCmpInst::FCMP_OLT:
+    Pred = LHSUnsigned ? ICmpInst::ICMP_ULT : ICmpInst::ICMP_SLT;
+    break;
+  case FCmpInst::FCMP_ULE:
+  case FCmpInst::FCMP_OLE:
+    Pred = LHSUnsigned ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_SLE;
+    break;
+  case FCmpInst::FCMP_UNE:
+  case FCmpInst::FCMP_ONE:
+    Pred = ICmpInst::ICMP_NE;
+    break;
+  case FCmpInst::FCMP_ORD:
+    return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+  case FCmpInst::FCMP_UNO:
+    return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+  }
+  
+  const IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType());
+  
+  // Now we know that the APFloat is a normal number, zero or inf.
+  
+  // See if the FP constant is too large for the integer.  For example,
+  // comparing an i8 to 300.0.
+  unsigned IntWidth = IntTy->getPrimitiveSizeInBits();
+  
+  if (!LHSUnsigned) {
+    // If the RHS value is > SignedMax, fold the comparison.  This handles +INF
+    // and large values.
+    APFloat SMax(RHS.getSemantics(), APFloat::fcZero, false);
+    SMax.convertFromAPInt(APInt::getSignedMaxValue(IntWidth), true,
+                          APFloat::rmNearestTiesToEven);
+    if (SMax.compare(RHS) == APFloat::cmpLessThan) {  // smax < 13123.0
+      if (Pred == ICmpInst::ICMP_NE  || Pred == ICmpInst::ICMP_SLT ||
+          Pred == ICmpInst::ICMP_SLE)
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+      return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+    }
+  } else {
+    // If the RHS value is > UnsignedMax, fold the comparison. This handles
+    // +INF and large values.
+    APFloat UMax(RHS.getSemantics(), APFloat::fcZero, false);
+    UMax.convertFromAPInt(APInt::getMaxValue(IntWidth), false,
+                          APFloat::rmNearestTiesToEven);
+    if (UMax.compare(RHS) == APFloat::cmpLessThan) {  // umax < 13123.0
+      if (Pred == ICmpInst::ICMP_NE  || Pred == ICmpInst::ICMP_ULT ||
+          Pred == ICmpInst::ICMP_ULE)
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+      return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+    }
+  }
+  
+  if (!LHSUnsigned) {
+    // See if the RHS value is < SignedMin.
+    APFloat SMin(RHS.getSemantics(), APFloat::fcZero, false);
+    SMin.convertFromAPInt(APInt::getSignedMinValue(IntWidth), true,
+                          APFloat::rmNearestTiesToEven);
+    if (SMin.compare(RHS) == APFloat::cmpGreaterThan) { // smin > 12312.0
+      if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SGT ||
+          Pred == ICmpInst::ICMP_SGE)
+        return ReplaceInstUsesWith(I,ConstantInt::getTrue());
+      return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+    }
+  }
+
+  // Okay, now we know that the FP constant fits in the range [SMIN, SMAX] or
+  // [0, UMAX], but it may still be fractional.  See if it is fractional by
+  // casting the FP value to the integer value and back, checking for equality.
+  // Don't do this for zero, because -0.0 is not fractional.
+  Constant *RHSInt = LHSUnsigned
+    ? ConstantExpr::getFPToUI(RHSC, IntTy)
+    : ConstantExpr::getFPToSI(RHSC, IntTy);
+  if (!RHS.isZero()) {
+    bool Equal = LHSUnsigned
+      ? ConstantExpr::getUIToFP(RHSInt, RHSC->getType()) == RHSC
+      : ConstantExpr::getSIToFP(RHSInt, RHSC->getType()) == RHSC;
+    if (!Equal) {
+      // If we had a comparison against a fractional value, we have to adjust
+      // the compare predicate and sometimes the value.  RHSC is rounded towards
+      // zero at this point.
+      switch (Pred) {
+      default: assert(0 && "Unexpected integer comparison!");
+      case ICmpInst::ICMP_NE:  // (float)int != 4.4   --> true
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+      case ICmpInst::ICMP_EQ:  // (float)int == 4.4   --> false
+        return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+      case ICmpInst::ICMP_ULE:
+        // (float)int <= 4.4   --> int <= 4
+        // (float)int <= -4.4  --> false
+        if (RHS.isNegative())
+          return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+        break;
+      case ICmpInst::ICMP_SLE:
+        // (float)int <= 4.4   --> int <= 4
+        // (float)int <= -4.4  --> int < -4
+        if (RHS.isNegative())
+          Pred = ICmpInst::ICMP_SLT;
+        break;
+      case ICmpInst::ICMP_ULT:
+        // (float)int < -4.4   --> false
+        // (float)int < 4.4    --> int <= 4
+        if (RHS.isNegative())
+          return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+        Pred = ICmpInst::ICMP_ULE;
+        break;
+      case ICmpInst::ICMP_SLT:
+        // (float)int < -4.4   --> int < -4
+        // (float)int < 4.4    --> int <= 4
+        if (!RHS.isNegative())
+          Pred = ICmpInst::ICMP_SLE;
+        break;
+      case ICmpInst::ICMP_UGT:
+        // (float)int > 4.4    --> int > 4
+        // (float)int > -4.4   --> true
+        if (RHS.isNegative())
+          return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+        break;
+      case ICmpInst::ICMP_SGT:
+        // (float)int > 4.4    --> int > 4
+        // (float)int > -4.4   --> int >= -4
+        if (RHS.isNegative())
+          Pred = ICmpInst::ICMP_SGE;
+        break;
+      case ICmpInst::ICMP_UGE:
+        // (float)int >= -4.4   --> true
+        // (float)int >= 4.4    --> int > 4
+        if (!RHS.isNegative())
+          return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+        Pred = ICmpInst::ICMP_UGT;
+        break;
+      case ICmpInst::ICMP_SGE:
+        // (float)int >= -4.4   --> int >= -4
+        // (float)int >= 4.4    --> int > 4
+        if (!RHS.isNegative())
+          Pred = ICmpInst::ICMP_SGT;
+        break;
+      }
+    }
+  }
+
+  // Lower this FP comparison into an appropriate integer version of the
+  // comparison.
+  return new ICmpInst(Pred, LHSI->getOperand(0), RHSInt);
+}
+
+Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
+  bool Changed = SimplifyCompare(I);
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  // Fold trivial predicates.
+  if (I.getPredicate() == FCmpInst::FCMP_FALSE)
+    return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+  if (I.getPredicate() == FCmpInst::FCMP_TRUE)
+    return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+  
+  // Simplify 'fcmp pred X, X'
+  if (Op0 == Op1) {
+    switch (I.getPredicate()) {
+    default: assert(0 && "Unknown predicate!");
+    case FCmpInst::FCMP_UEQ:    // True if unordered or equal
+    case FCmpInst::FCMP_UGE:    // True if unordered, greater than, or equal
+    case FCmpInst::FCMP_ULE:    // True if unordered, less than, or equal
+      return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+    case FCmpInst::FCMP_OGT:    // True if ordered and greater than
+    case FCmpInst::FCMP_OLT:    // True if ordered and less than
+    case FCmpInst::FCMP_ONE:    // True if ordered and operands are unequal
+      return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+      
+    case FCmpInst::FCMP_UNO:    // True if unordered: isnan(X) | isnan(Y)
+    case FCmpInst::FCMP_ULT:    // True if unordered or less than
+    case FCmpInst::FCMP_UGT:    // True if unordered or greater than
+    case FCmpInst::FCMP_UNE:    // True if unordered or not equal
+      // Canonicalize these to be 'fcmp uno %X, 0.0'.
+      I.setPredicate(FCmpInst::FCMP_UNO);
+      I.setOperand(1, Constant::getNullValue(Op0->getType()));
+      return &I;
+      
+    case FCmpInst::FCMP_ORD:    // True if ordered (no nans)
+    case FCmpInst::FCMP_OEQ:    // True if ordered and equal
+    case FCmpInst::FCMP_OGE:    // True if ordered and greater than or equal
+    case FCmpInst::FCMP_OLE:    // True if ordered and less than or equal
+      // Canonicalize these to be 'fcmp ord %X, 0.0'.
+      I.setPredicate(FCmpInst::FCMP_ORD);
+      I.setOperand(1, Constant::getNullValue(Op0->getType()));
+      return &I;
+    }
+  }
+    
+  if (isa<UndefValue>(Op1))                  // fcmp pred X, undef -> undef
+    return ReplaceInstUsesWith(I, UndefValue::get(Type::Int1Ty));
+
+  // Handle fcmp with constant RHS
+  if (Constant *RHSC = dyn_cast<Constant>(Op1)) {
+    // If the constant is a nan, see if we can fold the comparison based on it.
+    if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHSC)) {
+      if (CFP->getValueAPF().isNaN()) {
+        if (FCmpInst::isOrdered(I.getPredicate()))   // True if ordered and...
+          return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+        assert(FCmpInst::isUnordered(I.getPredicate()) &&
+               "Comparison must be either ordered or unordered!");
+        // True if unordered.
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+      }
+    }
+    
+    if (Instruction *LHSI = dyn_cast<Instruction>(Op0))
+      switch (LHSI->getOpcode()) {
+      case Instruction::PHI:
+        // Only fold fcmp into the PHI if the phi and fcmp are in the same
+        // block.  If in the same block, we're encouraging jump threading.  If
+        // not, we are just pessimizing the code by making an i1 phi.
+        if (LHSI->getParent() == I.getParent())
+          if (Instruction *NV = FoldOpIntoPhi(I))
+            return NV;
+        break;
+      case Instruction::SIToFP:
+      case Instruction::UIToFP:
+        if (Instruction *NV = FoldFCmp_IntToFP_Cst(I, LHSI, RHSC))
+          return NV;
+        break;
+      case Instruction::Select:
+        // If either operand of the select is a constant, we can fold the
+        // comparison into the select arms, which will cause one to be
+        // constant folded and the select turned into a bitwise or.
+        Value *Op1 = 0, *Op2 = 0;
+        if (LHSI->hasOneUse()) {
+          if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1))) {
+            // Fold the known value into the constant operand.
+            Op1 = ConstantExpr::getCompare(I.getPredicate(), C, RHSC);
+            // Insert a new FCmp of the other select operand.
+            Op2 = InsertNewInstBefore(new FCmpInst(I.getPredicate(),
+                                                      LHSI->getOperand(2), RHSC,
+                                                      I.getName()), I);
+          } else if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(2))) {
+            // Fold the known value into the constant operand.
+            Op2 = ConstantExpr::getCompare(I.getPredicate(), C, RHSC);
+            // Insert a new FCmp of the other select operand.
+            Op1 = InsertNewInstBefore(new FCmpInst(I.getPredicate(),
+                                                      LHSI->getOperand(1), RHSC,
+                                                      I.getName()), I);
+          }
+        }
+
+        if (Op1)
+          return SelectInst::Create(LHSI->getOperand(0), Op1, Op2);
+        break;
+      }
+  }
+
+  return Changed ? &I : 0;
+}
+
+Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
+  bool Changed = SimplifyCompare(I);
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+  const Type *Ty = Op0->getType();
+
+  // icmp X, X
+  if (Op0 == Op1)
+    return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 
+                                                   I.isTrueWhenEqual()));
+
+  if (isa<UndefValue>(Op1))                  // X icmp undef -> undef
+    return ReplaceInstUsesWith(I, UndefValue::get(Type::Int1Ty));
+  
+  // icmp <global/alloca*/null>, <global/alloca*/null> - Global/Stack value
+  // addresses never equal each other!  We already know that Op0 != Op1.
+  if ((isa<GlobalValue>(Op0) || isa<AllocaInst>(Op0) ||
+       isa<ConstantPointerNull>(Op0)) &&
+      (isa<GlobalValue>(Op1) || isa<AllocaInst>(Op1) ||
+       isa<ConstantPointerNull>(Op1)))
+    return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 
+                                                   !I.isTrueWhenEqual()));
+
+  // icmp's with boolean values can always be turned into bitwise operations
+  if (Ty == Type::Int1Ty) {
+    switch (I.getPredicate()) {
+    default: assert(0 && "Invalid icmp instruction!");
+    case ICmpInst::ICMP_EQ: {               // icmp eq i1 A, B -> ~(A^B)
+      Instruction *Xor = BinaryOperator::CreateXor(Op0, Op1, I.getName()+"tmp");
+      InsertNewInstBefore(Xor, I);
+      return BinaryOperator::CreateNot(Xor);
+    }
+    case ICmpInst::ICMP_NE:                  // icmp eq i1 A, B -> A^B
+      return BinaryOperator::CreateXor(Op0, Op1);
+
+    case ICmpInst::ICMP_UGT:
+      std::swap(Op0, Op1);                   // Change icmp ugt -> icmp ult
+      // FALL THROUGH
+    case ICmpInst::ICMP_ULT:{               // icmp ult i1 A, B -> ~A & B
+      Instruction *Not = BinaryOperator::CreateNot(Op0, I.getName()+"tmp");
+      InsertNewInstBefore(Not, I);
+      return BinaryOperator::CreateAnd(Not, Op1);
+    }
+    case ICmpInst::ICMP_SGT:
+      std::swap(Op0, Op1);                   // Change icmp sgt -> icmp slt
+      // FALL THROUGH
+    case ICmpInst::ICMP_SLT: {               // icmp slt i1 A, B -> A & ~B
+      Instruction *Not = BinaryOperator::CreateNot(Op1, I.getName()+"tmp");
+      InsertNewInstBefore(Not, I);
+      return BinaryOperator::CreateAnd(Not, Op0);
+    }
+    case ICmpInst::ICMP_UGE:
+      std::swap(Op0, Op1);                   // Change icmp uge -> icmp ule
+      // FALL THROUGH
+    case ICmpInst::ICMP_ULE: {               //  icmp ule i1 A, B -> ~A | B
+      Instruction *Not = BinaryOperator::CreateNot(Op0, I.getName()+"tmp");
+      InsertNewInstBefore(Not, I);
+      return BinaryOperator::CreateOr(Not, Op1);
+    }
+    case ICmpInst::ICMP_SGE:
+      std::swap(Op0, Op1);                   // Change icmp sge -> icmp sle
+      // FALL THROUGH
+    case ICmpInst::ICMP_SLE: {               //  icmp sle i1 A, B -> A | ~B
+      Instruction *Not = BinaryOperator::CreateNot(Op1, I.getName()+"tmp");
+      InsertNewInstBefore(Not, I);
+      return BinaryOperator::CreateOr(Not, Op0);
+    }
+    }
+  }
+
+  unsigned BitWidth = 0;
+  if (TD)
+    BitWidth = TD->getTypeSizeInBits(Ty);
+  else if (isa<IntegerType>(Ty))
+    BitWidth = Ty->getPrimitiveSizeInBits();
+
+  bool isSignBit = false;
+
+  // See if we are doing a comparison with a constant.
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
+    Value *A = 0, *B = 0;
+    
+    // (icmp ne/eq (sub A B) 0) -> (icmp ne/eq A, B)
+    if (I.isEquality() && CI->isNullValue() &&
+        match(Op0, m_Sub(m_Value(A), m_Value(B)))) {
+      // (icmp cond A B) if cond is equality
+      return new ICmpInst(I.getPredicate(), A, B);
+    }
+    
+    // If we have an icmp le or icmp ge instruction, turn it into the
+    // appropriate icmp lt or icmp gt instruction.  This allows us to rely on
+    // them being folded in the code below.
+    switch (I.getPredicate()) {
+    default: break;
+    case ICmpInst::ICMP_ULE:
+      if (CI->isMaxValue(false))                 // A <=u MAX -> TRUE
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+      return new ICmpInst(ICmpInst::ICMP_ULT, Op0, AddOne(CI));
+    case ICmpInst::ICMP_SLE:
+      if (CI->isMaxValue(true))                  // A <=s MAX -> TRUE
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+      return new ICmpInst(ICmpInst::ICMP_SLT, Op0, AddOne(CI));
+    case ICmpInst::ICMP_UGE:
+      if (CI->isMinValue(false))                 // A >=u MIN -> TRUE
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+      return new ICmpInst( ICmpInst::ICMP_UGT, Op0, SubOne(CI));
+    case ICmpInst::ICMP_SGE:
+      if (CI->isMinValue(true))                  // A >=s MIN -> TRUE
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+      return new ICmpInst(ICmpInst::ICMP_SGT, Op0, SubOne(CI));
+    }
+    
+    // If this comparison is a normal comparison, it demands all
+    // bits, if it is a sign bit comparison, it only demands the sign bit.
+    bool UnusedBit;
+    isSignBit = isSignBitCheck(I.getPredicate(), CI, UnusedBit);
+  }
+
+  // See if we can fold the comparison based on range information we can get
+  // by checking whether bits are known to be zero or one in the input.
+  if (BitWidth != 0) {
+    APInt Op0KnownZero(BitWidth, 0), Op0KnownOne(BitWidth, 0);
+    APInt Op1KnownZero(BitWidth, 0), Op1KnownOne(BitWidth, 0);
+
+    if (SimplifyDemandedBits(I.getOperandUse(0),
+                             isSignBit ? APInt::getSignBit(BitWidth)
+                                       : APInt::getAllOnesValue(BitWidth),
+                             Op0KnownZero, Op0KnownOne, 0))
+      return &I;
+    if (SimplifyDemandedBits(I.getOperandUse(1),
+                             APInt::getAllOnesValue(BitWidth),
+                             Op1KnownZero, Op1KnownOne, 0))
+      return &I;
+
+    // Given the known and unknown bits, compute a range that the LHS could be
+    // in.  Compute the Min, Max and RHS values based on the known bits. For the
+    // EQ and NE we use unsigned values.
+    APInt Op0Min(BitWidth, 0), Op0Max(BitWidth, 0);
+    APInt Op1Min(BitWidth, 0), Op1Max(BitWidth, 0);
+    if (ICmpInst::isSignedPredicate(I.getPredicate())) {
+      ComputeSignedMinMaxValuesFromKnownBits(Op0KnownZero, Op0KnownOne,
+                                             Op0Min, Op0Max);
+      ComputeSignedMinMaxValuesFromKnownBits(Op1KnownZero, Op1KnownOne,
+                                             Op1Min, Op1Max);
+    } else {
+      ComputeUnsignedMinMaxValuesFromKnownBits(Op0KnownZero, Op0KnownOne,
+                                               Op0Min, Op0Max);
+      ComputeUnsignedMinMaxValuesFromKnownBits(Op1KnownZero, Op1KnownOne,
+                                               Op1Min, Op1Max);
+    }
+
+    // If Min and Max are known to be the same, then SimplifyDemandedBits
+    // figured out that the LHS is a constant.  Just constant fold this now so
+    // that code below can assume that Min != Max.
+    if (!isa<Constant>(Op0) && Op0Min == Op0Max)
+      return new ICmpInst(I.getPredicate(), ConstantInt::get(Op0Min), Op1);
+    if (!isa<Constant>(Op1) && Op1Min == Op1Max)
+      return new ICmpInst(I.getPredicate(), Op0, ConstantInt::get(Op1Min));
+
+    // Based on the range information we know about the LHS, see if we can
+    // simplify this comparison.  For example, (x&4) < 8  is always true.
+    switch (I.getPredicate()) {
+    default: assert(0 && "Unknown icmp opcode!");
+    case ICmpInst::ICMP_EQ:
+      if (Op0Max.ult(Op1Min) || Op0Min.ugt(Op1Max))
+        return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+      break;
+    case ICmpInst::ICMP_NE:
+      if (Op0Max.ult(Op1Min) || Op0Min.ugt(Op1Max))
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+      break;
+    case ICmpInst::ICMP_ULT:
+      if (Op0Max.ult(Op1Min))          // A <u B -> true if max(A) < min(B)
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+      if (Op0Min.uge(Op1Max))          // A <u B -> false if min(A) >= max(B)
+        return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+      if (Op1Min == Op0Max)            // A <u B -> A != B if max(A) == min(B)
+        return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
+        if (Op1Max == Op0Min+1)        // A <u C -> A == C-1 if min(A)+1 == C
+          return new ICmpInst(ICmpInst::ICMP_EQ, Op0, SubOne(CI));
+
+        // (x <u 2147483648) -> (x >s -1)  -> true if sign bit clear
+        if (CI->isMinValue(true))
+          return new ICmpInst(ICmpInst::ICMP_SGT, Op0,
+                            ConstantInt::getAllOnesValue(Op0->getType()));
+      }
+      break;
+    case ICmpInst::ICMP_UGT:
+      if (Op0Min.ugt(Op1Max))          // A >u B -> true if min(A) > max(B)
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+      if (Op0Max.ule(Op1Min))          // A >u B -> false if max(A) <= max(B)
+        return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+
+      if (Op1Max == Op0Min)            // A >u B -> A != B if min(A) == max(B)
+        return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
+        if (Op1Min == Op0Max-1)        // A >u C -> A == C+1 if max(a)-1 == C
+          return new ICmpInst(ICmpInst::ICMP_EQ, Op0, AddOne(CI));
+
+        // (x >u 2147483647) -> (x <s 0)  -> true if sign bit set
+        if (CI->isMaxValue(true))
+          return new ICmpInst(ICmpInst::ICMP_SLT, Op0,
+                              ConstantInt::getNullValue(Op0->getType()));
+      }
+      break;
+    case ICmpInst::ICMP_SLT:
+      if (Op0Max.slt(Op1Min))          // A <s B -> true if max(A) < min(C)
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+      if (Op0Min.sge(Op1Max))          // A <s B -> false if min(A) >= max(C)
+        return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+      if (Op1Min == Op0Max)            // A <s B -> A != B if max(A) == min(B)
+        return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
+        if (Op1Max == Op0Min+1)        // A <s C -> A == C-1 if min(A)+1 == C
+          return new ICmpInst(ICmpInst::ICMP_EQ, Op0, SubOne(CI));
+      }
+      break;
+    case ICmpInst::ICMP_SGT:
+      if (Op0Min.sgt(Op1Max))          // A >s B -> true if min(A) > max(B)
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+      if (Op0Max.sle(Op1Min))          // A >s B -> false if max(A) <= min(B)
+        return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+
+      if (Op1Max == Op0Min)            // A >s B -> A != B if min(A) == max(B)
+        return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
+        if (Op1Min == Op0Max-1)        // A >s C -> A == C+1 if max(A)-1 == C
+          return new ICmpInst(ICmpInst::ICMP_EQ, Op0, AddOne(CI));
+      }
+      break;
+    case ICmpInst::ICMP_SGE:
+      assert(!isa<ConstantInt>(Op1) && "ICMP_SGE with ConstantInt not folded!");
+      if (Op0Min.sge(Op1Max))          // A >=s B -> true if min(A) >= max(B)
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+      if (Op0Max.slt(Op1Min))          // A >=s B -> false if max(A) < min(B)
+        return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+      break;
+    case ICmpInst::ICMP_SLE:
+      assert(!isa<ConstantInt>(Op1) && "ICMP_SLE with ConstantInt not folded!");
+      if (Op0Max.sle(Op1Min))          // A <=s B -> true if max(A) <= min(B)
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+      if (Op0Min.sgt(Op1Max))          // A <=s B -> false if min(A) > max(B)
+        return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+      break;
+    case ICmpInst::ICMP_UGE:
+      assert(!isa<ConstantInt>(Op1) && "ICMP_UGE with ConstantInt not folded!");
+      if (Op0Min.uge(Op1Max))          // A >=u B -> true if min(A) >= max(B)
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+      if (Op0Max.ult(Op1Min))          // A >=u B -> false if max(A) < min(B)
+        return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+      break;
+    case ICmpInst::ICMP_ULE:
+      assert(!isa<ConstantInt>(Op1) && "ICMP_ULE with ConstantInt not folded!");
+      if (Op0Max.ule(Op1Min))          // A <=u B -> true if max(A) <= min(B)
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+      if (Op0Min.ugt(Op1Max))          // A <=u B -> false if min(A) > max(B)
+        return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+      break;
+    }
+
+    // Turn a signed comparison into an unsigned one if both operands
+    // are known to have the same sign.
+    if (I.isSignedPredicate() &&
+        ((Op0KnownZero.isNegative() && Op1KnownZero.isNegative()) ||
+         (Op0KnownOne.isNegative() && Op1KnownOne.isNegative())))
+      return new ICmpInst(I.getUnsignedPredicate(), Op0, Op1);
+  }
+
+  // Test if the ICmpInst instruction is used exclusively by a select as
+  // part of a minimum or maximum operation. If so, refrain from doing
+  // any other folding. This helps out other analyses which understand
+  // non-obfuscated minimum and maximum idioms, such as ScalarEvolution
+  // and CodeGen. And in this case, at least one of the comparison
+  // operands has at least one user besides the compare (the select),
+  // which would often largely negate the benefit of folding anyway.
+  if (I.hasOneUse())
+    if (SelectInst *SI = dyn_cast<SelectInst>(*I.use_begin()))
+      if ((SI->getOperand(1) == Op0 && SI->getOperand(2) == Op1) ||
+          (SI->getOperand(2) == Op0 && SI->getOperand(1) == Op1))
+        return 0;
+
+  // See if we are doing a comparison between a constant and an instruction that
+  // can be folded into the comparison.
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
+    // Since the RHS is a ConstantInt (CI), if the left hand side is an 
+    // instruction, see if that instruction also has constants so that the 
+    // instruction can be folded into the icmp 
+    if (Instruction *LHSI = dyn_cast<Instruction>(Op0))
+      if (Instruction *Res = visitICmpInstWithInstAndIntCst(I, LHSI, CI))
+        return Res;
+  }
+
+  // Handle icmp with constant (but not simple integer constant) RHS
+  if (Constant *RHSC = dyn_cast<Constant>(Op1)) {
+    if (Instruction *LHSI = dyn_cast<Instruction>(Op0))
+      switch (LHSI->getOpcode()) {
+      case Instruction::GetElementPtr:
+        if (RHSC->isNullValue()) {
+          // icmp pred GEP (P, int 0, int 0, int 0), null -> icmp pred P, null
+          bool isAllZeros = true;
+          for (unsigned i = 1, e = LHSI->getNumOperands(); i != e; ++i)
+            if (!isa<Constant>(LHSI->getOperand(i)) ||
+                !cast<Constant>(LHSI->getOperand(i))->isNullValue()) {
+              isAllZeros = false;
+              break;
+            }
+          if (isAllZeros)
+            return new ICmpInst(I.getPredicate(), LHSI->getOperand(0),
+                    Constant::getNullValue(LHSI->getOperand(0)->getType()));
+        }
+        break;
+
+      case Instruction::PHI:
+        // Only fold icmp into the PHI if the phi and fcmp are in the same
+        // block.  If in the same block, we're encouraging jump threading.  If
+        // not, we are just pessimizing the code by making an i1 phi.
+        if (LHSI->getParent() == I.getParent())
+          if (Instruction *NV = FoldOpIntoPhi(I))
+            return NV;
+        break;
+      case Instruction::Select: {
+        // If either operand of the select is a constant, we can fold the
+        // comparison into the select arms, which will cause one to be
+        // constant folded and the select turned into a bitwise or.
+        Value *Op1 = 0, *Op2 = 0;
+        if (LHSI->hasOneUse()) {
+          if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1))) {
+            // Fold the known value into the constant operand.
+            Op1 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);
+            // Insert a new ICmp of the other select operand.
+            Op2 = InsertNewInstBefore(new ICmpInst(I.getPredicate(),
+                                                   LHSI->getOperand(2), RHSC,
+                                                   I.getName()), I);
+          } else if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(2))) {
+            // Fold the known value into the constant operand.
+            Op2 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);
+            // Insert a new ICmp of the other select operand.
+            Op1 = InsertNewInstBefore(new ICmpInst(I.getPredicate(),
+                                                   LHSI->getOperand(1), RHSC,
+                                                   I.getName()), I);
+          }
+        }
+
+        if (Op1)
+          return SelectInst::Create(LHSI->getOperand(0), Op1, Op2);
+        break;
+      }
+      case Instruction::Malloc:
+        // If we have (malloc != null), and if the malloc has a single use, we
+        // can assume it is successful and remove the malloc.
+        if (LHSI->hasOneUse() && isa<ConstantPointerNull>(RHSC)) {
+          AddToWorkList(LHSI);
+          return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty,
+                                                         !I.isTrueWhenEqual()));
+        }
+        break;
+      }
+  }
+
+  // If we can optimize a 'icmp GEP, P' or 'icmp P, GEP', do so now.
+  if (User *GEP = dyn_castGetElementPtr(Op0))
+    if (Instruction *NI = FoldGEPICmp(GEP, Op1, I.getPredicate(), I))
+      return NI;
+  if (User *GEP = dyn_castGetElementPtr(Op1))
+    if (Instruction *NI = FoldGEPICmp(GEP, Op0,
+                           ICmpInst::getSwappedPredicate(I.getPredicate()), I))
+      return NI;
+
+  // Test to see if the operands of the icmp are casted versions of other
+  // values.  If the ptr->ptr cast can be stripped off both arguments, we do so
+  // now.
+  if (BitCastInst *CI = dyn_cast<BitCastInst>(Op0)) {
+    if (isa<PointerType>(Op0->getType()) && 
+        (isa<Constant>(Op1) || isa<BitCastInst>(Op1))) { 
+      // We keep moving the cast from the left operand over to the right
+      // operand, where it can often be eliminated completely.
+      Op0 = CI->getOperand(0);
+
+      // If operand #1 is a bitcast instruction, it must also be a ptr->ptr cast
+      // so eliminate it as well.
+      if (BitCastInst *CI2 = dyn_cast<BitCastInst>(Op1))
+        Op1 = CI2->getOperand(0);
+
+      // If Op1 is a constant, we can fold the cast into the constant.
+      if (Op0->getType() != Op1->getType()) {
+        if (Constant *Op1C = dyn_cast<Constant>(Op1)) {
+          Op1 = ConstantExpr::getBitCast(Op1C, Op0->getType());
+        } else {
+          // Otherwise, cast the RHS right before the icmp
+          Op1 = InsertBitCastBefore(Op1, Op0->getType(), I);
+        }
+      }
+      return new ICmpInst(I.getPredicate(), Op0, Op1);
+    }
+  }
+  
+  if (isa<CastInst>(Op0)) {
+    // Handle the special case of: icmp (cast bool to X), <cst>
+    // This comes up when you have code like
+    //   int X = A < B;
+    //   if (X) ...
+    // For generality, we handle any zero-extension of any operand comparison
+    // with a constant or another cast from the same type.
+    if (isa<ConstantInt>(Op1) || isa<CastInst>(Op1))
+      if (Instruction *R = visitICmpInstWithCastAndCast(I))
+        return R;
+  }
+  
+  // See if it's the same type of instruction on the left and right.
+  if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0)) {
+    if (BinaryOperator *Op1I = dyn_cast<BinaryOperator>(Op1)) {
+      if (Op0I->getOpcode() == Op1I->getOpcode() && Op0I->hasOneUse() &&
+          Op1I->hasOneUse() && Op0I->getOperand(1) == Op1I->getOperand(1)) {
+        switch (Op0I->getOpcode()) {
+        default: break;
+        case Instruction::Add:
+        case Instruction::Sub:
+        case Instruction::Xor:
+          if (I.isEquality())    // a+x icmp eq/ne b+x --> a icmp b
+            return new ICmpInst(I.getPredicate(), Op0I->getOperand(0),
+                                Op1I->getOperand(0));
+          // icmp u/s (a ^ signbit), (b ^ signbit) --> icmp s/u a, b
+          if (ConstantInt *CI = dyn_cast<ConstantInt>(Op0I->getOperand(1))) {
+            if (CI->getValue().isSignBit()) {
+              ICmpInst::Predicate Pred = I.isSignedPredicate()
+                                             ? I.getUnsignedPredicate()
+                                             : I.getSignedPredicate();
+              return new ICmpInst(Pred, Op0I->getOperand(0),
+                                  Op1I->getOperand(0));
+            }
+            
+            if (CI->getValue().isMaxSignedValue()) {
+              ICmpInst::Predicate Pred = I.isSignedPredicate()
+                                             ? I.getUnsignedPredicate()
+                                             : I.getSignedPredicate();
+              Pred = I.getSwappedPredicate(Pred);
+              return new ICmpInst(Pred, Op0I->getOperand(0),
+                                  Op1I->getOperand(0));
+            }
+          }
+          break;
+        case Instruction::Mul:
+          if (!I.isEquality())
+            break;
+
+          if (ConstantInt *CI = dyn_cast<ConstantInt>(Op0I->getOperand(1))) {
+            // a * Cst icmp eq/ne b * Cst --> a & Mask icmp b & Mask
+            // Mask = -1 >> count-trailing-zeros(Cst).
+            if (!CI->isZero() && !CI->isOne()) {
+              const APInt &AP = CI->getValue();
+              ConstantInt *Mask = ConstantInt::get(
+                                      APInt::getLowBitsSet(AP.getBitWidth(),
+                                                           AP.getBitWidth() -
+                                                      AP.countTrailingZeros()));
+              Instruction *And1 = BinaryOperator::CreateAnd(Op0I->getOperand(0),
+                                                            Mask);
+              Instruction *And2 = BinaryOperator::CreateAnd(Op1I->getOperand(0),
+                                                            Mask);
+              InsertNewInstBefore(And1, I);
+              InsertNewInstBefore(And2, I);
+              return new ICmpInst(I.getPredicate(), And1, And2);
+            }
+          }
+          break;
+        }
+      }
+    }
+  }
+  
+  // ~x < ~y --> y < x
+  { Value *A, *B;
+    if (match(Op0, m_Not(m_Value(A))) &&
+        match(Op1, m_Not(m_Value(B))))
+      return new ICmpInst(I.getPredicate(), B, A);
+  }
+  
+  if (I.isEquality()) {
+    Value *A, *B, *C, *D;
+    
+    // -x == -y --> x == y
+    if (match(Op0, m_Neg(m_Value(A))) &&
+        match(Op1, m_Neg(m_Value(B))))
+      return new ICmpInst(I.getPredicate(), A, B);
+    
+    if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) {
+      if (A == Op1 || B == Op1) {    // (A^B) == A  ->  B == 0
+        Value *OtherVal = A == Op1 ? B : A;
+        return new ICmpInst(I.getPredicate(), OtherVal,
+                            Constant::getNullValue(A->getType()));
+      }
+
+      if (match(Op1, m_Xor(m_Value(C), m_Value(D)))) {
+        // A^c1 == C^c2 --> A == C^(c1^c2)
+        ConstantInt *C1, *C2;
+        if (match(B, m_ConstantInt(C1)) &&
+            match(D, m_ConstantInt(C2)) && Op1->hasOneUse()) {
+          Constant *NC = ConstantInt::get(C1->getValue() ^ C2->getValue());
+          Instruction *Xor = BinaryOperator::CreateXor(C, NC, "tmp");
+          return new ICmpInst(I.getPredicate(), A,
+                              InsertNewInstBefore(Xor, I));
+        }
+        
+        // A^B == A^D -> B == D
+        if (A == C) return new ICmpInst(I.getPredicate(), B, D);
+        if (A == D) return new ICmpInst(I.getPredicate(), B, C);
+        if (B == C) return new ICmpInst(I.getPredicate(), A, D);
+        if (B == D) return new ICmpInst(I.getPredicate(), A, C);
+      }
+    }
+    
+    if (match(Op1, m_Xor(m_Value(A), m_Value(B))) &&
+        (A == Op0 || B == Op0)) {
+      // A == (A^B)  ->  B == 0
+      Value *OtherVal = A == Op0 ? B : A;
+      return new ICmpInst(I.getPredicate(), OtherVal,
+                          Constant::getNullValue(A->getType()));
+    }
+
+    // (A-B) == A  ->  B == 0
+    if (match(Op0, m_Sub(m_Specific(Op1), m_Value(B))))
+      return new ICmpInst(I.getPredicate(), B, 
+                          Constant::getNullValue(B->getType()));
+
+    // A == (A-B)  ->  B == 0
+    if (match(Op1, m_Sub(m_Specific(Op0), m_Value(B))))
+      return new ICmpInst(I.getPredicate(), B,
+                          Constant::getNullValue(B->getType()));
+    
+    // (X&Z) == (Y&Z) -> (X^Y) & Z == 0
+    if (Op0->hasOneUse() && Op1->hasOneUse() &&
+        match(Op0, m_And(m_Value(A), m_Value(B))) && 
+        match(Op1, m_And(m_Value(C), m_Value(D)))) {
+      Value *X = 0, *Y = 0, *Z = 0;
+      
+      if (A == C) {
+        X = B; Y = D; Z = A;
+      } else if (A == D) {
+        X = B; Y = C; Z = A;
+      } else if (B == C) {
+        X = A; Y = D; Z = B;
+      } else if (B == D) {
+        X = A; Y = C; Z = B;
+      }
+      
+      if (X) {   // Build (X^Y) & Z
+        Op1 = InsertNewInstBefore(BinaryOperator::CreateXor(X, Y, "tmp"), I);
+        Op1 = InsertNewInstBefore(BinaryOperator::CreateAnd(Op1, Z, "tmp"), I);
+        I.setOperand(0, Op1);
+        I.setOperand(1, Constant::getNullValue(Op1->getType()));
+        return &I;
+      }
+    }
+  }
+  return Changed ? &I : 0;
+}
+
+
+/// FoldICmpDivCst - Fold "icmp pred, ([su]div X, DivRHS), CmpRHS" where DivRHS
+/// and CmpRHS are both known to be integer constants.
+Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,
+                                          ConstantInt *DivRHS) {
+  ConstantInt *CmpRHS = cast<ConstantInt>(ICI.getOperand(1));
+  const APInt &CmpRHSV = CmpRHS->getValue();
+  
+  // FIXME: If the operand types don't match the type of the divide 
+  // then don't attempt this transform. The code below doesn't have the
+  // logic to deal with a signed divide and an unsigned compare (and
+  // vice versa). This is because (x /s C1) <s C2  produces different 
+  // results than (x /s C1) <u C2 or (x /u C1) <s C2 or even
+  // (x /u C1) <u C2.  Simply casting the operands and result won't 
+  // work. :(  The if statement below tests that condition and bails 
+  // if it finds it. 
+  bool DivIsSigned = DivI->getOpcode() == Instruction::SDiv;
+  if (!ICI.isEquality() && DivIsSigned != ICI.isSignedPredicate())
+    return 0;
+  if (DivRHS->isZero())
+    return 0; // The ProdOV computation fails on divide by zero.
+  if (DivIsSigned && DivRHS->isAllOnesValue())
+    return 0; // The overflow computation also screws up here
+  if (DivRHS->isOne())
+    return 0; // Not worth bothering, and eliminates some funny cases
+              // with INT_MIN.
+
+  // Compute Prod = CI * DivRHS. We are essentially solving an equation
+  // of form X/C1=C2. We solve for X by multiplying C1 (DivRHS) and 
+  // C2 (CI). By solving for X we can turn this into a range check 
+  // instead of computing a divide. 
+  ConstantInt *Prod = Multiply(CmpRHS, DivRHS);
+
+  // Determine if the product overflows by seeing if the product is
+  // not equal to the divide. Make sure we do the same kind of divide
+  // as in the LHS instruction that we're folding. 
+  bool ProdOV = (DivIsSigned ? ConstantExpr::getSDiv(Prod, DivRHS) :
+                 ConstantExpr::getUDiv(Prod, DivRHS)) != CmpRHS;
+
+  // Get the ICmp opcode
+  ICmpInst::Predicate Pred = ICI.getPredicate();
+
+  // Figure out the interval that is being checked.  For example, a comparison
+  // like "X /u 5 == 0" is really checking that X is in the interval [0, 5). 
+  // Compute this interval based on the constants involved and the signedness of
+  // the compare/divide.  This computes a half-open interval, keeping track of
+  // whether either value in the interval overflows.  After analysis each
+  // overflow variable is set to 0 if it's corresponding bound variable is valid
+  // -1 if overflowed off the bottom end, or +1 if overflowed off the top end.
+  int LoOverflow = 0, HiOverflow = 0;
+  ConstantInt *LoBound = 0, *HiBound = 0;
+  
+  if (!DivIsSigned) {  // udiv
+    // e.g. X/5 op 3  --> [15, 20)
+    LoBound = Prod;
+    HiOverflow = LoOverflow = ProdOV;
+    if (!HiOverflow)
+      HiOverflow = AddWithOverflow(HiBound, LoBound, DivRHS, false);
+  } else if (DivRHS->getValue().isStrictlyPositive()) { // Divisor is > 0.
+    if (CmpRHSV == 0) {       // (X / pos) op 0
+      // Can't overflow.  e.g.  X/2 op 0 --> [-1, 2)
+      LoBound = cast<ConstantInt>(ConstantExpr::getNeg(SubOne(DivRHS)));
+      HiBound = DivRHS;
+    } else if (CmpRHSV.isStrictlyPositive()) {   // (X / pos) op pos
+      LoBound = Prod;     // e.g.   X/5 op 3 --> [15, 20)
+      HiOverflow = LoOverflow = ProdOV;
+      if (!HiOverflow)
+        HiOverflow = AddWithOverflow(HiBound, Prod, DivRHS, true);
+    } else {                       // (X / pos) op neg
+      // e.g. X/5 op -3  --> [-15-4, -15+1) --> [-19, -14)
+      HiBound = AddOne(Prod);
+      LoOverflow = HiOverflow = ProdOV ? -1 : 0;
+      if (!LoOverflow) {
+        ConstantInt* DivNeg = cast<ConstantInt>(ConstantExpr::getNeg(DivRHS));
+        LoOverflow = AddWithOverflow(LoBound, HiBound, DivNeg,
+                                     true) ? -1 : 0;
+       }
+    }
+  } else if (DivRHS->getValue().isNegative()) { // Divisor is < 0.
+    if (CmpRHSV == 0) {       // (X / neg) op 0
+      // e.g. X/-5 op 0  --> [-4, 5)
+      LoBound = AddOne(DivRHS);
+      HiBound = cast<ConstantInt>(ConstantExpr::getNeg(DivRHS));
+      if (HiBound == DivRHS) {     // -INTMIN = INTMIN
+        HiOverflow = 1;            // [INTMIN+1, overflow)
+        HiBound = 0;               // e.g. X/INTMIN = 0 --> X > INTMIN
+      }
+    } else if (CmpRHSV.isStrictlyPositive()) {   // (X / neg) op pos
+      // e.g. X/-5 op 3  --> [-19, -14)
+      HiBound = AddOne(Prod);
+      HiOverflow = LoOverflow = ProdOV ? -1 : 0;
+      if (!LoOverflow)
+        LoOverflow = AddWithOverflow(LoBound, HiBound, DivRHS, true) ? -1 : 0;
+    } else {                       // (X / neg) op neg
+      LoBound = Prod;       // e.g. X/-5 op -3  --> [15, 20)
+      LoOverflow = HiOverflow = ProdOV;
+      if (!HiOverflow)
+        HiOverflow = SubWithOverflow(HiBound, Prod, DivRHS, true);
+    }
+    
+    // Dividing by a negative swaps the condition.  LT <-> GT
+    Pred = ICmpInst::getSwappedPredicate(Pred);
+  }
+
+  Value *X = DivI->getOperand(0);
+  switch (Pred) {
+  default: assert(0 && "Unhandled icmp opcode!");
+  case ICmpInst::ICMP_EQ:
+    if (LoOverflow && HiOverflow)
+      return ReplaceInstUsesWith(ICI, ConstantInt::getFalse());
+    else if (HiOverflow)
+      return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE : 
+                          ICmpInst::ICMP_UGE, X, LoBound);
+    else if (LoOverflow)
+      return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT : 
+                          ICmpInst::ICMP_ULT, X, HiBound);
+    else
+      return InsertRangeTest(X, LoBound, HiBound, DivIsSigned, true, ICI);
+  case ICmpInst::ICMP_NE:
+    if (LoOverflow && HiOverflow)
+      return ReplaceInstUsesWith(ICI, ConstantInt::getTrue());
+    else if (HiOverflow)
+      return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT : 
+                          ICmpInst::ICMP_ULT, X, LoBound);
+    else if (LoOverflow)
+      return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE : 
+                          ICmpInst::ICMP_UGE, X, HiBound);
+    else
+      return InsertRangeTest(X, LoBound, HiBound, DivIsSigned, false, ICI);
+  case ICmpInst::ICMP_ULT:
+  case ICmpInst::ICMP_SLT:
+    if (LoOverflow == +1)   // Low bound is greater than input range.
+      return ReplaceInstUsesWith(ICI, ConstantInt::getTrue());
+    if (LoOverflow == -1)   // Low bound is less than input range.
+      return ReplaceInstUsesWith(ICI, ConstantInt::getFalse());
+    return new ICmpInst(Pred, X, LoBound);
+  case ICmpInst::ICMP_UGT:
+  case ICmpInst::ICMP_SGT:
+    if (HiOverflow == +1)       // High bound greater than input range.
+      return ReplaceInstUsesWith(ICI, ConstantInt::getFalse());
+    else if (HiOverflow == -1)  // High bound less than input range.
+      return ReplaceInstUsesWith(ICI, ConstantInt::getTrue());
+    if (Pred == ICmpInst::ICMP_UGT)
+      return new ICmpInst(ICmpInst::ICMP_UGE, X, HiBound);
+    else
+      return new ICmpInst(ICmpInst::ICMP_SGE, X, HiBound);
+  }
+}
+
+
+/// visitICmpInstWithInstAndIntCst - Handle "icmp (instr, intcst)".
+///
+Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
+                                                          Instruction *LHSI,
+                                                          ConstantInt *RHS) {
+  const APInt &RHSV = RHS->getValue();
+  
+  switch (LHSI->getOpcode()) {
+  case Instruction::Trunc:
+    if (ICI.isEquality() && LHSI->hasOneUse()) {
+      // Simplify icmp eq (trunc x to i8), 42 -> icmp eq x, 42|highbits if all
+      // of the high bits truncated out of x are known.
+      unsigned DstBits = LHSI->getType()->getPrimitiveSizeInBits(),
+             SrcBits = LHSI->getOperand(0)->getType()->getPrimitiveSizeInBits();
+      APInt Mask(APInt::getHighBitsSet(SrcBits, SrcBits-DstBits));
+      APInt KnownZero(SrcBits, 0), KnownOne(SrcBits, 0);
+      ComputeMaskedBits(LHSI->getOperand(0), Mask, KnownZero, KnownOne);
+      
+      // If all the high bits are known, we can do this xform.
+      if ((KnownZero|KnownOne).countLeadingOnes() >= SrcBits-DstBits) {
+        // Pull in the high bits from known-ones set.
+        APInt NewRHS(RHS->getValue());
+        NewRHS.zext(SrcBits);
+        NewRHS |= KnownOne;
+        return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0),
+                            ConstantInt::get(NewRHS));
+      }
+    }
+    break;
+      
+  case Instruction::Xor:         // (icmp pred (xor X, XorCST), CI)
+    if (ConstantInt *XorCST = dyn_cast<ConstantInt>(LHSI->getOperand(1))) {
+      // If this is a comparison that tests the signbit (X < 0) or (x > -1),
+      // fold the xor.
+      if ((ICI.getPredicate() == ICmpInst::ICMP_SLT && RHSV == 0) ||
+          (ICI.getPredicate() == ICmpInst::ICMP_SGT && RHSV.isAllOnesValue())) {
+        Value *CompareVal = LHSI->getOperand(0);
+        
+        // If the sign bit of the XorCST is not set, there is no change to
+        // the operation, just stop using the Xor.
+        if (!XorCST->getValue().isNegative()) {
+          ICI.setOperand(0, CompareVal);
+          AddToWorkList(LHSI);
+          return &ICI;
+        }
+        
+        // Was the old condition true if the operand is positive?
+        bool isTrueIfPositive = ICI.getPredicate() == ICmpInst::ICMP_SGT;
+        
+        // If so, the new one isn't.
+        isTrueIfPositive ^= true;
+        
+        if (isTrueIfPositive)
+          return new ICmpInst(ICmpInst::ICMP_SGT, CompareVal, SubOne(RHS));
+        else
+          return new ICmpInst(ICmpInst::ICMP_SLT, CompareVal, AddOne(RHS));
+      }
+
+      if (LHSI->hasOneUse()) {
+        // (icmp u/s (xor A SignBit), C) -> (icmp s/u A, (xor C SignBit))
+        if (!ICI.isEquality() && XorCST->getValue().isSignBit()) {
+          const APInt &SignBit = XorCST->getValue();
+          ICmpInst::Predicate Pred = ICI.isSignedPredicate()
+                                         ? ICI.getUnsignedPredicate()
+                                         : ICI.getSignedPredicate();
+          return new ICmpInst(Pred, LHSI->getOperand(0),
+                              ConstantInt::get(RHSV ^ SignBit));
+        }
+
+        // (icmp u/s (xor A ~SignBit), C) -> (icmp s/u (xor C ~SignBit), A)
+        if (!ICI.isEquality() && XorCST->getValue().isMaxSignedValue()) {
+          const APInt &NotSignBit = XorCST->getValue();
+          ICmpInst::Predicate Pred = ICI.isSignedPredicate()
+                                         ? ICI.getUnsignedPredicate()
+                                         : ICI.getSignedPredicate();
+          Pred = ICI.getSwappedPredicate(Pred);
+          return new ICmpInst(Pred, LHSI->getOperand(0),
+                              ConstantInt::get(RHSV ^ NotSignBit));
+        }
+      }
+    }
+    break;
+  case Instruction::And:         // (icmp pred (and X, AndCST), RHS)
+    if (LHSI->hasOneUse() && isa<ConstantInt>(LHSI->getOperand(1)) &&
+        LHSI->getOperand(0)->hasOneUse()) {
+      ConstantInt *AndCST = cast<ConstantInt>(LHSI->getOperand(1));
+      
+      // If the LHS is an AND of a truncating cast, we can widen the
+      // and/compare to be the input width without changing the value
+      // produced, eliminating a cast.
+      if (TruncInst *Cast = dyn_cast<TruncInst>(LHSI->getOperand(0))) {
+        // We can do this transformation if either the AND constant does not
+        // have its sign bit set or if it is an equality comparison. 
+        // Extending a relational comparison when we're checking the sign
+        // bit would not work.
+        if (Cast->hasOneUse() &&
+            (ICI.isEquality() ||
+             (AndCST->getValue().isNonNegative() && RHSV.isNonNegative()))) {
+          uint32_t BitWidth = 
+            cast<IntegerType>(Cast->getOperand(0)->getType())->getBitWidth();
+          APInt NewCST = AndCST->getValue();
+          NewCST.zext(BitWidth);
+          APInt NewCI = RHSV;
+          NewCI.zext(BitWidth);
+          Instruction *NewAnd = 
+            BinaryOperator::CreateAnd(Cast->getOperand(0),
+                                      ConstantInt::get(NewCST),LHSI->getName());
+          InsertNewInstBefore(NewAnd, ICI);
+          return new ICmpInst(ICI.getPredicate(), NewAnd,
+                              ConstantInt::get(NewCI));
+        }
+      }
+      
+      // If this is: (X >> C1) & C2 != C3 (where any shift and any compare
+      // could exist), turn it into (X & (C2 << C1)) != (C3 << C1).  This
+      // happens a LOT in code produced by the C front-end, for bitfield
+      // access.
+      BinaryOperator *Shift = dyn_cast<BinaryOperator>(LHSI->getOperand(0));
+      if (Shift && !Shift->isShift())
+        Shift = 0;
+      
+      ConstantInt *ShAmt;
+      ShAmt = Shift ? dyn_cast<ConstantInt>(Shift->getOperand(1)) : 0;
+      const Type *Ty = Shift ? Shift->getType() : 0;  // Type of the shift.
+      const Type *AndTy = AndCST->getType();          // Type of the and.
+      
+      // We can fold this as long as we can't shift unknown bits
+      // into the mask.  This can only happen with signed shift
+      // rights, as they sign-extend.
+      if (ShAmt) {
+        bool CanFold = Shift->isLogicalShift();
+        if (!CanFold) {
+          // To test for the bad case of the signed shr, see if any
+          // of the bits shifted in could be tested after the mask.
+          uint32_t TyBits = Ty->getPrimitiveSizeInBits();
+          int ShAmtVal = TyBits - ShAmt->getLimitedValue(TyBits);
+          
+          uint32_t BitWidth = AndTy->getPrimitiveSizeInBits();
+          if ((APInt::getHighBitsSet(BitWidth, BitWidth-ShAmtVal) & 
+               AndCST->getValue()) == 0)
+            CanFold = true;
+        }
+        
+        if (CanFold) {
+          Constant *NewCst;
+          if (Shift->getOpcode() == Instruction::Shl)
+            NewCst = ConstantExpr::getLShr(RHS, ShAmt);
+          else
+            NewCst = ConstantExpr::getShl(RHS, ShAmt);
+          
+          // Check to see if we are shifting out any of the bits being
+          // compared.
+          if (ConstantExpr::get(Shift->getOpcode(), NewCst, ShAmt) != RHS) {
+            // If we shifted bits out, the fold is not going to work out.
+            // As a special case, check to see if this means that the
+            // result is always true or false now.
+            if (ICI.getPredicate() == ICmpInst::ICMP_EQ)
+              return ReplaceInstUsesWith(ICI, ConstantInt::getFalse());
+            if (ICI.getPredicate() == ICmpInst::ICMP_NE)
+              return ReplaceInstUsesWith(ICI, ConstantInt::getTrue());
+          } else {
+            ICI.setOperand(1, NewCst);
+            Constant *NewAndCST;
+            if (Shift->getOpcode() == Instruction::Shl)
+              NewAndCST = ConstantExpr::getLShr(AndCST, ShAmt);
+            else
+              NewAndCST = ConstantExpr::getShl(AndCST, ShAmt);
+            LHSI->setOperand(1, NewAndCST);
+            LHSI->setOperand(0, Shift->getOperand(0));
+            AddToWorkList(Shift); // Shift is dead.
+            AddUsesToWorkList(ICI);
+            return &ICI;
+          }
+        }
+      }
+      
+      // Turn ((X >> Y) & C) == 0  into  (X & (C << Y)) == 0.  The later is
+      // preferable because it allows the C<<Y expression to be hoisted out
+      // of a loop if Y is invariant and X is not.
+      if (Shift && Shift->hasOneUse() && RHSV == 0 &&
+          ICI.isEquality() && !Shift->isArithmeticShift() &&
+          !isa<Constant>(Shift->getOperand(0))) {
+        // Compute C << Y.
+        Value *NS;
+        if (Shift->getOpcode() == Instruction::LShr) {
+          NS = BinaryOperator::CreateShl(AndCST, 
+                                         Shift->getOperand(1), "tmp");
+        } else {
+          // Insert a logical shift.
+          NS = BinaryOperator::CreateLShr(AndCST,
+                                          Shift->getOperand(1), "tmp");
+        }
+        InsertNewInstBefore(cast<Instruction>(NS), ICI);
+        
+        // Compute X & (C << Y).
+        Instruction *NewAnd = 
+          BinaryOperator::CreateAnd(Shift->getOperand(0), NS, LHSI->getName());
+        InsertNewInstBefore(NewAnd, ICI);
+        
+        ICI.setOperand(0, NewAnd);
+        return &ICI;
+      }
+    }
+    break;
+    
+  case Instruction::Shl: {       // (icmp pred (shl X, ShAmt), CI)
+    ConstantInt *ShAmt = dyn_cast<ConstantInt>(LHSI->getOperand(1));
+    if (!ShAmt) break;
+    
+    uint32_t TypeBits = RHSV.getBitWidth();
+    
+    // Check that the shift amount is in range.  If not, don't perform
+    // undefined shifts.  When the shift is visited it will be
+    // simplified.
+    if (ShAmt->uge(TypeBits))
+      break;
+    
+    if (ICI.isEquality()) {
+      // If we are comparing against bits always shifted out, the
+      // comparison cannot succeed.
+      Constant *Comp =
+        ConstantExpr::getShl(ConstantExpr::getLShr(RHS, ShAmt), ShAmt);
+      if (Comp != RHS) {// Comparing against a bit that we know is zero.
+        bool IsICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE;
+        Constant *Cst = ConstantInt::get(Type::Int1Ty, IsICMP_NE);
+        return ReplaceInstUsesWith(ICI, Cst);
+      }
+      
+      if (LHSI->hasOneUse()) {
+        // Otherwise strength reduce the shift into an and.
+        uint32_t ShAmtVal = (uint32_t)ShAmt->getLimitedValue(TypeBits);
+        Constant *Mask =
+          ConstantInt::get(APInt::getLowBitsSet(TypeBits, TypeBits-ShAmtVal));
+        
+        Instruction *AndI =
+          BinaryOperator::CreateAnd(LHSI->getOperand(0),
+                                    Mask, LHSI->getName()+".mask");
+        Value *And = InsertNewInstBefore(AndI, ICI);
+        return new ICmpInst(ICI.getPredicate(), And,
+                            ConstantInt::get(RHSV.lshr(ShAmtVal)));
+      }
+    }
+    
+    // Otherwise, if this is a comparison of the sign bit, simplify to and/test.
+    bool TrueIfSigned = false;
+    if (LHSI->hasOneUse() &&
+        isSignBitCheck(ICI.getPredicate(), RHS, TrueIfSigned)) {
+      // (X << 31) <s 0  --> (X&1) != 0
+      Constant *Mask = ConstantInt::get(APInt(TypeBits, 1) <<
+                                           (TypeBits-ShAmt->getZExtValue()-1));
+      Instruction *AndI =
+        BinaryOperator::CreateAnd(LHSI->getOperand(0),
+                                  Mask, LHSI->getName()+".mask");
+      Value *And = InsertNewInstBefore(AndI, ICI);
+      
+      return new ICmpInst(TrueIfSigned ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ,
+                          And, Constant::getNullValue(And->getType()));
+    }
+    break;
+  }
+    
+  case Instruction::LShr:         // (icmp pred (shr X, ShAmt), CI)
+  case Instruction::AShr: {
+    // Only handle equality comparisons of shift-by-constant.
+    ConstantInt *ShAmt = dyn_cast<ConstantInt>(LHSI->getOperand(1));
+    if (!ShAmt || !ICI.isEquality()) break;
+
+    // Check that the shift amount is in range.  If not, don't perform
+    // undefined shifts.  When the shift is visited it will be
+    // simplified.
+    uint32_t TypeBits = RHSV.getBitWidth();
+    if (ShAmt->uge(TypeBits))
+      break;
+    
+    uint32_t ShAmtVal = (uint32_t)ShAmt->getLimitedValue(TypeBits);
+      
+    // If we are comparing against bits always shifted out, the
+    // comparison cannot succeed.
+    APInt Comp = RHSV << ShAmtVal;
+    if (LHSI->getOpcode() == Instruction::LShr)
+      Comp = Comp.lshr(ShAmtVal);
+    else
+      Comp = Comp.ashr(ShAmtVal);
+    
+    if (Comp != RHSV) { // Comparing against a bit that we know is zero.
+      bool IsICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE;
+      Constant *Cst = ConstantInt::get(Type::Int1Ty, IsICMP_NE);
+      return ReplaceInstUsesWith(ICI, Cst);
+    }
+    
+    // Otherwise, check to see if the bits shifted out are known to be zero.
+    // If so, we can compare against the unshifted value:
+    //  (X & 4) >> 1 == 2  --> (X & 4) == 4.
+    if (LHSI->hasOneUse() &&
+        MaskedValueIsZero(LHSI->getOperand(0), 
+                          APInt::getLowBitsSet(Comp.getBitWidth(), ShAmtVal))) {
+      return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0),
+                          ConstantExpr::getShl(RHS, ShAmt));
+    }
+      
+    if (LHSI->hasOneUse()) {
+      // Otherwise strength reduce the shift into an and.
+      APInt Val(APInt::getHighBitsSet(TypeBits, TypeBits - ShAmtVal));
+      Constant *Mask = ConstantInt::get(Val);
+      
+      Instruction *AndI =
+        BinaryOperator::CreateAnd(LHSI->getOperand(0),
+                                  Mask, LHSI->getName()+".mask");
+      Value *And = InsertNewInstBefore(AndI, ICI);
+      return new ICmpInst(ICI.getPredicate(), And,
+                          ConstantExpr::getShl(RHS, ShAmt));
+    }
+    break;
+  }
+    
+  case Instruction::SDiv:
+  case Instruction::UDiv:
+    // Fold: icmp pred ([us]div X, C1), C2 -> range test
+    // Fold this div into the comparison, producing a range check. 
+    // Determine, based on the divide type, what the range is being 
+    // checked.  If there is an overflow on the low or high side, remember 
+    // it, otherwise compute the range [low, hi) bounding the new value.
+    // See: InsertRangeTest above for the kinds of replacements possible.
+    if (ConstantInt *DivRHS = dyn_cast<ConstantInt>(LHSI->getOperand(1)))
+      if (Instruction *R = FoldICmpDivCst(ICI, cast<BinaryOperator>(LHSI),
+                                          DivRHS))
+        return R;
+    break;
+
+  case Instruction::Add:
+    // Fold: icmp pred (add, X, C1), C2
+
+    if (!ICI.isEquality()) {
+      ConstantInt *LHSC = dyn_cast<ConstantInt>(LHSI->getOperand(1));
+      if (!LHSC) break;
+      const APInt &LHSV = LHSC->getValue();
+
+      ConstantRange CR = ICI.makeConstantRange(ICI.getPredicate(), RHSV)
+                            .subtract(LHSV);
+
+      if (ICI.isSignedPredicate()) {
+        if (CR.getLower().isSignBit()) {
+          return new ICmpInst(ICmpInst::ICMP_SLT, LHSI->getOperand(0),
+                              ConstantInt::get(CR.getUpper()));
+        } else if (CR.getUpper().isSignBit()) {
+          return new ICmpInst(ICmpInst::ICMP_SGE, LHSI->getOperand(0),
+                              ConstantInt::get(CR.getLower()));
+        }
+      } else {
+        if (CR.getLower().isMinValue()) {
+          return new ICmpInst(ICmpInst::ICMP_ULT, LHSI->getOperand(0),
+                              ConstantInt::get(CR.getUpper()));
+        } else if (CR.getUpper().isMinValue()) {
+          return new ICmpInst(ICmpInst::ICMP_UGE, LHSI->getOperand(0),
+                              ConstantInt::get(CR.getLower()));
+        }
+      }
+    }
+    break;
+  }
+  
+  // Simplify icmp_eq and icmp_ne instructions with integer constant RHS.
+  if (ICI.isEquality()) {
+    bool isICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE;
+    
+    // If the first operand is (add|sub|and|or|xor|rem) with a constant, and 
+    // the second operand is a constant, simplify a bit.
+    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(LHSI)) {
+      switch (BO->getOpcode()) {
+      case Instruction::SRem:
+        // If we have a signed (X % (2^c)) == 0, turn it into an unsigned one.
+        if (RHSV == 0 && isa<ConstantInt>(BO->getOperand(1)) &&BO->hasOneUse()){
+          const APInt &V = cast<ConstantInt>(BO->getOperand(1))->getValue();
+          if (V.sgt(APInt(V.getBitWidth(), 1)) && V.isPowerOf2()) {
+            Instruction *NewRem =
+              BinaryOperator::CreateURem(BO->getOperand(0), BO->getOperand(1),
+                                         BO->getName());
+            InsertNewInstBefore(NewRem, ICI);
+            return new ICmpInst(ICI.getPredicate(), NewRem, 
+                                Constant::getNullValue(BO->getType()));
+          }
+        }
+        break;
+      case Instruction::Add:
+        // Replace ((add A, B) != C) with (A != C-B) if B & C are constants.
+        if (ConstantInt *BOp1C = dyn_cast<ConstantInt>(BO->getOperand(1))) {
+          if (BO->hasOneUse())
+            return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
+                                Subtract(RHS, BOp1C));
+        } else if (RHSV == 0) {
+          // Replace ((add A, B) != 0) with (A != -B) if A or B is
+          // efficiently invertible, or if the add has just this one use.
+          Value *BOp0 = BO->getOperand(0), *BOp1 = BO->getOperand(1);
+          
+          if (Value *NegVal = dyn_castNegVal(BOp1))
+            return new ICmpInst(ICI.getPredicate(), BOp0, NegVal);
+          else if (Value *NegVal = dyn_castNegVal(BOp0))
+            return new ICmpInst(ICI.getPredicate(), NegVal, BOp1);
+          else if (BO->hasOneUse()) {
+            Instruction *Neg = BinaryOperator::CreateNeg(BOp1);
+            InsertNewInstBefore(Neg, ICI);
+            Neg->takeName(BO);
+            return new ICmpInst(ICI.getPredicate(), BOp0, Neg);
+          }
+        }
+        break;
+      case Instruction::Xor:
+        // For the xor case, we can xor two constants together, eliminating
+        // the explicit xor.
+        if (Constant *BOC = dyn_cast<Constant>(BO->getOperand(1)))
+          return new ICmpInst(ICI.getPredicate(), BO->getOperand(0), 
+                              ConstantExpr::getXor(RHS, BOC));
+        
+        // FALLTHROUGH
+      case Instruction::Sub:
+        // Replace (([sub|xor] A, B) != 0) with (A != B)
+        if (RHSV == 0)
+          return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
+                              BO->getOperand(1));
+        break;
+        
+      case Instruction::Or:
+        // If bits are being or'd in that are not present in the constant we
+        // are comparing against, then the comparison could never succeed!
+        if (Constant *BOC = dyn_cast<Constant>(BO->getOperand(1))) {
+          Constant *NotCI = ConstantExpr::getNot(RHS);
+          if (!ConstantExpr::getAnd(BOC, NotCI)->isNullValue())
+            return ReplaceInstUsesWith(ICI, ConstantInt::get(Type::Int1Ty, 
+                                                             isICMP_NE));
+        }
+        break;
+        
+      case Instruction::And:
+        if (ConstantInt *BOC = dyn_cast<ConstantInt>(BO->getOperand(1))) {
+          // If bits are being compared against that are and'd out, then the
+          // comparison can never succeed!
+          if ((RHSV & ~BOC->getValue()) != 0)
+            return ReplaceInstUsesWith(ICI, ConstantInt::get(Type::Int1Ty,
+                                                             isICMP_NE));
+          
+          // If we have ((X & C) == C), turn it into ((X & C) != 0).
+          if (RHS == BOC && RHSV.isPowerOf2())
+            return new ICmpInst(isICMP_NE ? ICmpInst::ICMP_EQ :
+                                ICmpInst::ICMP_NE, LHSI,
+                                Constant::getNullValue(RHS->getType()));
+          
+          // Replace (and X, (1 << size(X)-1) != 0) with x s< 0
+          if (BOC->getValue().isSignBit()) {
+            Value *X = BO->getOperand(0);
+            Constant *Zero = Constant::getNullValue(X->getType());
+            ICmpInst::Predicate pred = isICMP_NE ? 
+              ICmpInst::ICMP_SLT : ICmpInst::ICMP_SGE;
+            return new ICmpInst(pred, X, Zero);
+          }
+          
+          // ((X & ~7) == 0) --> X < 8
+          if (RHSV == 0 && isHighOnes(BOC)) {
+            Value *X = BO->getOperand(0);
+            Constant *NegX = ConstantExpr::getNeg(BOC);
+            ICmpInst::Predicate pred = isICMP_NE ? 
+              ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT;
+            return new ICmpInst(pred, X, NegX);
+          }
+        }
+      default: break;
+      }
+    } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(LHSI)) {
+      // Handle icmp {eq|ne} <intrinsic>, intcst.
+      if (II->getIntrinsicID() == Intrinsic::bswap) {
+        AddToWorkList(II);
+        ICI.setOperand(0, II->getOperand(1));
+        ICI.setOperand(1, ConstantInt::get(RHSV.byteSwap()));
+        return &ICI;
+      }
+    }
+  }
+  return 0;
+}
+
+/// visitICmpInstWithCastAndCast - Handle icmp (cast x to y), (cast/cst).
+/// We only handle extending casts so far.
+///
+Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) {
+  const CastInst *LHSCI = cast<CastInst>(ICI.getOperand(0));
+  Value *LHSCIOp        = LHSCI->getOperand(0);
+  const Type *SrcTy     = LHSCIOp->getType();
+  const Type *DestTy    = LHSCI->getType();
+  Value *RHSCIOp;
+
+  // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the 
+  // integer type is the same size as the pointer type.
+  if (LHSCI->getOpcode() == Instruction::PtrToInt &&
+      getTargetData().getPointerSizeInBits() == 
+         cast<IntegerType>(DestTy)->getBitWidth()) {
+    Value *RHSOp = 0;
+    if (Constant *RHSC = dyn_cast<Constant>(ICI.getOperand(1))) {
+      RHSOp = ConstantExpr::getIntToPtr(RHSC, SrcTy);
+    } else if (PtrToIntInst *RHSC = dyn_cast<PtrToIntInst>(ICI.getOperand(1))) {
+      RHSOp = RHSC->getOperand(0);
+      // If the pointer types don't match, insert a bitcast.
+      if (LHSCIOp->getType() != RHSOp->getType())
+        RHSOp = InsertBitCastBefore(RHSOp, LHSCIOp->getType(), ICI);
+    }
+
+    if (RHSOp)
+      return new ICmpInst(ICI.getPredicate(), LHSCIOp, RHSOp);
+  }
+  
+  // The code below only handles extension cast instructions, so far.
+  // Enforce this.
+  if (LHSCI->getOpcode() != Instruction::ZExt &&
+      LHSCI->getOpcode() != Instruction::SExt)
+    return 0;
+
+  bool isSignedExt = LHSCI->getOpcode() == Instruction::SExt;
+  bool isSignedCmp = ICI.isSignedPredicate();
+
+  if (CastInst *CI = dyn_cast<CastInst>(ICI.getOperand(1))) {
+    // Not an extension from the same type?
+    RHSCIOp = CI->getOperand(0);
+    if (RHSCIOp->getType() != LHSCIOp->getType()) 
+      return 0;
+    
+    // If the signedness of the two casts doesn't agree (i.e. one is a sext
+    // and the other is a zext), then we can't handle this.
+    if (CI->getOpcode() != LHSCI->getOpcode())
+      return 0;
+
+    // Deal with equality cases early.
+    if (ICI.isEquality())
+      return new ICmpInst(ICI.getPredicate(), LHSCIOp, RHSCIOp);
+
+    // A signed comparison of sign extended values simplifies into a
+    // signed comparison.
+    if (isSignedCmp && isSignedExt)
+      return new ICmpInst(ICI.getPredicate(), LHSCIOp, RHSCIOp);
+
+    // The other three cases all fold into an unsigned comparison.
+    return new ICmpInst(ICI.getUnsignedPredicate(), LHSCIOp, RHSCIOp);
+  }
+
+  // If we aren't dealing with a constant on the RHS, exit early
+  ConstantInt *CI = dyn_cast<ConstantInt>(ICI.getOperand(1));
+  if (!CI)
+    return 0;
+
+  // Compute the constant that would happen if we truncated to SrcTy then
+  // reextended to DestTy.
+  Constant *Res1 = ConstantExpr::getTrunc(CI, SrcTy);
+  Constant *Res2 = ConstantExpr::getCast(LHSCI->getOpcode(), Res1, DestTy);
+
+  // If the re-extended constant didn't change...
+  if (Res2 == CI) {
+    // Make sure that sign of the Cmp and the sign of the Cast are the same.
+    // For example, we might have:
+    //    %A = sext short %X to uint
+    //    %B = icmp ugt uint %A, 1330
+    // It is incorrect to transform this into 
+    //    %B = icmp ugt short %X, 1330 
+    // because %A may have negative value. 
+    //
+    // However, we allow this when the compare is EQ/NE, because they are
+    // signless.
+    if (isSignedExt == isSignedCmp || ICI.isEquality())
+      return new ICmpInst(ICI.getPredicate(), LHSCIOp, Res1);
+    return 0;
+  }
+
+  // The re-extended constant changed so the constant cannot be represented 
+  // in the shorter type. Consequently, we cannot emit a simple comparison.
+
+  // First, handle some easy cases. We know the result cannot be equal at this
+  // point so handle the ICI.isEquality() cases
+  if (ICI.getPredicate() == ICmpInst::ICMP_EQ)
+    return ReplaceInstUsesWith(ICI, ConstantInt::getFalse());
+  if (ICI.getPredicate() == ICmpInst::ICMP_NE)
+    return ReplaceInstUsesWith(ICI, ConstantInt::getTrue());
+
+  // Evaluate the comparison for LT (we invert for GT below). LE and GE cases
+  // should have been folded away previously and not enter in here.
+  Value *Result;
+  if (isSignedCmp) {
+    // We're performing a signed comparison.
+    if (cast<ConstantInt>(CI)->getValue().isNegative())
+      Result = ConstantInt::getFalse();          // X < (small) --> false
+    else
+      Result = ConstantInt::getTrue();           // X < (large) --> true
+  } else {
+    // We're performing an unsigned comparison.
+    if (isSignedExt) {
+      // We're performing an unsigned comp with a sign extended value.
+      // This is true if the input is >= 0. [aka >s -1]
+      Constant *NegOne = ConstantInt::getAllOnesValue(SrcTy);
+      Result = InsertNewInstBefore(new ICmpInst(ICmpInst::ICMP_SGT, LHSCIOp,
+                                   NegOne, ICI.getName()), ICI);
+    } else {
+      // Unsigned extend & unsigned compare -> always true.
+      Result = ConstantInt::getTrue();
+    }
+  }
+
+  // Finally, return the value computed.
+  if (ICI.getPredicate() == ICmpInst::ICMP_ULT ||
+      ICI.getPredicate() == ICmpInst::ICMP_SLT)
+    return ReplaceInstUsesWith(ICI, Result);
+
+  assert((ICI.getPredicate()==ICmpInst::ICMP_UGT || 
+          ICI.getPredicate()==ICmpInst::ICMP_SGT) &&
+         "ICmp should be folded!");
+  if (Constant *CI = dyn_cast<Constant>(Result))
+    return ReplaceInstUsesWith(ICI, ConstantExpr::getNot(CI));
+  return BinaryOperator::CreateNot(Result);
+}
+
+Instruction *InstCombiner::visitShl(BinaryOperator &I) {
+  return commonShiftTransforms(I);
+}
+
+Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
+  return commonShiftTransforms(I);
+}
+
+Instruction *InstCombiner::visitAShr(BinaryOperator &I) {
+  if (Instruction *R = commonShiftTransforms(I))
+    return R;
+  
+  Value *Op0 = I.getOperand(0);
+  
+  // ashr int -1, X = -1   (for any arithmetic shift rights of ~0)
+  if (ConstantInt *CSI = dyn_cast<ConstantInt>(Op0))
+    if (CSI->isAllOnesValue())
+      return ReplaceInstUsesWith(I, CSI);
+  
+  // See if we can turn a signed shr into an unsigned shr.
+  if (!isa<VectorType>(I.getType())) {
+    if (MaskedValueIsZero(Op0,
+                      APInt::getSignBit(I.getType()->getPrimitiveSizeInBits())))
+      return BinaryOperator::CreateLShr(Op0, I.getOperand(1));
+
+    // Arithmetic shifting an all-sign-bit value is a no-op.
+    unsigned NumSignBits = ComputeNumSignBits(Op0);
+    if (NumSignBits == Op0->getType()->getPrimitiveSizeInBits())
+      return ReplaceInstUsesWith(I, Op0);
+  }
+
+  return 0;
+}
+
+Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) {
+  assert(I.getOperand(1)->getType() == I.getOperand(0)->getType());
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  // shl X, 0 == X and shr X, 0 == X
+  // shl 0, X == 0 and shr 0, X == 0
+  if (Op1 == Constant::getNullValue(Op1->getType()) ||
+      Op0 == Constant::getNullValue(Op0->getType()))
+    return ReplaceInstUsesWith(I, Op0);
+  
+  if (isa<UndefValue>(Op0)) {            
+    if (I.getOpcode() == Instruction::AShr) // undef >>s X -> undef
+      return ReplaceInstUsesWith(I, Op0);
+    else                                    // undef << X -> 0, undef >>u X -> 0
+      return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+  }
+  if (isa<UndefValue>(Op1)) {
+    if (I.getOpcode() == Instruction::AShr)  // X >>s undef -> X
+      return ReplaceInstUsesWith(I, Op0);          
+    else                                     // X << undef, X >>u undef -> 0
+      return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+  }
+
+  // See if we can fold away this shift.
+  if (!isa<VectorType>(I.getType()) && SimplifyDemandedInstructionBits(I))
+    return &I;
+
+  // Try to fold constant and into select arguments.
+  if (isa<Constant>(Op0))
+    if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
+      if (Instruction *R = FoldOpIntoSelect(I, SI, this))
+        return R;
+
+  if (ConstantInt *CUI = dyn_cast<ConstantInt>(Op1))
+    if (Instruction *Res = FoldShiftByConstant(Op0, CUI, I))
+      return Res;
+  return 0;
+}
+
+Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
+                                               BinaryOperator &I) {
+  bool isLeftShift = I.getOpcode() == Instruction::Shl;
+
+  // See if we can simplify any instructions used by the instruction whose sole 
+  // purpose is to compute bits we don't care about.
+  uint32_t TypeBits = Op0->getType()->getPrimitiveSizeInBits();
+  
+  // shl uint X, 32 = 0 and shr ubyte Y, 9 = 0, ... just don't eliminate shr
+  // of a signed value.
+  //
+  if (Op1->uge(TypeBits)) {
+    if (I.getOpcode() != Instruction::AShr)
+      return ReplaceInstUsesWith(I, Constant::getNullValue(Op0->getType()));
+    else {
+      I.setOperand(1, ConstantInt::get(I.getType(), TypeBits-1));
+      return &I;
+    }
+  }
+  
+  // ((X*C1) << C2) == (X * (C1 << C2))
+  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op0))
+    if (BO->getOpcode() == Instruction::Mul && isLeftShift)
+      if (Constant *BOOp = dyn_cast<Constant>(BO->getOperand(1)))
+        return BinaryOperator::CreateMul(BO->getOperand(0),
+                                         ConstantExpr::getShl(BOOp, Op1));
+  
+  // Try to fold constant and into select arguments.
+  if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+    if (Instruction *R = FoldOpIntoSelect(I, SI, this))
+      return R;
+  if (isa<PHINode>(Op0))
+    if (Instruction *NV = FoldOpIntoPhi(I))
+      return NV;
+  
+  // Fold shift2(trunc(shift1(x,c1)), c2) -> trunc(shift2(shift1(x,c1),c2))
+  if (TruncInst *TI = dyn_cast<TruncInst>(Op0)) {
+    Instruction *TrOp = dyn_cast<Instruction>(TI->getOperand(0));
+    // If 'shift2' is an ashr, we would have to get the sign bit into a funny
+    // place.  Don't try to do this transformation in this case.  Also, we
+    // require that the input operand is a shift-by-constant so that we have
+    // confidence that the shifts will get folded together.  We could do this
+    // xform in more cases, but it is unlikely to be profitable.
+    if (TrOp && I.isLogicalShift() && TrOp->isShift() && 
+        isa<ConstantInt>(TrOp->getOperand(1))) {
+      // Okay, we'll do this xform.  Make the shift of shift.
+      Constant *ShAmt = ConstantExpr::getZExt(Op1, TrOp->getType());
+      Instruction *NSh = BinaryOperator::Create(I.getOpcode(), TrOp, ShAmt,
+                                                I.getName());
+      InsertNewInstBefore(NSh, I); // (shift2 (shift1 & 0x00FF), c2)
+
+      // For logical shifts, the truncation has the effect of making the high
+      // part of the register be zeros.  Emulate this by inserting an AND to
+      // clear the top bits as needed.  This 'and' will usually be zapped by
+      // other xforms later if dead.
+      unsigned SrcSize = TrOp->getType()->getPrimitiveSizeInBits();
+      unsigned DstSize = TI->getType()->getPrimitiveSizeInBits();
+      APInt MaskV(APInt::getLowBitsSet(SrcSize, DstSize));
+      
+      // The mask we constructed says what the trunc would do if occurring
+      // between the shifts.  We want to know the effect *after* the second
+      // shift.  We know that it is a logical shift by a constant, so adjust the
+      // mask as appropriate.
+      if (I.getOpcode() == Instruction::Shl)
+        MaskV <<= Op1->getZExtValue();
+      else {
+        assert(I.getOpcode() == Instruction::LShr && "Unknown logical shift");
+        MaskV = MaskV.lshr(Op1->getZExtValue());
+      }
+
+      Instruction *And = BinaryOperator::CreateAnd(NSh, ConstantInt::get(MaskV),
+                                                   TI->getName());
+      InsertNewInstBefore(And, I); // shift1 & 0x00FF
+
+      // Return the value truncated to the interesting size.
+      return new TruncInst(And, I.getType());
+    }
+  }
+  
+  if (Op0->hasOneUse()) {
+    if (BinaryOperator *Op0BO = dyn_cast<BinaryOperator>(Op0)) {
+      // Turn ((X >> C) + Y) << C  ->  (X + (Y << C)) & (~0 << C)
+      Value *V1, *V2;
+      ConstantInt *CC;
+      switch (Op0BO->getOpcode()) {
+        default: break;
+        case Instruction::Add:
+        case Instruction::And:
+        case Instruction::Or:
+        case Instruction::Xor: {
+          // These operators commute.
+          // Turn (Y + (X >> C)) << C  ->  (X + (Y << C)) & (~0 << C)
+          if (isLeftShift && Op0BO->getOperand(1)->hasOneUse() &&
+              match(Op0BO->getOperand(1), m_Shr(m_Value(V1), m_Specific(Op1)))){
+            Instruction *YS = BinaryOperator::CreateShl(
+                                            Op0BO->getOperand(0), Op1,
+                                            Op0BO->getName());
+            InsertNewInstBefore(YS, I); // (Y << C)
+            Instruction *X = 
+              BinaryOperator::Create(Op0BO->getOpcode(), YS, V1,
+                                     Op0BO->getOperand(1)->getName());
+            InsertNewInstBefore(X, I);  // (X + (Y << C))
+            uint32_t Op1Val = Op1->getLimitedValue(TypeBits);
+            return BinaryOperator::CreateAnd(X, ConstantInt::get(
+                       APInt::getHighBitsSet(TypeBits, TypeBits-Op1Val)));
+          }
+          
+          // Turn (Y + ((X >> C) & CC)) << C  ->  ((X & (CC << C)) + (Y << C))
+          Value *Op0BOOp1 = Op0BO->getOperand(1);
+          if (isLeftShift && Op0BOOp1->hasOneUse() &&
+              match(Op0BOOp1, 
+                    m_And(m_Shr(m_Value(V1), m_Specific(Op1)),
+                          m_ConstantInt(CC))) &&
+              cast<BinaryOperator>(Op0BOOp1)->getOperand(0)->hasOneUse()) {
+            Instruction *YS = BinaryOperator::CreateShl(
+                                                     Op0BO->getOperand(0), Op1,
+                                                     Op0BO->getName());
+            InsertNewInstBefore(YS, I); // (Y << C)
+            Instruction *XM =
+              BinaryOperator::CreateAnd(V1, ConstantExpr::getShl(CC, Op1),
+                                        V1->getName()+".mask");
+            InsertNewInstBefore(XM, I); // X & (CC << C)
+            
+            return BinaryOperator::Create(Op0BO->getOpcode(), YS, XM);
+          }
+        }
+          
+        // FALL THROUGH.
+        case Instruction::Sub: {
+          // Turn ((X >> C) + Y) << C  ->  (X + (Y << C)) & (~0 << C)
+          if (isLeftShift && Op0BO->getOperand(0)->hasOneUse() &&
+              match(Op0BO->getOperand(0), m_Shr(m_Value(V1), m_Specific(Op1)))){
+            Instruction *YS = BinaryOperator::CreateShl(
+                                                     Op0BO->getOperand(1), Op1,
+                                                     Op0BO->getName());
+            InsertNewInstBefore(YS, I); // (Y << C)
+            Instruction *X =
+              BinaryOperator::Create(Op0BO->getOpcode(), V1, YS,
+                                     Op0BO->getOperand(0)->getName());
+            InsertNewInstBefore(X, I);  // (X + (Y << C))
+            uint32_t Op1Val = Op1->getLimitedValue(TypeBits);
+            return BinaryOperator::CreateAnd(X, ConstantInt::get(
+                       APInt::getHighBitsSet(TypeBits, TypeBits-Op1Val)));
+          }
+          
+          // Turn (((X >> C)&CC) + Y) << C  ->  (X + (Y << C)) & (CC << C)
+          if (isLeftShift && Op0BO->getOperand(0)->hasOneUse() &&
+              match(Op0BO->getOperand(0),
+                    m_And(m_Shr(m_Value(V1), m_Value(V2)),
+                          m_ConstantInt(CC))) && V2 == Op1 &&
+              cast<BinaryOperator>(Op0BO->getOperand(0))
+                  ->getOperand(0)->hasOneUse()) {
+            Instruction *YS = BinaryOperator::CreateShl(
+                                                     Op0BO->getOperand(1), Op1,
+                                                     Op0BO->getName());
+            InsertNewInstBefore(YS, I); // (Y << C)
+            Instruction *XM =
+              BinaryOperator::CreateAnd(V1, ConstantExpr::getShl(CC, Op1),
+                                        V1->getName()+".mask");
+            InsertNewInstBefore(XM, I); // X & (CC << C)
+            
+            return BinaryOperator::Create(Op0BO->getOpcode(), XM, YS);
+          }
+          
+          break;
+        }
+      }
+      
+      
+      // If the operand is an bitwise operator with a constant RHS, and the
+      // shift is the only use, we can pull it out of the shift.
+      if (ConstantInt *Op0C = dyn_cast<ConstantInt>(Op0BO->getOperand(1))) {
+        bool isValid = true;     // Valid only for And, Or, Xor
+        bool highBitSet = false; // Transform if high bit of constant set?
+        
+        switch (Op0BO->getOpcode()) {
+          default: isValid = false; break;   // Do not perform transform!
+          case Instruction::Add:
+            isValid = isLeftShift;
+            break;
+          case Instruction::Or:
+          case Instruction::Xor:
+            highBitSet = false;
+            break;
+          case Instruction::And:
+            highBitSet = true;
+            break;
+        }
+        
+        // If this is a signed shift right, and the high bit is modified
+        // by the logical operation, do not perform the transformation.
+        // The highBitSet boolean indicates the value of the high bit of
+        // the constant which would cause it to be modified for this
+        // operation.
+        //
+        if (isValid && I.getOpcode() == Instruction::AShr)
+          isValid = Op0C->getValue()[TypeBits-1] == highBitSet;
+        
+        if (isValid) {
+          Constant *NewRHS = ConstantExpr::get(I.getOpcode(), Op0C, Op1);
+          
+          Instruction *NewShift =
+            BinaryOperator::Create(I.getOpcode(), Op0BO->getOperand(0), Op1);
+          InsertNewInstBefore(NewShift, I);
+          NewShift->takeName(Op0BO);
+          
+          return BinaryOperator::Create(Op0BO->getOpcode(), NewShift,
+                                        NewRHS);
+        }
+      }
+    }
+  }
+  
+  // Find out if this is a shift of a shift by a constant.
+  BinaryOperator *ShiftOp = dyn_cast<BinaryOperator>(Op0);
+  if (ShiftOp && !ShiftOp->isShift())
+    ShiftOp = 0;
+  
+  if (ShiftOp && isa<ConstantInt>(ShiftOp->getOperand(1))) {
+    ConstantInt *ShiftAmt1C = cast<ConstantInt>(ShiftOp->getOperand(1));
+    uint32_t ShiftAmt1 = ShiftAmt1C->getLimitedValue(TypeBits);
+    uint32_t ShiftAmt2 = Op1->getLimitedValue(TypeBits);
+    assert(ShiftAmt2 != 0 && "Should have been simplified earlier");
+    if (ShiftAmt1 == 0) return 0;  // Will be simplified in the future.
+    Value *X = ShiftOp->getOperand(0);
+    
+    uint32_t AmtSum = ShiftAmt1+ShiftAmt2;   // Fold into one big shift.
+    
+    const IntegerType *Ty = cast<IntegerType>(I.getType());
+    
+    // Check for (X << c1) << c2  and  (X >> c1) >> c2
+    if (I.getOpcode() == ShiftOp->getOpcode()) {
+      // If this is oversized composite shift, then unsigned shifts get 0, ashr
+      // saturates.
+      if (AmtSum >= TypeBits) {
+        if (I.getOpcode() != Instruction::AShr)
+          return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+        AmtSum = TypeBits-1;  // Saturate to 31 for i32 ashr.
+      }
+      
+      return BinaryOperator::Create(I.getOpcode(), X,
+                                    ConstantInt::get(Ty, AmtSum));
+    } else if (ShiftOp->getOpcode() == Instruction::LShr &&
+               I.getOpcode() == Instruction::AShr) {
+      if (AmtSum >= TypeBits)
+        return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+      
+      // ((X >>u C1) >>s C2) -> (X >>u (C1+C2))  since C1 != 0.
+      return BinaryOperator::CreateLShr(X, ConstantInt::get(Ty, AmtSum));
+    } else if (ShiftOp->getOpcode() == Instruction::AShr &&
+               I.getOpcode() == Instruction::LShr) {
+      // ((X >>s C1) >>u C2) -> ((X >>s (C1+C2)) & mask) since C1 != 0.
+      if (AmtSum >= TypeBits)
+        AmtSum = TypeBits-1;
+      
+      Instruction *Shift =
+        BinaryOperator::CreateAShr(X, ConstantInt::get(Ty, AmtSum));
+      InsertNewInstBefore(Shift, I);
+
+      APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2));
+      return BinaryOperator::CreateAnd(Shift, ConstantInt::get(Mask));
+    }
+    
+    // Okay, if we get here, one shift must be left, and the other shift must be
+    // right.  See if the amounts are equal.
+    if (ShiftAmt1 == ShiftAmt2) {
+      // If we have ((X >>? C) << C), turn this into X & (-1 << C).
+      if (I.getOpcode() == Instruction::Shl) {
+        APInt Mask(APInt::getHighBitsSet(TypeBits, TypeBits - ShiftAmt1));
+        return BinaryOperator::CreateAnd(X, ConstantInt::get(Mask));
+      }
+      // If we have ((X << C) >>u C), turn this into X & (-1 >>u C).
+      if (I.getOpcode() == Instruction::LShr) {
+        APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt1));
+        return BinaryOperator::CreateAnd(X, ConstantInt::get(Mask));
+      }
+      // We can simplify ((X << C) >>s C) into a trunc + sext.
+      // NOTE: we could do this for any C, but that would make 'unusual' integer
+      // types.  For now, just stick to ones well-supported by the code
+      // generators.
+      const Type *SExtType = 0;
+      switch (Ty->getBitWidth() - ShiftAmt1) {
+      case 1  :
+      case 8  :
+      case 16 :
+      case 32 :
+      case 64 :
+      case 128:
+        SExtType = IntegerType::get(Ty->getBitWidth() - ShiftAmt1);
+        break;
+      default: break;
+      }
+      if (SExtType) {
+        Instruction *NewTrunc = new TruncInst(X, SExtType, "sext");
+        InsertNewInstBefore(NewTrunc, I);
+        return new SExtInst(NewTrunc, Ty);
+      }
+      // Otherwise, we can't handle it yet.
+    } else if (ShiftAmt1 < ShiftAmt2) {
+      uint32_t ShiftDiff = ShiftAmt2-ShiftAmt1;
+      
+      // (X >>? C1) << C2 --> X << (C2-C1) & (-1 << C2)
+      if (I.getOpcode() == Instruction::Shl) {
+        assert(ShiftOp->getOpcode() == Instruction::LShr ||
+               ShiftOp->getOpcode() == Instruction::AShr);
+        Instruction *Shift =
+          BinaryOperator::CreateShl(X, ConstantInt::get(Ty, ShiftDiff));
+        InsertNewInstBefore(Shift, I);
+        
+        APInt Mask(APInt::getHighBitsSet(TypeBits, TypeBits - ShiftAmt2));
+        return BinaryOperator::CreateAnd(Shift, ConstantInt::get(Mask));
+      }
+      
+      // (X << C1) >>u C2  --> X >>u (C2-C1) & (-1 >> C2)
+      if (I.getOpcode() == Instruction::LShr) {
+        assert(ShiftOp->getOpcode() == Instruction::Shl);
+        Instruction *Shift =
+          BinaryOperator::CreateLShr(X, ConstantInt::get(Ty, ShiftDiff));
+        InsertNewInstBefore(Shift, I);
+        
+        APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2));
+        return BinaryOperator::CreateAnd(Shift, ConstantInt::get(Mask));
+      }
+      
+      // We can't handle (X << C1) >>s C2, it shifts arbitrary bits in.
+    } else {
+      assert(ShiftAmt2 < ShiftAmt1);
+      uint32_t ShiftDiff = ShiftAmt1-ShiftAmt2;
+
+      // (X >>? C1) << C2 --> X >>? (C1-C2) & (-1 << C2)
+      if (I.getOpcode() == Instruction::Shl) {
+        assert(ShiftOp->getOpcode() == Instruction::LShr ||
+               ShiftOp->getOpcode() == Instruction::AShr);
+        Instruction *Shift =
+          BinaryOperator::Create(ShiftOp->getOpcode(), X,
+                                 ConstantInt::get(Ty, ShiftDiff));
+        InsertNewInstBefore(Shift, I);
+        
+        APInt Mask(APInt::getHighBitsSet(TypeBits, TypeBits - ShiftAmt2));
+        return BinaryOperator::CreateAnd(Shift, ConstantInt::get(Mask));
+      }
+      
+      // (X << C1) >>u C2  --> X << (C1-C2) & (-1 >> C2)
+      if (I.getOpcode() == Instruction::LShr) {
+        assert(ShiftOp->getOpcode() == Instruction::Shl);
+        Instruction *Shift =
+          BinaryOperator::CreateShl(X, ConstantInt::get(Ty, ShiftDiff));
+        InsertNewInstBefore(Shift, I);
+        
+        APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2));
+        return BinaryOperator::CreateAnd(Shift, ConstantInt::get(Mask));
+      }
+      
+      // We can't handle (X << C1) >>a C2, it shifts arbitrary bits in.
+    }
+  }
+  return 0;
+}
+
+
+/// DecomposeSimpleLinearExpr - Analyze 'Val', seeing if it is a simple linear
+/// expression.  If so, decompose it, returning some value X, such that Val is
+/// X*Scale+Offset.
+///
+static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale,
+                                        int &Offset) {
+  assert(Val->getType() == Type::Int32Ty && "Unexpected allocation size type!");
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
+    Offset = CI->getZExtValue();
+    Scale  = 0;
+    return ConstantInt::get(Type::Int32Ty, 0);
+  } else if (BinaryOperator *I = dyn_cast<BinaryOperator>(Val)) {
+    if (ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      if (I->getOpcode() == Instruction::Shl) {
+        // This is a value scaled by '1 << the shift amt'.
+        Scale = 1U << RHS->getZExtValue();
+        Offset = 0;
+        return I->getOperand(0);
+      } else if (I->getOpcode() == Instruction::Mul) {
+        // This value is scaled by 'RHS'.
+        Scale = RHS->getZExtValue();
+        Offset = 0;
+        return I->getOperand(0);
+      } else if (I->getOpcode() == Instruction::Add) {
+        // We have X+C.  Check to see if we really have (X*C2)+C1, 
+        // where C1 is divisible by C2.
+        unsigned SubScale;
+        Value *SubVal = 
+          DecomposeSimpleLinearExpr(I->getOperand(0), SubScale, Offset);
+        Offset += RHS->getZExtValue();
+        Scale = SubScale;
+        return SubVal;
+      }
+    }
+  }
+
+  // Otherwise, we can't look past this.
+  Scale = 1;
+  Offset = 0;
+  return Val;
+}
+
+
+/// PromoteCastOfAllocation - If we find a cast of an allocation instruction,
+/// try to eliminate the cast by moving the type information into the alloc.
+Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
+                                                   AllocationInst &AI) {
+  const PointerType *PTy = cast<PointerType>(CI.getType());
+  
+  // Remove any uses of AI that are dead.
+  assert(!CI.use_empty() && "Dead instructions should be removed earlier!");
+  
+  for (Value::use_iterator UI = AI.use_begin(), E = AI.use_end(); UI != E; ) {
+    Instruction *User = cast<Instruction>(*UI++);
+    if (isInstructionTriviallyDead(User)) {
+      while (UI != E && *UI == User)
+        ++UI; // If this instruction uses AI more than once, don't break UI.
+      
+      ++NumDeadInst;
+      DOUT << "IC: DCE: " << *User;
+      EraseInstFromFunction(*User);
+    }
+  }
+  
+  // Get the type really allocated and the type casted to.
+  const Type *AllocElTy = AI.getAllocatedType();
+  const Type *CastElTy = PTy->getElementType();
+  if (!AllocElTy->isSized() || !CastElTy->isSized()) return 0;
+
+  unsigned AllocElTyAlign = TD->getABITypeAlignment(AllocElTy);
+  unsigned CastElTyAlign = TD->getABITypeAlignment(CastElTy);
+  if (CastElTyAlign < AllocElTyAlign) return 0;
+
+  // If the allocation has multiple uses, only promote it if we are strictly
+  // increasing the alignment of the resultant allocation.  If we keep it the
+  // same, we open the door to infinite loops of various kinds.  (A reference
+  // from a dbg.declare doesn't count as a use for this purpose.)
+  if (!AI.hasOneUse() && !hasOneUsePlusDeclare(&AI) &&
+      CastElTyAlign == AllocElTyAlign) return 0;
+
+  uint64_t AllocElTySize = TD->getTypeAllocSize(AllocElTy);
+  uint64_t CastElTySize = TD->getTypeAllocSize(CastElTy);
+  if (CastElTySize == 0 || AllocElTySize == 0) return 0;
+
+  // See if we can satisfy the modulus by pulling a scale out of the array
+  // size argument.
+  unsigned ArraySizeScale;
+  int ArrayOffset;
+  Value *NumElements = // See if the array size is a decomposable linear expr.
+    DecomposeSimpleLinearExpr(AI.getOperand(0), ArraySizeScale, ArrayOffset);
+ 
+  // If we can now satisfy the modulus, by using a non-1 scale, we really can
+  // do the xform.
+  if ((AllocElTySize*ArraySizeScale) % CastElTySize != 0 ||
+      (AllocElTySize*ArrayOffset   ) % CastElTySize != 0) return 0;
+
+  unsigned Scale = (AllocElTySize*ArraySizeScale)/CastElTySize;
+  Value *Amt = 0;
+  if (Scale == 1) {
+    Amt = NumElements;
+  } else {
+    // If the allocation size is constant, form a constant mul expression
+    Amt = ConstantInt::get(Type::Int32Ty, Scale);
+    if (isa<ConstantInt>(NumElements))
+      Amt = Multiply(cast<ConstantInt>(NumElements), cast<ConstantInt>(Amt));
+    // otherwise multiply the amount and the number of elements
+    else {
+      Instruction *Tmp = BinaryOperator::CreateMul(Amt, NumElements, "tmp");
+      Amt = InsertNewInstBefore(Tmp, AI);
+    }
+  }
+  
+  if (int Offset = (AllocElTySize*ArrayOffset)/CastElTySize) {
+    Value *Off = ConstantInt::get(Type::Int32Ty, Offset, true);
+    Instruction *Tmp = BinaryOperator::CreateAdd(Amt, Off, "tmp");
+    Amt = InsertNewInstBefore(Tmp, AI);
+  }
+  
+  AllocationInst *New;
+  if (isa<MallocInst>(AI))
+    New = new MallocInst(CastElTy, Amt, AI.getAlignment());
+  else
+    New = new AllocaInst(CastElTy, Amt, AI.getAlignment());
+  InsertNewInstBefore(New, AI);
+  New->takeName(&AI);
+  
+  // If the allocation has one real use plus a dbg.declare, just remove the
+  // declare.
+  if (DbgDeclareInst *DI = hasOneUsePlusDeclare(&AI)) {
+    EraseInstFromFunction(*DI);
+  }
+  // If the allocation has multiple real uses, insert a cast and change all
+  // things that used it to use the new cast.  This will also hack on CI, but it
+  // will die soon.
+  else if (!AI.hasOneUse()) {
+    AddUsesToWorkList(AI);
+    // New is the allocation instruction, pointer typed. AI is the original
+    // allocation instruction, also pointer typed. Thus, cast to use is BitCast.
+    CastInst *NewCast = new BitCastInst(New, AI.getType(), "tmpcast");
+    InsertNewInstBefore(NewCast, AI);
+    AI.replaceAllUsesWith(NewCast);
+  }
+  return ReplaceInstUsesWith(CI, New);
+}
+
+/// CanEvaluateInDifferentType - Return true if we can take the specified value
+/// and return it as type Ty without inserting any new casts and without
+/// changing the computed value.  This is used by code that tries to decide
+/// whether promoting or shrinking integer operations to wider or smaller types
+/// will allow us to eliminate a truncate or extend.
+///
+/// This is a truncation operation if Ty is smaller than V->getType(), or an
+/// extension operation if Ty is larger.
+///
+/// If CastOpc is a truncation, then Ty will be a type smaller than V.  We
+/// should return true if trunc(V) can be computed by computing V in the smaller
+/// type.  If V is an instruction, then trunc(inst(x,y)) can be computed as
+/// inst(trunc(x),trunc(y)), which only makes sense if x and y can be
+/// efficiently truncated.
+///
+/// If CastOpc is a sext or zext, we are asking if the low bits of the value can
+/// bit computed in a larger type, which is then and'd or sext_in_reg'd to get
+/// the final result.
+bool InstCombiner::CanEvaluateInDifferentType(Value *V, const IntegerType *Ty,
+                                              unsigned CastOpc,
+                                              int &NumCastsRemoved){
+  // We can always evaluate constants in another type.
+  if (isa<ConstantInt>(V))
+    return true;
+  
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) return false;
+  
+  const IntegerType *OrigTy = cast<IntegerType>(V->getType());
+  
+  // If this is an extension or truncate, we can often eliminate it.
+  if (isa<TruncInst>(I) || isa<ZExtInst>(I) || isa<SExtInst>(I)) {
+    // If this is a cast from the destination type, we can trivially eliminate
+    // it, and this will remove a cast overall.
+    if (I->getOperand(0)->getType() == Ty) {
+      // If the first operand is itself a cast, and is eliminable, do not count
+      // this as an eliminable cast.  We would prefer to eliminate those two
+      // casts first.
+      if (!isa<CastInst>(I->getOperand(0)) && I->hasOneUse())
+        ++NumCastsRemoved;
+      return true;
+    }
+  }
+
+  // We can't extend or shrink something that has multiple uses: doing so would
+  // require duplicating the instruction in general, which isn't profitable.
+  if (!I->hasOneUse()) return false;
+
+  unsigned Opc = I->getOpcode();
+  switch (Opc) {
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    // These operators can all arbitrarily be extended or truncated.
+    return CanEvaluateInDifferentType(I->getOperand(0), Ty, CastOpc,
+                                      NumCastsRemoved) &&
+           CanEvaluateInDifferentType(I->getOperand(1), Ty, CastOpc,
+                                      NumCastsRemoved);
+
+  case Instruction::Shl:
+    // If we are truncating the result of this SHL, and if it's a shift of a
+    // constant amount, we can always perform a SHL in a smaller type.
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      uint32_t BitWidth = Ty->getBitWidth();
+      if (BitWidth < OrigTy->getBitWidth() && 
+          CI->getLimitedValue(BitWidth) < BitWidth)
+        return CanEvaluateInDifferentType(I->getOperand(0), Ty, CastOpc,
+                                          NumCastsRemoved);
+    }
+    break;
+  case Instruction::LShr:
+    // If this is a truncate of a logical shr, we can truncate it to a smaller
+    // lshr iff we know that the bits we would otherwise be shifting in are
+    // already zeros.
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      uint32_t OrigBitWidth = OrigTy->getBitWidth();
+      uint32_t BitWidth = Ty->getBitWidth();
+      if (BitWidth < OrigBitWidth &&
+          MaskedValueIsZero(I->getOperand(0),
+            APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth)) &&
+          CI->getLimitedValue(BitWidth) < BitWidth) {
+        return CanEvaluateInDifferentType(I->getOperand(0), Ty, CastOpc,
+                                          NumCastsRemoved);
+      }
+    }
+    break;
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::Trunc:
+    // If this is the same kind of case as our original (e.g. zext+zext), we
+    // can safely replace it.  Note that replacing it does not reduce the number
+    // of casts in the input.
+    if (Opc == CastOpc)
+      return true;
+
+    // sext (zext ty1), ty2 -> zext ty2
+    if (CastOpc == Instruction::SExt && Opc == Instruction::ZExt)
+      return true;
+    break;
+  case Instruction::Select: {
+    SelectInst *SI = cast<SelectInst>(I);
+    return CanEvaluateInDifferentType(SI->getTrueValue(), Ty, CastOpc,
+                                      NumCastsRemoved) &&
+           CanEvaluateInDifferentType(SI->getFalseValue(), Ty, CastOpc,
+                                      NumCastsRemoved);
+  }
+  case Instruction::PHI: {
+    // We can change a phi if we can change all operands.
+    PHINode *PN = cast<PHINode>(I);
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+      if (!CanEvaluateInDifferentType(PN->getIncomingValue(i), Ty, CastOpc,
+                                      NumCastsRemoved))
+        return false;
+    return true;
+  }
+  default:
+    // TODO: Can handle more cases here.
+    break;
+  }
+  
+  return false;
+}
+
+/// EvaluateInDifferentType - Given an expression that 
+/// CanEvaluateInDifferentType returns true for, actually insert the code to
+/// evaluate the expression.
+Value *InstCombiner::EvaluateInDifferentType(Value *V, const Type *Ty, 
+                                             bool isSigned) {
+  if (Constant *C = dyn_cast<Constant>(V))
+    return ConstantExpr::getIntegerCast(C, Ty, isSigned /*Sext or ZExt*/);
+
+  // Otherwise, it must be an instruction.
+  Instruction *I = cast<Instruction>(V);
+  Instruction *Res = 0;
+  unsigned Opc = I->getOpcode();
+  switch (Opc) {
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::AShr:
+  case Instruction::LShr:
+  case Instruction::Shl: {
+    Value *LHS = EvaluateInDifferentType(I->getOperand(0), Ty, isSigned);
+    Value *RHS = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned);
+    Res = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS);
+    break;
+  }    
+  case Instruction::Trunc:
+  case Instruction::ZExt:
+  case Instruction::SExt:
+    // If the source type of the cast is the type we're trying for then we can
+    // just return the source.  There's no need to insert it because it is not
+    // new.
+    if (I->getOperand(0)->getType() == Ty)
+      return I->getOperand(0);
+    
+    // Otherwise, must be the same type of cast, so just reinsert a new one.
+    Res = CastInst::Create(cast<CastInst>(I)->getOpcode(), I->getOperand(0),
+                           Ty);
+    break;
+  case Instruction::Select: {
+    Value *True = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned);
+    Value *False = EvaluateInDifferentType(I->getOperand(2), Ty, isSigned);
+    Res = SelectInst::Create(I->getOperand(0), True, False);
+    break;
+  }
+  case Instruction::PHI: {
+    PHINode *OPN = cast<PHINode>(I);
+    PHINode *NPN = PHINode::Create(Ty);
+    for (unsigned i = 0, e = OPN->getNumIncomingValues(); i != e; ++i) {
+      Value *V =EvaluateInDifferentType(OPN->getIncomingValue(i), Ty, isSigned);
+      NPN->addIncoming(V, OPN->getIncomingBlock(i));
+    }
+    Res = NPN;
+    break;
+  }
+  default: 
+    // TODO: Can handle more cases here.
+    assert(0 && "Unreachable!");
+    break;
+  }
+  
+  Res->takeName(I);
+  return InsertNewInstBefore(Res, *I);
+}
+
+/// @brief Implement the transforms common to all CastInst visitors.
+Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {
+  Value *Src = CI.getOperand(0);
+
+  // Many cases of "cast of a cast" are eliminable. If it's eliminable we just
+  // eliminate it now.
+  if (CastInst *CSrc = dyn_cast<CastInst>(Src)) {   // A->B->C cast
+    if (Instruction::CastOps opc = 
+        isEliminableCastPair(CSrc, CI.getOpcode(), CI.getType(), TD)) {
+      // The first cast (CSrc) is eliminable so we need to fix up or replace
+      // the second cast (CI). CSrc will then have a good chance of being dead.
+      return CastInst::Create(opc, CSrc->getOperand(0), CI.getType());
+    }
+  }
+
+  // If we are casting a select then fold the cast into the select
+  if (SelectInst *SI = dyn_cast<SelectInst>(Src))
+    if (Instruction *NV = FoldOpIntoSelect(CI, SI, this))
+      return NV;
+
+  // If we are casting a PHI then fold the cast into the PHI
+  if (isa<PHINode>(Src))
+    if (Instruction *NV = FoldOpIntoPhi(CI))
+      return NV;
+  
+  return 0;
+}
+
+/// FindElementAtOffset - Given a type and a constant offset, determine whether
+/// or not there is a sequence of GEP indices into the type that will land us at
+/// the specified offset.  If so, fill them into NewIndices and return the
+/// resultant element type, otherwise return null.
+static const Type *FindElementAtOffset(const Type *Ty, int64_t Offset, 
+                                       SmallVectorImpl<Value*> &NewIndices,
+                                       const TargetData *TD) {
+  if (!Ty->isSized()) return 0;
+  
+  // Start with the index over the outer type.  Note that the type size
+  // might be zero (even if the offset isn't zero) if the indexed type
+  // is something like [0 x {int, int}]
+  const Type *IntPtrTy = TD->getIntPtrType();
+  int64_t FirstIdx = 0;
+  if (int64_t TySize = TD->getTypeAllocSize(Ty)) {
+    FirstIdx = Offset/TySize;
+    Offset -= FirstIdx*TySize;
+    
+    // Handle hosts where % returns negative instead of values [0..TySize).
+    if (Offset < 0) {
+      --FirstIdx;
+      Offset += TySize;
+      assert(Offset >= 0);
+    }
+    assert((uint64_t)Offset < (uint64_t)TySize && "Out of range offset");
+  }
+  
+  NewIndices.push_back(ConstantInt::get(IntPtrTy, FirstIdx));
+    
+  // Index into the types.  If we fail, set OrigBase to null.
+  while (Offset) {
+    // Indexing into tail padding between struct/array elements.
+    if (uint64_t(Offset*8) >= TD->getTypeSizeInBits(Ty))
+      return 0;
+    
+    if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+      const StructLayout *SL = TD->getStructLayout(STy);
+      assert(Offset < (int64_t)SL->getSizeInBytes() &&
+             "Offset must stay within the indexed type");
+      
+      unsigned Elt = SL->getElementContainingOffset(Offset);
+      NewIndices.push_back(ConstantInt::get(Type::Int32Ty, Elt));
+      
+      Offset -= SL->getElementOffset(Elt);
+      Ty = STy->getElementType(Elt);
+    } else if (const ArrayType *AT = dyn_cast<ArrayType>(Ty)) {
+      uint64_t EltSize = TD->getTypeAllocSize(AT->getElementType());
+      assert(EltSize && "Cannot index into a zero-sized array");
+      NewIndices.push_back(ConstantInt::get(IntPtrTy,Offset/EltSize));
+      Offset %= EltSize;
+      Ty = AT->getElementType();
+    } else {
+      // Otherwise, we can't index into the middle of this atomic type, bail.
+      return 0;
+    }
+  }
+  
+  return Ty;
+}
+
+/// @brief Implement the transforms for cast of pointer (bitcast/ptrtoint)
+Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) {
+  Value *Src = CI.getOperand(0);
+  
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Src)) {
+    // If casting the result of a getelementptr instruction with no offset, turn
+    // this into a cast of the original pointer!
+    if (GEP->hasAllZeroIndices()) {
+      // Changing the cast operand is usually not a good idea but it is safe
+      // here because the pointer operand is being replaced with another 
+      // pointer operand so the opcode doesn't need to change.
+      AddToWorkList(GEP);
+      CI.setOperand(0, GEP->getOperand(0));
+      return &CI;
+    }
+    
+    // If the GEP has a single use, and the base pointer is a bitcast, and the
+    // GEP computes a constant offset, see if we can convert these three
+    // instructions into fewer.  This typically happens with unions and other
+    // non-type-safe code.
+    if (GEP->hasOneUse() && isa<BitCastInst>(GEP->getOperand(0))) {
+      if (GEP->hasAllConstantIndices()) {
+        // We are guaranteed to get a constant from EmitGEPOffset.
+        ConstantInt *OffsetV = cast<ConstantInt>(EmitGEPOffset(GEP, CI, *this));
+        int64_t Offset = OffsetV->getSExtValue();
+        
+        // Get the base pointer input of the bitcast, and the type it points to.
+        Value *OrigBase = cast<BitCastInst>(GEP->getOperand(0))->getOperand(0);
+        const Type *GEPIdxTy =
+          cast<PointerType>(OrigBase->getType())->getElementType();
+        SmallVector<Value*, 8> NewIndices;
+        if (FindElementAtOffset(GEPIdxTy, Offset, NewIndices, TD)) {
+          // If we were able to index down into an element, create the GEP
+          // and bitcast the result.  This eliminates one bitcast, potentially
+          // two.
+          Instruction *NGEP = GetElementPtrInst::Create(OrigBase, 
+                                                        NewIndices.begin(),
+                                                        NewIndices.end(), "");
+          InsertNewInstBefore(NGEP, CI);
+          NGEP->takeName(GEP);
+          
+          if (isa<BitCastInst>(CI))
+            return new BitCastInst(NGEP, CI.getType());
+          assert(isa<PtrToIntInst>(CI));
+          return new PtrToIntInst(NGEP, CI.getType());
+        }
+      }      
+    }
+  }
+    
+  return commonCastTransforms(CI);
+}
+
+/// isSafeIntegerType - Return true if this is a basic integer type, not a crazy
+/// type like i42.  We don't want to introduce operations on random non-legal
+/// integer types where they don't already exist in the code.  In the future,
+/// we should consider making this based off target-data, so that 32-bit targets
+/// won't get i64 operations etc.
+static bool isSafeIntegerType(const Type *Ty) {
+  switch (Ty->getPrimitiveSizeInBits()) {
+  case 8:
+  case 16:
+  case 32:
+  case 64:
+    return true;
+  default: 
+    return false;
+  }
+}
+
+/// Only the TRUNC, ZEXT, SEXT, and BITCAST can both operand and result as
+/// integer types. This function implements the common transforms for all those
+/// cases.
+/// @brief Implement the transforms common to CastInst with integer operands
+Instruction *InstCombiner::commonIntCastTransforms(CastInst &CI) {
+  if (Instruction *Result = commonCastTransforms(CI))
+    return Result;
+
+  Value *Src = CI.getOperand(0);
+  const Type *SrcTy = Src->getType();
+  const Type *DestTy = CI.getType();
+  uint32_t SrcBitSize = SrcTy->getPrimitiveSizeInBits();
+  uint32_t DestBitSize = DestTy->getPrimitiveSizeInBits();
+
+  // See if we can simplify any instructions used by the LHS whose sole 
+  // purpose is to compute bits we don't care about.
+  if (SimplifyDemandedInstructionBits(CI))
+    return &CI;
+
+  // If the source isn't an instruction or has more than one use then we
+  // can't do anything more. 
+  Instruction *SrcI = dyn_cast<Instruction>(Src);
+  if (!SrcI || !Src->hasOneUse())
+    return 0;
+
+  // Attempt to propagate the cast into the instruction for int->int casts.
+  int NumCastsRemoved = 0;
+  if (!isa<BitCastInst>(CI) &&
+      // Only do this if the dest type is a simple type, don't convert the
+      // expression tree to something weird like i93 unless the source is also
+      // strange.
+      (isSafeIntegerType(DestTy) || !isSafeIntegerType(SrcI->getType())) &&
+      CanEvaluateInDifferentType(SrcI, cast<IntegerType>(DestTy),
+                                 CI.getOpcode(), NumCastsRemoved)) {
+    // If this cast is a truncate, evaluting in a different type always
+    // eliminates the cast, so it is always a win.  If this is a zero-extension,
+    // we need to do an AND to maintain the clear top-part of the computation,
+    // so we require that the input have eliminated at least one cast.  If this
+    // is a sign extension, we insert two new casts (to do the extension) so we
+    // require that two casts have been eliminated.
+    bool DoXForm = false;
+    bool JustReplace = false;
+    switch (CI.getOpcode()) {
+    default:
+      // All the others use floating point so we shouldn't actually 
+      // get here because of the check above.
+      assert(0 && "Unknown cast type");
+    case Instruction::Trunc:
+      DoXForm = true;
+      break;
+    case Instruction::ZExt: {
+      DoXForm = NumCastsRemoved >= 1;
+      if (!DoXForm && 0) {
+        // If it's unnecessary to issue an AND to clear the high bits, it's
+        // always profitable to do this xform.
+        Value *TryRes = EvaluateInDifferentType(SrcI, DestTy, false);
+        APInt Mask(APInt::getBitsSet(DestBitSize, SrcBitSize, DestBitSize));
+        if (MaskedValueIsZero(TryRes, Mask))
+          return ReplaceInstUsesWith(CI, TryRes);
+        
+        if (Instruction *TryI = dyn_cast<Instruction>(TryRes))
+          if (TryI->use_empty())
+            EraseInstFromFunction(*TryI);
+      }
+      break;
+    }
+    case Instruction::SExt: {
+      DoXForm = NumCastsRemoved >= 2;
+      if (!DoXForm && !isa<TruncInst>(SrcI) && 0) {
+        // If we do not have to emit the truncate + sext pair, then it's always
+        // profitable to do this xform.
+        //
+        // It's not safe to eliminate the trunc + sext pair if one of the
+        // eliminated cast is a truncate. e.g.
+        // t2 = trunc i32 t1 to i16
+        // t3 = sext i16 t2 to i32
+        // !=
+        // i32 t1
+        Value *TryRes = EvaluateInDifferentType(SrcI, DestTy, true);
+        unsigned NumSignBits = ComputeNumSignBits(TryRes);
+        if (NumSignBits > (DestBitSize - SrcBitSize))
+          return ReplaceInstUsesWith(CI, TryRes);
+        
+        if (Instruction *TryI = dyn_cast<Instruction>(TryRes))
+          if (TryI->use_empty())
+            EraseInstFromFunction(*TryI);
+      }
+      break;
+    }
+    }
+    
+    if (DoXForm) {
+      DOUT << "ICE: EvaluateInDifferentType converting expression type to avoid"
+           << " cast: " << CI;
+      Value *Res = EvaluateInDifferentType(SrcI, DestTy, 
+                                           CI.getOpcode() == Instruction::SExt);
+      if (JustReplace)
+        // Just replace this cast with the result.
+        return ReplaceInstUsesWith(CI, Res);
+
+      assert(Res->getType() == DestTy);
+      switch (CI.getOpcode()) {
+      default: assert(0 && "Unknown cast type!");
+      case Instruction::Trunc:
+      case Instruction::BitCast:
+        // Just replace this cast with the result.
+        return ReplaceInstUsesWith(CI, Res);
+      case Instruction::ZExt: {
+        assert(SrcBitSize < DestBitSize && "Not a zext?");
+
+        // If the high bits are already zero, just replace this cast with the
+        // result.
+        APInt Mask(APInt::getBitsSet(DestBitSize, SrcBitSize, DestBitSize));
+        if (MaskedValueIsZero(Res, Mask))
+          return ReplaceInstUsesWith(CI, Res);
+
+        // We need to emit an AND to clear the high bits.
+        Constant *C = ConstantInt::get(APInt::getLowBitsSet(DestBitSize,
+                                                            SrcBitSize));
+        return BinaryOperator::CreateAnd(Res, C);
+      }
+      case Instruction::SExt: {
+        // If the high bits are already filled with sign bit, just replace this
+        // cast with the result.
+        unsigned NumSignBits = ComputeNumSignBits(Res);
+        if (NumSignBits > (DestBitSize - SrcBitSize))
+          return ReplaceInstUsesWith(CI, Res);
+
+        // We need to emit a cast to truncate, then a cast to sext.
+        return CastInst::Create(Instruction::SExt,
+            InsertCastBefore(Instruction::Trunc, Res, Src->getType(), 
+                             CI), DestTy);
+      }
+      }
+    }
+  }
+  
+  Value *Op0 = SrcI->getNumOperands() > 0 ? SrcI->getOperand(0) : 0;
+  Value *Op1 = SrcI->getNumOperands() > 1 ? SrcI->getOperand(1) : 0;
+
+  switch (SrcI->getOpcode()) {
+  case Instruction::Add:
+  case Instruction::Mul:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    // If we are discarding information, rewrite.
+    if (DestBitSize <= SrcBitSize && DestBitSize != 1) {
+      // Don't insert two casts if they cannot be eliminated.  We allow 
+      // two casts to be inserted if the sizes are the same.  This could 
+      // only be converting signedness, which is a noop.
+      if (DestBitSize == SrcBitSize || 
+          !ValueRequiresCast(CI.getOpcode(), Op1, DestTy,TD) ||
+          !ValueRequiresCast(CI.getOpcode(), Op0, DestTy, TD)) {
+        Instruction::CastOps opcode = CI.getOpcode();
+        Value *Op0c = InsertCastBefore(opcode, Op0, DestTy, *SrcI);
+        Value *Op1c = InsertCastBefore(opcode, Op1, DestTy, *SrcI);
+        return BinaryOperator::Create(
+            cast<BinaryOperator>(SrcI)->getOpcode(), Op0c, Op1c);
+      }
+    }
+
+    // cast (xor bool X, true) to int  --> xor (cast bool X to int), 1
+    if (isa<ZExtInst>(CI) && SrcBitSize == 1 && 
+        SrcI->getOpcode() == Instruction::Xor &&
+        Op1 == ConstantInt::getTrue() &&
+        (!Op0->hasOneUse() || !isa<CmpInst>(Op0))) {
+      Value *New = InsertCastBefore(Instruction::ZExt, Op0, DestTy, CI);
+      return BinaryOperator::CreateXor(New, ConstantInt::get(CI.getType(), 1));
+    }
+    break;
+  case Instruction::SDiv:
+  case Instruction::UDiv:
+  case Instruction::SRem:
+  case Instruction::URem:
+    // If we are just changing the sign, rewrite.
+    if (DestBitSize == SrcBitSize) {
+      // Don't insert two casts if they cannot be eliminated.  We allow 
+      // two casts to be inserted if the sizes are the same.  This could 
+      // only be converting signedness, which is a noop.
+      if (!ValueRequiresCast(CI.getOpcode(), Op1, DestTy, TD) || 
+          !ValueRequiresCast(CI.getOpcode(), Op0, DestTy, TD)) {
+        Value *Op0c = InsertCastBefore(Instruction::BitCast, 
+                                       Op0, DestTy, *SrcI);
+        Value *Op1c = InsertCastBefore(Instruction::BitCast, 
+                                       Op1, DestTy, *SrcI);
+        return BinaryOperator::Create(
+          cast<BinaryOperator>(SrcI)->getOpcode(), Op0c, Op1c);
+      }
+    }
+    break;
+
+  case Instruction::Shl:
+    // Allow changing the sign of the source operand.  Do not allow 
+    // changing the size of the shift, UNLESS the shift amount is a 
+    // constant.  We must not change variable sized shifts to a smaller 
+    // size, because it is undefined to shift more bits out than exist 
+    // in the value.
+    if (DestBitSize == SrcBitSize ||
+        (DestBitSize < SrcBitSize && isa<Constant>(Op1))) {
+      Instruction::CastOps opcode = (DestBitSize == SrcBitSize ?
+          Instruction::BitCast : Instruction::Trunc);
+      Value *Op0c = InsertCastBefore(opcode, Op0, DestTy, *SrcI);
+      Value *Op1c = InsertCastBefore(opcode, Op1, DestTy, *SrcI);
+      return BinaryOperator::CreateShl(Op0c, Op1c);
+    }
+    break;
+  case Instruction::AShr:
+    // If this is a signed shr, and if all bits shifted in are about to be
+    // truncated off, turn it into an unsigned shr to allow greater
+    // simplifications.
+    if (DestBitSize < SrcBitSize &&
+        isa<ConstantInt>(Op1)) {
+      uint32_t ShiftAmt = cast<ConstantInt>(Op1)->getLimitedValue(SrcBitSize);
+      if (SrcBitSize > ShiftAmt && SrcBitSize-ShiftAmt >= DestBitSize) {
+        // Insert the new logical shift right.
+        return BinaryOperator::CreateLShr(Op0, Op1);
+      }
+    }
+    break;
+  }
+  return 0;
+}
+
+Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
+  if (Instruction *Result = commonIntCastTransforms(CI))
+    return Result;
+  
+  Value *Src = CI.getOperand(0);
+  const Type *Ty = CI.getType();
+  uint32_t DestBitWidth = Ty->getPrimitiveSizeInBits();
+  uint32_t SrcBitWidth = cast<IntegerType>(Src->getType())->getBitWidth();
+
+  // Canonicalize trunc x to i1 -> (icmp ne (and x, 1), 0)
+  if (DestBitWidth == 1) {
+    Constant *One = ConstantInt::get(Src->getType(), 1);
+    Src = InsertNewInstBefore(BinaryOperator::CreateAnd(Src, One, "tmp"), CI);
+    Value *Zero = Constant::getNullValue(Src->getType());
+    return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero);
+  }
+  
+  // Optimize trunc(lshr(), c) to pull the shift through the truncate.
+  ConstantInt *ShAmtV = 0;
+  Value *ShiftOp = 0;
+  if (Src->hasOneUse() &&
+      match(Src, m_LShr(m_Value(ShiftOp), m_ConstantInt(ShAmtV)))) {
+    uint32_t ShAmt = ShAmtV->getLimitedValue(SrcBitWidth);
+    
+    // Get a mask for the bits shifting in.
+    APInt Mask(APInt::getLowBitsSet(SrcBitWidth, ShAmt).shl(DestBitWidth));
+    if (MaskedValueIsZero(ShiftOp, Mask)) {
+      if (ShAmt >= DestBitWidth)        // All zeros.
+        return ReplaceInstUsesWith(CI, Constant::getNullValue(Ty));
+      
+      // Okay, we can shrink this.  Truncate the input, then return a new
+      // shift.
+      Value *V1 = InsertCastBefore(Instruction::Trunc, ShiftOp, Ty, CI);
+      Value *V2 = ConstantExpr::getTrunc(ShAmtV, Ty);
+      return BinaryOperator::CreateLShr(V1, V2);
+    }
+  }
+  
+  return 0;
+}
+
+/// transformZExtICmp - Transform (zext icmp) to bitwise / integer operations
+/// in order to eliminate the icmp.
+Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,
+                                             bool DoXform) {
+  // If we are just checking for a icmp eq of a single bit and zext'ing it
+  // to an integer, then shift the bit to the appropriate place and then
+  // cast to integer to avoid the comparison.
+  if (ConstantInt *Op1C = dyn_cast<ConstantInt>(ICI->getOperand(1))) {
+    const APInt &Op1CV = Op1C->getValue();
+      
+    // zext (x <s  0) to i32 --> x>>u31      true if signbit set.
+    // zext (x >s -1) to i32 --> (x>>u31)^1  true if signbit clear.
+    if ((ICI->getPredicate() == ICmpInst::ICMP_SLT && Op1CV == 0) ||
+        (ICI->getPredicate() == ICmpInst::ICMP_SGT &&Op1CV.isAllOnesValue())) {
+      if (!DoXform) return ICI;
+
+      Value *In = ICI->getOperand(0);
+      Value *Sh = ConstantInt::get(In->getType(),
+                                   In->getType()->getPrimitiveSizeInBits()-1);
+      In = InsertNewInstBefore(BinaryOperator::CreateLShr(In, Sh,
+                                                        In->getName()+".lobit"),
+                               CI);
+      if (In->getType() != CI.getType())
+        In = CastInst::CreateIntegerCast(In, CI.getType(),
+                                         false/*ZExt*/, "tmp", &CI);
+
+      if (ICI->getPredicate() == ICmpInst::ICMP_SGT) {
+        Constant *One = ConstantInt::get(In->getType(), 1);
+        In = InsertNewInstBefore(BinaryOperator::CreateXor(In, One,
+                                                         In->getName()+".not"),
+                                 CI);
+      }
+
+      return ReplaceInstUsesWith(CI, In);
+    }
+      
+      
+      
+    // zext (X == 0) to i32 --> X^1      iff X has only the low bit set.
+    // zext (X == 0) to i32 --> (X>>1)^1 iff X has only the 2nd bit set.
+    // zext (X == 1) to i32 --> X        iff X has only the low bit set.
+    // zext (X == 2) to i32 --> X>>1     iff X has only the 2nd bit set.
+    // zext (X != 0) to i32 --> X        iff X has only the low bit set.
+    // zext (X != 0) to i32 --> X>>1     iff X has only the 2nd bit set.
+    // zext (X != 1) to i32 --> X^1      iff X has only the low bit set.
+    // zext (X != 2) to i32 --> (X>>1)^1 iff X has only the 2nd bit set.
+    if ((Op1CV == 0 || Op1CV.isPowerOf2()) && 
+        // This only works for EQ and NE
+        ICI->isEquality()) {
+      // If Op1C some other power of two, convert:
+      uint32_t BitWidth = Op1C->getType()->getBitWidth();
+      APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
+      APInt TypeMask(APInt::getAllOnesValue(BitWidth));
+      ComputeMaskedBits(ICI->getOperand(0), TypeMask, KnownZero, KnownOne);
+        
+      APInt KnownZeroMask(~KnownZero);
+      if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1?
+        if (!DoXform) return ICI;
+
+        bool isNE = ICI->getPredicate() == ICmpInst::ICMP_NE;
+        if (Op1CV != 0 && (Op1CV != KnownZeroMask)) {
+          // (X&4) == 2 --> false
+          // (X&4) != 2 --> true
+          Constant *Res = ConstantInt::get(Type::Int1Ty, isNE);
+          Res = ConstantExpr::getZExt(Res, CI.getType());
+          return ReplaceInstUsesWith(CI, Res);
+        }
+          
+        uint32_t ShiftAmt = KnownZeroMask.logBase2();
+        Value *In = ICI->getOperand(0);
+        if (ShiftAmt) {
+          // Perform a logical shr by shiftamt.
+          // Insert the shift to put the result in the low bit.
+          In = InsertNewInstBefore(BinaryOperator::CreateLShr(In,
+                                  ConstantInt::get(In->getType(), ShiftAmt),
+                                                   In->getName()+".lobit"), CI);
+        }
+          
+        if ((Op1CV != 0) == isNE) { // Toggle the low bit.
+          Constant *One = ConstantInt::get(In->getType(), 1);
+          In = BinaryOperator::CreateXor(In, One, "tmp");
+          InsertNewInstBefore(cast<Instruction>(In), CI);
+        }
+          
+        if (CI.getType() == In->getType())
+          return ReplaceInstUsesWith(CI, In);
+        else
+          return CastInst::CreateIntegerCast(In, CI.getType(), false/*ZExt*/);
+      }
+    }
+  }
+
+  return 0;
+}
+
+Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
+  // If one of the common conversion will work ..
+  if (Instruction *Result = commonIntCastTransforms(CI))
+    return Result;
+
+  Value *Src = CI.getOperand(0);
+
+  // If this is a TRUNC followed by a ZEXT then we are dealing with integral
+  // types and if the sizes are just right we can convert this into a logical
+  // 'and' which will be much cheaper than the pair of casts.
+  if (TruncInst *CSrc = dyn_cast<TruncInst>(Src)) {   // A->B->C cast
+    // Get the sizes of the types involved.  We know that the intermediate type
+    // will be smaller than A or C, but don't know the relation between A and C.
+    Value *A = CSrc->getOperand(0);
+    unsigned SrcSize = A->getType()->getPrimitiveSizeInBits();
+    unsigned MidSize = CSrc->getType()->getPrimitiveSizeInBits();
+    unsigned DstSize = CI.getType()->getPrimitiveSizeInBits();
+    // If we're actually extending zero bits, then if
+    // SrcSize <  DstSize: zext(a & mask)
+    // SrcSize == DstSize: a & mask
+    // SrcSize  > DstSize: trunc(a) & mask
+    if (SrcSize < DstSize) {
+      APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
+      Constant *AndConst = ConstantInt::get(AndValue);
+      Instruction *And =
+        BinaryOperator::CreateAnd(A, AndConst, CSrc->getName()+".mask");
+      InsertNewInstBefore(And, CI);
+      return new ZExtInst(And, CI.getType());
+    } else if (SrcSize == DstSize) {
+      APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
+      return BinaryOperator::CreateAnd(A, ConstantInt::get(AndValue));
+    } else if (SrcSize > DstSize) {
+      Instruction *Trunc = new TruncInst(A, CI.getType(), "tmp");
+      InsertNewInstBefore(Trunc, CI);
+      APInt AndValue(APInt::getLowBitsSet(DstSize, MidSize));
+      return BinaryOperator::CreateAnd(Trunc, ConstantInt::get(AndValue));
+    }
+  }
+
+  if (ICmpInst *ICI = dyn_cast<ICmpInst>(Src))
+    return transformZExtICmp(ICI, CI);
+
+  BinaryOperator *SrcI = dyn_cast<BinaryOperator>(Src);
+  if (SrcI && SrcI->getOpcode() == Instruction::Or) {
+    // zext (or icmp, icmp) --> or (zext icmp), (zext icmp) if at least one
+    // of the (zext icmp) will be transformed.
+    ICmpInst *LHS = dyn_cast<ICmpInst>(SrcI->getOperand(0));
+    ICmpInst *RHS = dyn_cast<ICmpInst>(SrcI->getOperand(1));
+    if (LHS && RHS && LHS->hasOneUse() && RHS->hasOneUse() &&
+        (transformZExtICmp(LHS, CI, false) ||
+         transformZExtICmp(RHS, CI, false))) {
+      Value *LCast = InsertCastBefore(Instruction::ZExt, LHS, CI.getType(), CI);
+      Value *RCast = InsertCastBefore(Instruction::ZExt, RHS, CI.getType(), CI);
+      return BinaryOperator::Create(Instruction::Or, LCast, RCast);
+    }
+  }
+
+  return 0;
+}
+
+Instruction *InstCombiner::visitSExt(SExtInst &CI) {
+  if (Instruction *I = commonIntCastTransforms(CI))
+    return I;
+  
+  Value *Src = CI.getOperand(0);
+  
+  // Canonicalize sign-extend from i1 to a select.
+  if (Src->getType() == Type::Int1Ty)
+    return SelectInst::Create(Src,
+                              ConstantInt::getAllOnesValue(CI.getType()),
+                              Constant::getNullValue(CI.getType()));
+
+  // See if the value being truncated is already sign extended.  If so, just
+  // eliminate the trunc/sext pair.
+  if (getOpcode(Src) == Instruction::Trunc) {
+    Value *Op = cast<User>(Src)->getOperand(0);
+    unsigned OpBits   = cast<IntegerType>(Op->getType())->getBitWidth();
+    unsigned MidBits  = cast<IntegerType>(Src->getType())->getBitWidth();
+    unsigned DestBits = cast<IntegerType>(CI.getType())->getBitWidth();
+    unsigned NumSignBits = ComputeNumSignBits(Op);
+
+    if (OpBits == DestBits) {
+      // Op is i32, Mid is i8, and Dest is i32.  If Op has more than 24 sign
+      // bits, it is already ready.
+      if (NumSignBits > DestBits-MidBits)
+        return ReplaceInstUsesWith(CI, Op);
+    } else if (OpBits < DestBits) {
+      // Op is i32, Mid is i8, and Dest is i64.  If Op has more than 24 sign
+      // bits, just sext from i32.
+      if (NumSignBits > OpBits-MidBits)
+        return new SExtInst(Op, CI.getType(), "tmp");
+    } else {
+      // Op is i64, Mid is i8, and Dest is i32.  If Op has more than 56 sign
+      // bits, just truncate to i32.
+      if (NumSignBits > OpBits-MidBits)
+        return new TruncInst(Op, CI.getType(), "tmp");
+    }
+  }
+
+  // If the input is a shl/ashr pair of a same constant, then this is a sign
+  // extension from a smaller value.  If we could trust arbitrary bitwidth
+  // integers, we could turn this into a truncate to the smaller bit and then
+  // use a sext for the whole extension.  Since we don't, look deeper and check
+  // for a truncate.  If the source and dest are the same type, eliminate the
+  // trunc and extend and just do shifts.  For example, turn:
+  //   %a = trunc i32 %i to i8
+  //   %b = shl i8 %a, 6
+  //   %c = ashr i8 %b, 6
+  //   %d = sext i8 %c to i32
+  // into:
+  //   %a = shl i32 %i, 30
+  //   %d = ashr i32 %a, 30
+  Value *A = 0;
+  ConstantInt *BA = 0, *CA = 0;
+  if (match(Src, m_AShr(m_Shl(m_Value(A), m_ConstantInt(BA)),
+                        m_ConstantInt(CA))) &&
+      BA == CA && isa<TruncInst>(A)) {
+    Value *I = cast<TruncInst>(A)->getOperand(0);
+    if (I->getType() == CI.getType()) {
+      unsigned MidSize = Src->getType()->getPrimitiveSizeInBits();
+      unsigned SrcDstSize = CI.getType()->getPrimitiveSizeInBits();
+      unsigned ShAmt = CA->getZExtValue()+SrcDstSize-MidSize;
+      Constant *ShAmtV = ConstantInt::get(CI.getType(), ShAmt);
+      I = InsertNewInstBefore(BinaryOperator::CreateShl(I, ShAmtV,
+                                                        CI.getName()), CI);
+      return BinaryOperator::CreateAShr(I, ShAmtV);
+    }
+  }
+  
+  return 0;
+}
+
+/// FitsInFPType - Return a Constant* for the specified FP constant if it fits
+/// in the specified FP type without changing its value.
+static Constant *FitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) {
+  bool losesInfo;
+  APFloat F = CFP->getValueAPF();
+  (void)F.convert(Sem, APFloat::rmNearestTiesToEven, &losesInfo);
+  if (!losesInfo)
+    return ConstantFP::get(F);
+  return 0;
+}
+
+/// LookThroughFPExtensions - If this is an fp extension instruction, look
+/// through it until we get the source value.
+static Value *LookThroughFPExtensions(Value *V) {
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    if (I->getOpcode() == Instruction::FPExt)
+      return LookThroughFPExtensions(I->getOperand(0));
+  
+  // If this value is a constant, return the constant in the smallest FP type
+  // that can accurately represent it.  This allows us to turn
+  // (float)((double)X+2.0) into x+2.0f.
+  if (ConstantFP *CFP = dyn_cast<ConstantFP>(V)) {
+    if (CFP->getType() == Type::PPC_FP128Ty)
+      return V;  // No constant folding of this.
+    // See if the value can be truncated to float and then reextended.
+    if (Value *V = FitsInFPType(CFP, APFloat::IEEEsingle))
+      return V;
+    if (CFP->getType() == Type::DoubleTy)
+      return V;  // Won't shrink.
+    if (Value *V = FitsInFPType(CFP, APFloat::IEEEdouble))
+      return V;
+    // Don't try to shrink to various long double types.
+  }
+  
+  return V;
+}
+
+Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
+  if (Instruction *I = commonCastTransforms(CI))
+    return I;
+  
+  // If we have fptrunc(add (fpextend x), (fpextend y)), where x and y are
+  // smaller than the destination type, we can eliminate the truncate by doing
+  // the add as the smaller type.  This applies to add/sub/mul/div as well as
+  // many builtins (sqrt, etc).
+  BinaryOperator *OpI = dyn_cast<BinaryOperator>(CI.getOperand(0));
+  if (OpI && OpI->hasOneUse()) {
+    switch (OpI->getOpcode()) {
+    default: break;
+    case Instruction::Add:
+    case Instruction::Sub:
+    case Instruction::Mul:
+    case Instruction::FDiv:
+    case Instruction::FRem:
+      const Type *SrcTy = OpI->getType();
+      Value *LHSTrunc = LookThroughFPExtensions(OpI->getOperand(0));
+      Value *RHSTrunc = LookThroughFPExtensions(OpI->getOperand(1));
+      if (LHSTrunc->getType() != SrcTy && 
+          RHSTrunc->getType() != SrcTy) {
+        unsigned DstSize = CI.getType()->getPrimitiveSizeInBits();
+        // If the source types were both smaller than the destination type of
+        // the cast, do this xform.
+        if (LHSTrunc->getType()->getPrimitiveSizeInBits() <= DstSize &&
+            RHSTrunc->getType()->getPrimitiveSizeInBits() <= DstSize) {
+          LHSTrunc = InsertCastBefore(Instruction::FPExt, LHSTrunc,
+                                      CI.getType(), CI);
+          RHSTrunc = InsertCastBefore(Instruction::FPExt, RHSTrunc,
+                                      CI.getType(), CI);
+          return BinaryOperator::Create(OpI->getOpcode(), LHSTrunc, RHSTrunc);
+        }
+      }
+      break;  
+    }
+  }
+  return 0;
+}
+
+Instruction *InstCombiner::visitFPExt(CastInst &CI) {
+  return commonCastTransforms(CI);
+}
+
+Instruction *InstCombiner::visitFPToUI(FPToUIInst &FI) {
+  Instruction *OpI = dyn_cast<Instruction>(FI.getOperand(0));
+  if (OpI == 0)
+    return commonCastTransforms(FI);
+
+  // fptoui(uitofp(X)) --> X
+  // fptoui(sitofp(X)) --> X
+  // This is safe if the intermediate type has enough bits in its mantissa to
+  // accurately represent all values of X.  For example, do not do this with
+  // i64->float->i64.  This is also safe for sitofp case, because any negative
+  // 'X' value would cause an undefined result for the fptoui. 
+  if ((isa<UIToFPInst>(OpI) || isa<SIToFPInst>(OpI)) &&
+      OpI->getOperand(0)->getType() == FI.getType() &&
+      (int)FI.getType()->getPrimitiveSizeInBits() < /*extra bit for sign */
+                    OpI->getType()->getFPMantissaWidth())
+    return ReplaceInstUsesWith(FI, OpI->getOperand(0));
+
+  return commonCastTransforms(FI);
+}
+
+Instruction *InstCombiner::visitFPToSI(FPToSIInst &FI) {
+  Instruction *OpI = dyn_cast<Instruction>(FI.getOperand(0));
+  if (OpI == 0)
+    return commonCastTransforms(FI);
+  
+  // fptosi(sitofp(X)) --> X
+  // fptosi(uitofp(X)) --> X
+  // This is safe if the intermediate type has enough bits in its mantissa to
+  // accurately represent all values of X.  For example, do not do this with
+  // i64->float->i64.  This is also safe for sitofp case, because any negative
+  // 'X' value would cause an undefined result for the fptoui. 
+  if ((isa<UIToFPInst>(OpI) || isa<SIToFPInst>(OpI)) &&
+      OpI->getOperand(0)->getType() == FI.getType() &&
+      (int)FI.getType()->getPrimitiveSizeInBits() <= 
+                    OpI->getType()->getFPMantissaWidth())
+    return ReplaceInstUsesWith(FI, OpI->getOperand(0));
+  
+  return commonCastTransforms(FI);
+}
+
+Instruction *InstCombiner::visitUIToFP(CastInst &CI) {
+  return commonCastTransforms(CI);
+}
+
+Instruction *InstCombiner::visitSIToFP(CastInst &CI) {
+  return commonCastTransforms(CI);
+}
+
+Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) {
+  // If the destination integer type is smaller than the intptr_t type for
+  // this target, do a ptrtoint to intptr_t then do a trunc.  This allows the
+  // trunc to be exposed to other transforms.  Don't do this for extending
+  // ptrtoint's, because we don't know if the target sign or zero extends its
+  // pointers.
+  if (CI.getType()->getPrimitiveSizeInBits() < TD->getPointerSizeInBits()) {
+    Value *P = InsertNewInstBefore(new PtrToIntInst(CI.getOperand(0),
+                                                    TD->getIntPtrType(),
+                                                    "tmp"), CI);
+    return new TruncInst(P, CI.getType());
+  }
+  
+  return commonPointerCastTransforms(CI);
+}
+
+Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) {
+  // If the source integer type is larger than the intptr_t type for
+  // this target, do a trunc to the intptr_t type, then inttoptr of it.  This
+  // allows the trunc to be exposed to other transforms.  Don't do this for
+  // extending inttoptr's, because we don't know if the target sign or zero
+  // extends to pointers.
+  if (CI.getOperand(0)->getType()->getPrimitiveSizeInBits() >
+      TD->getPointerSizeInBits()) {
+    Value *P = InsertNewInstBefore(new TruncInst(CI.getOperand(0),
+                                                 TD->getIntPtrType(),
+                                                 "tmp"), CI);
+    return new IntToPtrInst(P, CI.getType());
+  }
+  
+  if (Instruction *I = commonCastTransforms(CI))
+    return I;
+  
+  const Type *DestPointee = cast<PointerType>(CI.getType())->getElementType();
+  if (!DestPointee->isSized()) return 0;
+
+  // If this is inttoptr(add (ptrtoint x), cst), try to turn this into a GEP.
+  ConstantInt *Cst;
+  Value *X;
+  if (match(CI.getOperand(0), m_Add(m_Cast<PtrToIntInst>(m_Value(X)),
+                                    m_ConstantInt(Cst)))) {
+    // If the source and destination operands have the same type, see if this
+    // is a single-index GEP.
+    if (X->getType() == CI.getType()) {
+      // Get the size of the pointee type.
+      uint64_t Size = TD->getTypeAllocSize(DestPointee);
+
+      // Convert the constant to intptr type.
+      APInt Offset = Cst->getValue();
+      Offset.sextOrTrunc(TD->getPointerSizeInBits());
+
+      // If Offset is evenly divisible by Size, we can do this xform.
+      if (Size && !APIntOps::srem(Offset, APInt(Offset.getBitWidth(), Size))){
+        Offset = APIntOps::sdiv(Offset, APInt(Offset.getBitWidth(), Size));
+        return GetElementPtrInst::Create(X, ConstantInt::get(Offset));
+      }
+    }
+    // TODO: Could handle other cases, e.g. where add is indexing into field of
+    // struct etc.
+  } else if (CI.getOperand(0)->hasOneUse() &&
+             match(CI.getOperand(0), m_Add(m_Value(X), m_ConstantInt(Cst)))) {
+    // Otherwise, if this is inttoptr(add x, cst), try to turn this into an
+    // "inttoptr+GEP" instead of "add+intptr".
+    
+    // Get the size of the pointee type.
+    uint64_t Size = TD->getTypeAllocSize(DestPointee);
+    
+    // Convert the constant to intptr type.
+    APInt Offset = Cst->getValue();
+    Offset.sextOrTrunc(TD->getPointerSizeInBits());
+    
+    // If Offset is evenly divisible by Size, we can do this xform.
+    if (Size && !APIntOps::srem(Offset, APInt(Offset.getBitWidth(), Size))){
+      Offset = APIntOps::sdiv(Offset, APInt(Offset.getBitWidth(), Size));
+      
+      Instruction *P = InsertNewInstBefore(new IntToPtrInst(X, CI.getType(),
+                                                            "tmp"), CI);
+      return GetElementPtrInst::Create(P, ConstantInt::get(Offset), "tmp");
+    }
+  }
+  return 0;
+}
+
+Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
+  // If the operands are integer typed then apply the integer transforms,
+  // otherwise just apply the common ones.
+  Value *Src = CI.getOperand(0);
+  const Type *SrcTy = Src->getType();
+  const Type *DestTy = CI.getType();
+
+  if (SrcTy->isInteger() && DestTy->isInteger()) {
+    if (Instruction *Result = commonIntCastTransforms(CI))
+      return Result;
+  } else if (isa<PointerType>(SrcTy)) {
+    if (Instruction *I = commonPointerCastTransforms(CI))
+      return I;
+  } else {
+    if (Instruction *Result = commonCastTransforms(CI))
+      return Result;
+  }
+
+
+  // Get rid of casts from one type to the same type. These are useless and can
+  // be replaced by the operand.
+  if (DestTy == Src->getType())
+    return ReplaceInstUsesWith(CI, Src);
+
+  if (const PointerType *DstPTy = dyn_cast<PointerType>(DestTy)) {
+    const PointerType *SrcPTy = cast<PointerType>(SrcTy);
+    const Type *DstElTy = DstPTy->getElementType();
+    const Type *SrcElTy = SrcPTy->getElementType();
+    
+    // If the address spaces don't match, don't eliminate the bitcast, which is
+    // required for changing types.
+    if (SrcPTy->getAddressSpace() != DstPTy->getAddressSpace())
+      return 0;
+    
+    // If we are casting a malloc or alloca to a pointer to a type of the same
+    // size, rewrite the allocation instruction to allocate the "right" type.
+    if (AllocationInst *AI = dyn_cast<AllocationInst>(Src))
+      if (Instruction *V = PromoteCastOfAllocation(CI, *AI))
+        return V;
+    
+    // If the source and destination are pointers, and this cast is equivalent
+    // to a getelementptr X, 0, 0, 0...  turn it into the appropriate gep.
+    // This can enhance SROA and other transforms that want type-safe pointers.
+    Constant *ZeroUInt = Constant::getNullValue(Type::Int32Ty);
+    unsigned NumZeros = 0;
+    while (SrcElTy != DstElTy && 
+           isa<CompositeType>(SrcElTy) && !isa<PointerType>(SrcElTy) &&
+           SrcElTy->getNumContainedTypes() /* not "{}" */) {
+      SrcElTy = cast<CompositeType>(SrcElTy)->getTypeAtIndex(ZeroUInt);
+      ++NumZeros;
+    }
+
+    // If we found a path from the src to dest, create the getelementptr now.
+    if (SrcElTy == DstElTy) {
+      SmallVector<Value*, 8> Idxs(NumZeros+1, ZeroUInt);
+      return GetElementPtrInst::Create(Src, Idxs.begin(), Idxs.end(), "", 
+                                       ((Instruction*) NULL));
+    }
+  }
+
+  if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(Src)) {
+    if (SVI->hasOneUse()) {
+      // Okay, we have (bitconvert (shuffle ..)).  Check to see if this is
+      // a bitconvert to a vector with the same # elts.
+      if (isa<VectorType>(DestTy) && 
+          cast<VectorType>(DestTy)->getNumElements() ==
+                SVI->getType()->getNumElements() &&
+          SVI->getType()->getNumElements() ==
+            cast<VectorType>(SVI->getOperand(0)->getType())->getNumElements()) {
+        CastInst *Tmp;
+        // If either of the operands is a cast from CI.getType(), then
+        // evaluating the shuffle in the casted destination's type will allow
+        // us to eliminate at least one cast.
+        if (((Tmp = dyn_cast<CastInst>(SVI->getOperand(0))) && 
+             Tmp->getOperand(0)->getType() == DestTy) ||
+            ((Tmp = dyn_cast<CastInst>(SVI->getOperand(1))) && 
+             Tmp->getOperand(0)->getType() == DestTy)) {
+          Value *LHS = InsertCastBefore(Instruction::BitCast,
+                                        SVI->getOperand(0), DestTy, CI);
+          Value *RHS = InsertCastBefore(Instruction::BitCast,
+                                        SVI->getOperand(1), DestTy, CI);
+          // Return a new shuffle vector.  Use the same element ID's, as we
+          // know the vector types match #elts.
+          return new ShuffleVectorInst(LHS, RHS, SVI->getOperand(2));
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+/// GetSelectFoldableOperands - We want to turn code that looks like this:
+///   %C = or %A, %B
+///   %D = select %cond, %C, %A
+/// into:
+///   %C = select %cond, %B, 0
+///   %D = or %A, %C
+///
+/// Assuming that the specified instruction is an operand to the select, return
+/// a bitmask indicating which operands of this instruction are foldable if they
+/// equal the other incoming value of the select.
+///
+static unsigned GetSelectFoldableOperands(Instruction *I) {
+  switch (I->getOpcode()) {
+  case Instruction::Add:
+  case Instruction::Mul:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    return 3;              // Can fold through either operand.
+  case Instruction::Sub:   // Can only fold on the amount subtracted.
+  case Instruction::Shl:   // Can only fold on the shift amount.
+  case Instruction::LShr:
+  case Instruction::AShr:
+    return 1;
+  default:
+    return 0;              // Cannot fold
+  }
+}
+
+/// GetSelectFoldableConstant - For the same transformation as the previous
+/// function, return the identity constant that goes into the select.
+static Constant *GetSelectFoldableConstant(Instruction *I) {
+  switch (I->getOpcode()) {
+  default: assert(0 && "This cannot happen!"); abort();
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    return Constant::getNullValue(I->getType());
+  case Instruction::And:
+    return Constant::getAllOnesValue(I->getType());
+  case Instruction::Mul:
+    return ConstantInt::get(I->getType(), 1);
+  }
+}
+
+/// FoldSelectOpOp - Here we have (select c, TI, FI), and we know that TI and FI
+/// have the same opcode and only one use each.  Try to simplify this.
+Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI,
+                                          Instruction *FI) {
+  if (TI->getNumOperands() == 1) {
+    // If this is a non-volatile load or a cast from the same type,
+    // merge.
+    if (TI->isCast()) {
+      if (TI->getOperand(0)->getType() != FI->getOperand(0)->getType())
+        return 0;
+    } else {
+      return 0;  // unknown unary op.
+    }
+
+    // Fold this by inserting a select from the input values.
+    SelectInst *NewSI = SelectInst::Create(SI.getCondition(), TI->getOperand(0),
+                                           FI->getOperand(0), SI.getName()+".v");
+    InsertNewInstBefore(NewSI, SI);
+    return CastInst::Create(Instruction::CastOps(TI->getOpcode()), NewSI, 
+                            TI->getType());
+  }
+
+  // Only handle binary operators here.
+  if (!isa<BinaryOperator>(TI))
+    return 0;
+
+  // Figure out if the operations have any operands in common.
+  Value *MatchOp, *OtherOpT, *OtherOpF;
+  bool MatchIsOpZero;
+  if (TI->getOperand(0) == FI->getOperand(0)) {
+    MatchOp  = TI->getOperand(0);
+    OtherOpT = TI->getOperand(1);
+    OtherOpF = FI->getOperand(1);
+    MatchIsOpZero = true;
+  } else if (TI->getOperand(1) == FI->getOperand(1)) {
+    MatchOp  = TI->getOperand(1);
+    OtherOpT = TI->getOperand(0);
+    OtherOpF = FI->getOperand(0);
+    MatchIsOpZero = false;
+  } else if (!TI->isCommutative()) {
+    return 0;
+  } else if (TI->getOperand(0) == FI->getOperand(1)) {
+    MatchOp  = TI->getOperand(0);
+    OtherOpT = TI->getOperand(1);
+    OtherOpF = FI->getOperand(0);
+    MatchIsOpZero = true;
+  } else if (TI->getOperand(1) == FI->getOperand(0)) {
+    MatchOp  = TI->getOperand(1);
+    OtherOpT = TI->getOperand(0);
+    OtherOpF = FI->getOperand(1);
+    MatchIsOpZero = true;
+  } else {
+    return 0;
+  }
+
+  // If we reach here, they do have operations in common.
+  SelectInst *NewSI = SelectInst::Create(SI.getCondition(), OtherOpT,
+                                         OtherOpF, SI.getName()+".v");
+  InsertNewInstBefore(NewSI, SI);
+
+  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(TI)) {
+    if (MatchIsOpZero)
+      return BinaryOperator::Create(BO->getOpcode(), MatchOp, NewSI);
+    else
+      return BinaryOperator::Create(BO->getOpcode(), NewSI, MatchOp);
+  }
+  assert(0 && "Shouldn't get here");
+  return 0;
+}
+
+static bool isSelect01(Constant *C1, Constant *C2) {
+  ConstantInt *C1I = dyn_cast<ConstantInt>(C1);
+  if (!C1I)
+    return false;
+  ConstantInt *C2I = dyn_cast<ConstantInt>(C2);
+  if (!C2I)
+    return false;
+  return (C1I->isZero() || C1I->isOne()) && (C2I->isZero() || C2I->isOne());
+}
+
+/// FoldSelectIntoOp - Try fold the select into one of the operands to
+/// facilitate further optimization.
+Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal,
+                                            Value *FalseVal) {
+  // See the comment above GetSelectFoldableOperands for a description of the
+  // transformation we are doing here.
+  if (Instruction *TVI = dyn_cast<Instruction>(TrueVal)) {
+    if (TVI->hasOneUse() && TVI->getNumOperands() == 2 &&
+        !isa<Constant>(FalseVal)) {
+      if (unsigned SFO = GetSelectFoldableOperands(TVI)) {
+        unsigned OpToFold = 0;
+        if ((SFO & 1) && FalseVal == TVI->getOperand(0)) {
+          OpToFold = 1;
+        } else  if ((SFO & 2) && FalseVal == TVI->getOperand(1)) {
+          OpToFold = 2;
+        }
+
+        if (OpToFold) {
+          Constant *C = GetSelectFoldableConstant(TVI);
+          Value *OOp = TVI->getOperand(2-OpToFold);
+          // Avoid creating select between 2 constants unless it's selecting
+          // between 0 and 1.
+          if (!isa<Constant>(OOp) || isSelect01(C, cast<Constant>(OOp))) {
+            Instruction *NewSel = SelectInst::Create(SI.getCondition(), OOp, C);
+            InsertNewInstBefore(NewSel, SI);
+            NewSel->takeName(TVI);
+            if (BinaryOperator *BO = dyn_cast<BinaryOperator>(TVI))
+              return BinaryOperator::Create(BO->getOpcode(), FalseVal, NewSel);
+            assert(0 && "Unknown instruction!!");
+          }
+        }
+      }
+    }
+  }
+
+  if (Instruction *FVI = dyn_cast<Instruction>(FalseVal)) {
+    if (FVI->hasOneUse() && FVI->getNumOperands() == 2 &&
+        !isa<Constant>(TrueVal)) {
+      if (unsigned SFO = GetSelectFoldableOperands(FVI)) {
+        unsigned OpToFold = 0;
+        if ((SFO & 1) && TrueVal == FVI->getOperand(0)) {
+          OpToFold = 1;
+        } else  if ((SFO & 2) && TrueVal == FVI->getOperand(1)) {
+          OpToFold = 2;
+        }
+
+        if (OpToFold) {
+          Constant *C = GetSelectFoldableConstant(FVI);
+          Value *OOp = FVI->getOperand(2-OpToFold);
+          // Avoid creating select between 2 constants unless it's selecting
+          // between 0 and 1.
+          if (!isa<Constant>(OOp) || isSelect01(C, cast<Constant>(OOp))) {
+            Instruction *NewSel = SelectInst::Create(SI.getCondition(), C, OOp);
+            InsertNewInstBefore(NewSel, SI);
+            NewSel->takeName(FVI);
+            if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FVI))
+              return BinaryOperator::Create(BO->getOpcode(), TrueVal, NewSel);
+            assert(0 && "Unknown instruction!!");
+          }
+        }
+      }
+    }
+  }
+
+  return 0;
+}
+
+/// visitSelectInstWithICmp - Visit a SelectInst that has an
+/// ICmpInst as its first operand.
+///
+Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI,
+                                                   ICmpInst *ICI) {
+  bool Changed = false;
+  ICmpInst::Predicate Pred = ICI->getPredicate();
+  Value *CmpLHS = ICI->getOperand(0);
+  Value *CmpRHS = ICI->getOperand(1);
+  Value *TrueVal = SI.getTrueValue();
+  Value *FalseVal = SI.getFalseValue();
+
+  // Check cases where the comparison is with a constant that
+  // can be adjusted to fit the min/max idiom. We may edit ICI in
+  // place here, so make sure the select is the only user.
+  if (ICI->hasOneUse())
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(CmpRHS)) {
+      switch (Pred) {
+      default: break;
+      case ICmpInst::ICMP_ULT:
+      case ICmpInst::ICMP_SLT: {
+        // X < MIN ? T : F  -->  F
+        if (CI->isMinValue(Pred == ICmpInst::ICMP_SLT))
+          return ReplaceInstUsesWith(SI, FalseVal);
+        // X < C ? X : C-1  -->  X > C-1 ? C-1 : X
+        Constant *AdjustedRHS = SubOne(CI);
+        if ((CmpLHS == TrueVal && AdjustedRHS == FalseVal) ||
+            (CmpLHS == FalseVal && AdjustedRHS == TrueVal)) {
+          Pred = ICmpInst::getSwappedPredicate(Pred);
+          CmpRHS = AdjustedRHS;
+          std::swap(FalseVal, TrueVal);
+          ICI->setPredicate(Pred);
+          ICI->setOperand(1, CmpRHS);
+          SI.setOperand(1, TrueVal);
+          SI.setOperand(2, FalseVal);
+          Changed = true;
+        }
+        break;
+      }
+      case ICmpInst::ICMP_UGT:
+      case ICmpInst::ICMP_SGT: {
+        // X > MAX ? T : F  -->  F
+        if (CI->isMaxValue(Pred == ICmpInst::ICMP_SGT))
+          return ReplaceInstUsesWith(SI, FalseVal);
+        // X > C ? X : C+1  -->  X < C+1 ? C+1 : X
+        Constant *AdjustedRHS = AddOne(CI);
+        if ((CmpLHS == TrueVal && AdjustedRHS == FalseVal) ||
+            (CmpLHS == FalseVal && AdjustedRHS == TrueVal)) {
+          Pred = ICmpInst::getSwappedPredicate(Pred);
+          CmpRHS = AdjustedRHS;
+          std::swap(FalseVal, TrueVal);
+          ICI->setPredicate(Pred);
+          ICI->setOperand(1, CmpRHS);
+          SI.setOperand(1, TrueVal);
+          SI.setOperand(2, FalseVal);
+          Changed = true;
+        }
+        break;
+      }
+      }
+
+      // (x <s 0) ? -1 : 0 -> ashr x, 31   -> all ones if signed
+      // (x >s -1) ? -1 : 0 -> ashr x, 31  -> all ones if not signed
+      CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+      if (match(TrueVal, m_ConstantInt<-1>()) &&
+          match(FalseVal, m_ConstantInt<0>()))
+        Pred = ICI->getPredicate();
+      else if (match(TrueVal, m_ConstantInt<0>()) &&
+               match(FalseVal, m_ConstantInt<-1>()))
+        Pred = CmpInst::getInversePredicate(ICI->getPredicate());
+      
+      if (Pred != CmpInst::BAD_ICMP_PREDICATE) {
+        // If we are just checking for a icmp eq of a single bit and zext'ing it
+        // to an integer, then shift the bit to the appropriate place and then
+        // cast to integer to avoid the comparison.
+        const APInt &Op1CV = CI->getValue();
+    
+        // sext (x <s  0) to i32 --> x>>s31      true if signbit set.
+        // sext (x >s -1) to i32 --> (x>>s31)^-1  true if signbit clear.
+        if ((Pred == ICmpInst::ICMP_SLT && Op1CV == 0) ||
+            (Pred == ICmpInst::ICMP_SGT && Op1CV.isAllOnesValue())) {
+          Value *In = ICI->getOperand(0);
+          Value *Sh = ConstantInt::get(In->getType(),
+                                       In->getType()->getPrimitiveSizeInBits()-1);
+          In = InsertNewInstBefore(BinaryOperator::CreateAShr(In, Sh,
+                                                          In->getName()+".lobit"),
+                                   *ICI);
+          if (In->getType() != SI.getType())
+            In = CastInst::CreateIntegerCast(In, SI.getType(),
+                                             true/*SExt*/, "tmp", ICI);
+    
+          if (Pred == ICmpInst::ICMP_SGT)
+            In = InsertNewInstBefore(BinaryOperator::CreateNot(In,
+                                       In->getName()+".not"), *ICI);
+    
+          return ReplaceInstUsesWith(SI, In);
+        }
+      }
+    }
+
+  if (CmpLHS == TrueVal && CmpRHS == FalseVal) {
+    // Transform (X == Y) ? X : Y  -> Y
+    if (Pred == ICmpInst::ICMP_EQ)
+      return ReplaceInstUsesWith(SI, FalseVal);
+    // Transform (X != Y) ? X : Y  -> X
+    if (Pred == ICmpInst::ICMP_NE)
+      return ReplaceInstUsesWith(SI, TrueVal);
+    /// NOTE: if we wanted to, this is where to detect integer MIN/MAX
+
+  } else if (CmpLHS == FalseVal && CmpRHS == TrueVal) {
+    // Transform (X == Y) ? Y : X  -> X
+    if (Pred == ICmpInst::ICMP_EQ)
+      return ReplaceInstUsesWith(SI, FalseVal);
+    // Transform (X != Y) ? Y : X  -> Y
+    if (Pred == ICmpInst::ICMP_NE)
+      return ReplaceInstUsesWith(SI, TrueVal);
+    /// NOTE: if we wanted to, this is where to detect integer MIN/MAX
+  }
+
+  /// NOTE: if we wanted to, this is where to detect integer ABS
+
+  return Changed ? &SI : 0;
+}
+
+Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
+  Value *CondVal = SI.getCondition();
+  Value *TrueVal = SI.getTrueValue();
+  Value *FalseVal = SI.getFalseValue();
+
+  // select true, X, Y  -> X
+  // select false, X, Y -> Y
+  if (ConstantInt *C = dyn_cast<ConstantInt>(CondVal))
+    return ReplaceInstUsesWith(SI, C->getZExtValue() ? TrueVal : FalseVal);
+
+  // select C, X, X -> X
+  if (TrueVal == FalseVal)
+    return ReplaceInstUsesWith(SI, TrueVal);
+
+  if (isa<UndefValue>(TrueVal))   // select C, undef, X -> X
+    return ReplaceInstUsesWith(SI, FalseVal);
+  if (isa<UndefValue>(FalseVal))   // select C, X, undef -> X
+    return ReplaceInstUsesWith(SI, TrueVal);
+  if (isa<UndefValue>(CondVal)) {  // select undef, X, Y -> X or Y
+    if (isa<Constant>(TrueVal))
+      return ReplaceInstUsesWith(SI, TrueVal);
+    else
+      return ReplaceInstUsesWith(SI, FalseVal);
+  }
+
+  if (SI.getType() == Type::Int1Ty) {
+    if (ConstantInt *C = dyn_cast<ConstantInt>(TrueVal)) {
+      if (C->getZExtValue()) {
+        // Change: A = select B, true, C --> A = or B, C
+        return BinaryOperator::CreateOr(CondVal, FalseVal);
+      } else {
+        // Change: A = select B, false, C --> A = and !B, C
+        Value *NotCond =
+          InsertNewInstBefore(BinaryOperator::CreateNot(CondVal,
+                                             "not."+CondVal->getName()), SI);
+        return BinaryOperator::CreateAnd(NotCond, FalseVal);
+      }
+    } else if (ConstantInt *C = dyn_cast<ConstantInt>(FalseVal)) {
+      if (C->getZExtValue() == false) {
+        // Change: A = select B, C, false --> A = and B, C
+        return BinaryOperator::CreateAnd(CondVal, TrueVal);
+      } else {
+        // Change: A = select B, C, true --> A = or !B, C
+        Value *NotCond =
+          InsertNewInstBefore(BinaryOperator::CreateNot(CondVal,
+                                             "not."+CondVal->getName()), SI);
+        return BinaryOperator::CreateOr(NotCond, TrueVal);
+      }
+    }
+    
+    // select a, b, a  -> a&b
+    // select a, a, b  -> a|b
+    if (CondVal == TrueVal)
+      return BinaryOperator::CreateOr(CondVal, FalseVal);
+    else if (CondVal == FalseVal)
+      return BinaryOperator::CreateAnd(CondVal, TrueVal);
+  }
+
+  // Selecting between two integer constants?
+  if (ConstantInt *TrueValC = dyn_cast<ConstantInt>(TrueVal))
+    if (ConstantInt *FalseValC = dyn_cast<ConstantInt>(FalseVal)) {
+      // select C, 1, 0 -> zext C to int
+      if (FalseValC->isZero() && TrueValC->getValue() == 1) {
+        return CastInst::Create(Instruction::ZExt, CondVal, SI.getType());
+      } else if (TrueValC->isZero() && FalseValC->getValue() == 1) {
+        // select C, 0, 1 -> zext !C to int
+        Value *NotCond =
+          InsertNewInstBefore(BinaryOperator::CreateNot(CondVal,
+                                               "not."+CondVal->getName()), SI);
+        return CastInst::Create(Instruction::ZExt, NotCond, SI.getType());
+      }
+
+      if (ICmpInst *IC = dyn_cast<ICmpInst>(SI.getCondition())) {
+
+        // (x <s 0) ? -1 : 0 -> ashr x, 31
+        if (TrueValC->isAllOnesValue() && FalseValC->isZero())
+          if (ConstantInt *CmpCst = dyn_cast<ConstantInt>(IC->getOperand(1))) {
+            if (IC->getPredicate() == ICmpInst::ICMP_SLT && CmpCst->isZero()) {
+              // The comparison constant and the result are not neccessarily the
+              // same width. Make an all-ones value by inserting a AShr.
+              Value *X = IC->getOperand(0);
+              uint32_t Bits = X->getType()->getPrimitiveSizeInBits();
+              Constant *ShAmt = ConstantInt::get(X->getType(), Bits-1);
+              Instruction *SRA = BinaryOperator::Create(Instruction::AShr, X,
+                                                        ShAmt, "ones");
+              InsertNewInstBefore(SRA, SI);
+
+              // Then cast to the appropriate width.
+              return CastInst::CreateIntegerCast(SRA, SI.getType(), true);
+            }
+          }
+
+
+        // If one of the constants is zero (we know they can't both be) and we
+        // have an icmp instruction with zero, and we have an 'and' with the
+        // non-constant value, eliminate this whole mess.  This corresponds to
+        // cases like this: ((X & 27) ? 27 : 0)
+        if (TrueValC->isZero() || FalseValC->isZero())
+          if (IC->isEquality() && isa<ConstantInt>(IC->getOperand(1)) &&
+              cast<Constant>(IC->getOperand(1))->isNullValue())
+            if (Instruction *ICA = dyn_cast<Instruction>(IC->getOperand(0)))
+              if (ICA->getOpcode() == Instruction::And &&
+                  isa<ConstantInt>(ICA->getOperand(1)) &&
+                  (ICA->getOperand(1) == TrueValC ||
+                   ICA->getOperand(1) == FalseValC) &&
+                  isOneBitSet(cast<ConstantInt>(ICA->getOperand(1)))) {
+                // Okay, now we know that everything is set up, we just don't
+                // know whether we have a icmp_ne or icmp_eq and whether the 
+                // true or false val is the zero.
+                bool ShouldNotVal = !TrueValC->isZero();
+                ShouldNotVal ^= IC->getPredicate() == ICmpInst::ICMP_NE;
+                Value *V = ICA;
+                if (ShouldNotVal)
+                  V = InsertNewInstBefore(BinaryOperator::Create(
+                                  Instruction::Xor, V, ICA->getOperand(1)), SI);
+                return ReplaceInstUsesWith(SI, V);
+              }
+      }
+    }
+
+  // See if we are selecting two values based on a comparison of the two values.
+  if (FCmpInst *FCI = dyn_cast<FCmpInst>(CondVal)) {
+    if (FCI->getOperand(0) == TrueVal && FCI->getOperand(1) == FalseVal) {
+      // Transform (X == Y) ? X : Y  -> Y
+      if (FCI->getPredicate() == FCmpInst::FCMP_OEQ) {
+        // This is not safe in general for floating point:  
+        // consider X== -0, Y== +0.
+        // It becomes safe if either operand is a nonzero constant.
+        ConstantFP *CFPt, *CFPf;
+        if (((CFPt = dyn_cast<ConstantFP>(TrueVal)) &&
+              !CFPt->getValueAPF().isZero()) ||
+            ((CFPf = dyn_cast<ConstantFP>(FalseVal)) &&
+             !CFPf->getValueAPF().isZero()))
+        return ReplaceInstUsesWith(SI, FalseVal);
+      }
+      // Transform (X != Y) ? X : Y  -> X
+      if (FCI->getPredicate() == FCmpInst::FCMP_ONE)
+        return ReplaceInstUsesWith(SI, TrueVal);
+      // NOTE: if we wanted to, this is where to detect MIN/MAX
+
+    } else if (FCI->getOperand(0) == FalseVal && FCI->getOperand(1) == TrueVal){
+      // Transform (X == Y) ? Y : X  -> X
+      if (FCI->getPredicate() == FCmpInst::FCMP_OEQ) {
+        // This is not safe in general for floating point:  
+        // consider X== -0, Y== +0.
+        // It becomes safe if either operand is a nonzero constant.
+        ConstantFP *CFPt, *CFPf;
+        if (((CFPt = dyn_cast<ConstantFP>(TrueVal)) &&
+              !CFPt->getValueAPF().isZero()) ||
+            ((CFPf = dyn_cast<ConstantFP>(FalseVal)) &&
+             !CFPf->getValueAPF().isZero()))
+          return ReplaceInstUsesWith(SI, FalseVal);
+      }
+      // Transform (X != Y) ? Y : X  -> Y
+      if (FCI->getPredicate() == FCmpInst::FCMP_ONE)
+        return ReplaceInstUsesWith(SI, TrueVal);
+      // NOTE: if we wanted to, this is where to detect MIN/MAX
+    }
+    // NOTE: if we wanted to, this is where to detect ABS
+  }
+
+  // See if we are selecting two values based on a comparison of the two values.
+  if (ICmpInst *ICI = dyn_cast<ICmpInst>(CondVal))
+    if (Instruction *Result = visitSelectInstWithICmp(SI, ICI))
+      return Result;
+
+  if (Instruction *TI = dyn_cast<Instruction>(TrueVal))
+    if (Instruction *FI = dyn_cast<Instruction>(FalseVal))
+      if (TI->hasOneUse() && FI->hasOneUse()) {
+        Instruction *AddOp = 0, *SubOp = 0;
+
+        // Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z))
+        if (TI->getOpcode() == FI->getOpcode())
+          if (Instruction *IV = FoldSelectOpOp(SI, TI, FI))
+            return IV;
+
+        // Turn select C, (X+Y), (X-Y) --> (X+(select C, Y, (-Y))).  This is
+        // even legal for FP.
+        if (TI->getOpcode() == Instruction::Sub &&
+            FI->getOpcode() == Instruction::Add) {
+          AddOp = FI; SubOp = TI;
+        } else if (FI->getOpcode() == Instruction::Sub &&
+                   TI->getOpcode() == Instruction::Add) {
+          AddOp = TI; SubOp = FI;
+        }
+
+        if (AddOp) {
+          Value *OtherAddOp = 0;
+          if (SubOp->getOperand(0) == AddOp->getOperand(0)) {
+            OtherAddOp = AddOp->getOperand(1);
+          } else if (SubOp->getOperand(0) == AddOp->getOperand(1)) {
+            OtherAddOp = AddOp->getOperand(0);
+          }
+
+          if (OtherAddOp) {
+            // So at this point we know we have (Y -> OtherAddOp):
+            //        select C, (add X, Y), (sub X, Z)
+            Value *NegVal;  // Compute -Z
+            if (Constant *C = dyn_cast<Constant>(SubOp->getOperand(1))) {
+              NegVal = ConstantExpr::getNeg(C);
+            } else {
+              NegVal = InsertNewInstBefore(
+                    BinaryOperator::CreateNeg(SubOp->getOperand(1), "tmp"), SI);
+            }
+
+            Value *NewTrueOp = OtherAddOp;
+            Value *NewFalseOp = NegVal;
+            if (AddOp != TI)
+              std::swap(NewTrueOp, NewFalseOp);
+            Instruction *NewSel =
+              SelectInst::Create(CondVal, NewTrueOp,
+                                 NewFalseOp, SI.getName() + ".p");
+
+            NewSel = InsertNewInstBefore(NewSel, SI);
+            return BinaryOperator::CreateAdd(SubOp->getOperand(0), NewSel);
+          }
+        }
+      }
+
+  // See if we can fold the select into one of our operands.
+  if (SI.getType()->isInteger()) {
+    Instruction *FoldI = FoldSelectIntoOp(SI, TrueVal, FalseVal);
+    if (FoldI)
+      return FoldI;
+  }
+
+  if (BinaryOperator::isNot(CondVal)) {
+    SI.setOperand(0, BinaryOperator::getNotArgument(CondVal));
+    SI.setOperand(1, FalseVal);
+    SI.setOperand(2, TrueVal);
+    return &SI;
+  }
+
+  return 0;
+}
+
+/// EnforceKnownAlignment - If the specified pointer points to an object that
+/// we control, modify the object's alignment to PrefAlign. This isn't
+/// often possible though. If alignment is important, a more reliable approach
+/// is to simply align all global variables and allocation instructions to
+/// their preferred alignment from the beginning.
+///
+static unsigned EnforceKnownAlignment(Value *V,
+                                      unsigned Align, unsigned PrefAlign) {
+
+  User *U = dyn_cast<User>(V);
+  if (!U) return Align;
+
+  switch (getOpcode(U)) {
+  default: break;
+  case Instruction::BitCast:
+    return EnforceKnownAlignment(U->getOperand(0), Align, PrefAlign);
+  case Instruction::GetElementPtr: {
+    // If all indexes are zero, it is just the alignment of the base pointer.
+    bool AllZeroOperands = true;
+    for (User::op_iterator i = U->op_begin() + 1, e = U->op_end(); i != e; ++i)
+      if (!isa<Constant>(*i) ||
+          !cast<Constant>(*i)->isNullValue()) {
+        AllZeroOperands = false;
+        break;
+      }
+
+    if (AllZeroOperands) {
+      // Treat this like a bitcast.
+      return EnforceKnownAlignment(U->getOperand(0), Align, PrefAlign);
+    }
+    break;
+  }
+  }
+
+  if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+    // If there is a large requested alignment and we can, bump up the alignment
+    // of the global.
+    if (!GV->isDeclaration()) {
+      if (GV->getAlignment() >= PrefAlign)
+        Align = GV->getAlignment();
+      else {
+        GV->setAlignment(PrefAlign);
+        Align = PrefAlign;
+      }
+    }
+  } else if (AllocationInst *AI = dyn_cast<AllocationInst>(V)) {
+    // If there is a requested alignment and if this is an alloca, round up.  We
+    // don't do this for malloc, because some systems can't respect the request.
+    if (isa<AllocaInst>(AI)) {
+      if (AI->getAlignment() >= PrefAlign)
+        Align = AI->getAlignment();
+      else {
+        AI->setAlignment(PrefAlign);
+        Align = PrefAlign;
+      }
+    }
+  }
+
+  return Align;
+}
+
+/// GetOrEnforceKnownAlignment - If the specified pointer has an alignment that
+/// we can determine, return it, otherwise return 0.  If PrefAlign is specified,
+/// and it is more than the alignment of the ultimate object, see if we can
+/// increase the alignment of the ultimate object, making this check succeed.
+unsigned InstCombiner::GetOrEnforceKnownAlignment(Value *V,
+                                                  unsigned PrefAlign) {
+  unsigned BitWidth = TD ? TD->getTypeSizeInBits(V->getType()) :
+                      sizeof(PrefAlign) * CHAR_BIT;
+  APInt Mask = APInt::getAllOnesValue(BitWidth);
+  APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
+  ComputeMaskedBits(V, Mask, KnownZero, KnownOne);
+  unsigned TrailZ = KnownZero.countTrailingOnes();
+  unsigned Align = 1u << std::min(BitWidth - 1, TrailZ);
+
+  if (PrefAlign > Align)
+    Align = EnforceKnownAlignment(V, Align, PrefAlign);
+  
+    // We don't need to make any adjustment.
+  return Align;
+}
+
+Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
+  unsigned DstAlign = GetOrEnforceKnownAlignment(MI->getOperand(1));
+  unsigned SrcAlign = GetOrEnforceKnownAlignment(MI->getOperand(2));
+  unsigned MinAlign = std::min(DstAlign, SrcAlign);
+  unsigned CopyAlign = MI->getAlignment();
+
+  if (CopyAlign < MinAlign) {
+    MI->setAlignment(MinAlign);
+    return MI;
+  }
+  
+  // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with
+  // load/store.
+  ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getOperand(3));
+  if (MemOpLength == 0) return 0;
+  
+  // Source and destination pointer types are always "i8*" for intrinsic.  See
+  // if the size is something we can handle with a single primitive load/store.
+  // A single load+store correctly handles overlapping memory in the memmove
+  // case.
+  unsigned Size = MemOpLength->getZExtValue();
+  if (Size == 0) return MI;  // Delete this mem transfer.
+  
+  if (Size > 8 || (Size&(Size-1)))
+    return 0;  // If not 1/2/4/8 bytes, exit.
+  
+  // Use an integer load+store unless we can find something better.
+  Type *NewPtrTy = PointerType::getUnqual(IntegerType::get(Size<<3));
+  
+  // Memcpy forces the use of i8* for the source and destination.  That means
+  // that if you're using memcpy to move one double around, you'll get a cast
+  // from double* to i8*.  We'd much rather use a double load+store rather than
+  // an i64 load+store, here because this improves the odds that the source or
+  // dest address will be promotable.  See if we can find a better type than the
+  // integer datatype.
+  if (Value *Op = getBitCastOperand(MI->getOperand(1))) {
+    const Type *SrcETy = cast<PointerType>(Op->getType())->getElementType();
+    if (SrcETy->isSized() && TD->getTypeStoreSize(SrcETy) == Size) {
+      // The SrcETy might be something like {{{double}}} or [1 x double].  Rip
+      // down through these levels if so.
+      while (!SrcETy->isSingleValueType()) {
+        if (const StructType *STy = dyn_cast<StructType>(SrcETy)) {
+          if (STy->getNumElements() == 1)
+            SrcETy = STy->getElementType(0);
+          else
+            break;
+        } else if (const ArrayType *ATy = dyn_cast<ArrayType>(SrcETy)) {
+          if (ATy->getNumElements() == 1)
+            SrcETy = ATy->getElementType();
+          else
+            break;
+        } else
+          break;
+      }
+      
+      if (SrcETy->isSingleValueType())
+        NewPtrTy = PointerType::getUnqual(SrcETy);
+    }
+  }
+  
+  
+  // If the memcpy/memmove provides better alignment info than we can
+  // infer, use it.
+  SrcAlign = std::max(SrcAlign, CopyAlign);
+  DstAlign = std::max(DstAlign, CopyAlign);
+  
+  Value *Src = InsertBitCastBefore(MI->getOperand(2), NewPtrTy, *MI);
+  Value *Dest = InsertBitCastBefore(MI->getOperand(1), NewPtrTy, *MI);
+  Instruction *L = new LoadInst(Src, "tmp", false, SrcAlign);
+  InsertNewInstBefore(L, *MI);
+  InsertNewInstBefore(new StoreInst(L, Dest, false, DstAlign), *MI);
+
+  // Set the size of the copy to 0, it will be deleted on the next iteration.
+  MI->setOperand(3, Constant::getNullValue(MemOpLength->getType()));
+  return MI;
+}
+
+Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
+  unsigned Alignment = GetOrEnforceKnownAlignment(MI->getDest());
+  if (MI->getAlignment() < Alignment) {
+    MI->setAlignment(Alignment);
+    return MI;
+  }
+  
+  // Extract the length and alignment and fill if they are constant.
+  ConstantInt *LenC = dyn_cast<ConstantInt>(MI->getLength());
+  ConstantInt *FillC = dyn_cast<ConstantInt>(MI->getValue());
+  if (!LenC || !FillC || FillC->getType() != Type::Int8Ty)
+    return 0;
+  uint64_t Len = LenC->getZExtValue();
+  Alignment = MI->getAlignment();
+  
+  // If the length is zero, this is a no-op
+  if (Len == 0) return MI; // memset(d,c,0,a) -> noop
+  
+  // memset(s,c,n) -> store s, c (for n=1,2,4,8)
+  if (Len <= 8 && isPowerOf2_32((uint32_t)Len)) {
+    const Type *ITy = IntegerType::get(Len*8);  // n=1 -> i8.
+    
+    Value *Dest = MI->getDest();
+    Dest = InsertBitCastBefore(Dest, PointerType::getUnqual(ITy), *MI);
+
+    // Alignment 0 is identity for alignment 1 for memset, but not store.
+    if (Alignment == 0) Alignment = 1;
+    
+    // Extract the fill value and store.
+    uint64_t Fill = FillC->getZExtValue()*0x0101010101010101ULL;
+    InsertNewInstBefore(new StoreInst(ConstantInt::get(ITy, Fill), Dest, false,
+                                      Alignment), *MI);
+    
+    // Set the size of the copy to 0, it will be deleted on the next iteration.
+    MI->setLength(Constant::getNullValue(LenC->getType()));
+    return MI;
+  }
+
+  return 0;
+}
+
+
+/// visitCallInst - CallInst simplification.  This mostly only handles folding 
+/// of intrinsic instructions.  For normal calls, it allows visitCallSite to do
+/// the heavy lifting.
+///
+Instruction *InstCombiner::visitCallInst(CallInst &CI) {
+  // If the caller function is nounwind, mark the call as nounwind, even if the
+  // callee isn't.
+  if (CI.getParent()->getParent()->doesNotThrow() &&
+      !CI.doesNotThrow()) {
+    CI.setDoesNotThrow();
+    return &CI;
+  }
+  
+  
+  
+  IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CI);
+  if (!II) return visitCallSite(&CI);
+  
+  // Intrinsics cannot occur in an invoke, so handle them here instead of in
+  // visitCallSite.
+  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(II)) {
+    bool Changed = false;
+
+    // memmove/cpy/set of zero bytes is a noop.
+    if (Constant *NumBytes = dyn_cast<Constant>(MI->getLength())) {
+      if (NumBytes->isNullValue()) return EraseInstFromFunction(CI);
+
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes))
+        if (CI->getZExtValue() == 1) {
+          // Replace the instruction with just byte operations.  We would
+          // transform other cases to loads/stores, but we don't know if
+          // alignment is sufficient.
+        }
+    }
+
+    // If we have a memmove and the source operation is a constant global,
+    // then the source and dest pointers can't alias, so we can change this
+    // into a call to memcpy.
+    if (MemMoveInst *MMI = dyn_cast<MemMoveInst>(MI)) {
+      if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource()))
+        if (GVSrc->isConstant()) {
+          Module *M = CI.getParent()->getParent()->getParent();
+          Intrinsic::ID MemCpyID = Intrinsic::memcpy;
+          const Type *Tys[1];
+          Tys[0] = CI.getOperand(3)->getType();
+          CI.setOperand(0, 
+                        Intrinsic::getDeclaration(M, MemCpyID, Tys, 1));
+          Changed = true;
+        }
+
+      // memmove(x,x,size) -> noop.
+      if (MMI->getSource() == MMI->getDest())
+        return EraseInstFromFunction(CI);
+    }
+
+    // If we can determine a pointer alignment that is bigger than currently
+    // set, update the alignment.
+    if (isa<MemTransferInst>(MI)) {
+      if (Instruction *I = SimplifyMemTransfer(MI))
+        return I;
+    } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(MI)) {
+      if (Instruction *I = SimplifyMemSet(MSI))
+        return I;
+    }
+          
+    if (Changed) return II;
+  }
+  
+  switch (II->getIntrinsicID()) {
+  default: break;
+  case Intrinsic::bswap:
+    // bswap(bswap(x)) -> x
+    if (IntrinsicInst *Operand = dyn_cast<IntrinsicInst>(II->getOperand(1)))
+      if (Operand->getIntrinsicID() == Intrinsic::bswap)
+        return ReplaceInstUsesWith(CI, Operand->getOperand(1));
+    break;
+  case Intrinsic::ppc_altivec_lvx:
+  case Intrinsic::ppc_altivec_lvxl:
+  case Intrinsic::x86_sse_loadu_ps:
+  case Intrinsic::x86_sse2_loadu_pd:
+  case Intrinsic::x86_sse2_loadu_dq:
+    // Turn PPC lvx     -> load if the pointer is known aligned.
+    // Turn X86 loadups -> load if the pointer is known aligned.
+    if (GetOrEnforceKnownAlignment(II->getOperand(1), 16) >= 16) {
+      Value *Ptr = InsertBitCastBefore(II->getOperand(1),
+                                       PointerType::getUnqual(II->getType()),
+                                       CI);
+      return new LoadInst(Ptr);
+    }
+    break;
+  case Intrinsic::ppc_altivec_stvx:
+  case Intrinsic::ppc_altivec_stvxl:
+    // Turn stvx -> store if the pointer is known aligned.
+    if (GetOrEnforceKnownAlignment(II->getOperand(2), 16) >= 16) {
+      const Type *OpPtrTy = 
+        PointerType::getUnqual(II->getOperand(1)->getType());
+      Value *Ptr = InsertBitCastBefore(II->getOperand(2), OpPtrTy, CI);
+      return new StoreInst(II->getOperand(1), Ptr);
+    }
+    break;
+  case Intrinsic::x86_sse_storeu_ps:
+  case Intrinsic::x86_sse2_storeu_pd:
+  case Intrinsic::x86_sse2_storeu_dq:
+    // Turn X86 storeu -> store if the pointer is known aligned.
+    if (GetOrEnforceKnownAlignment(II->getOperand(1), 16) >= 16) {
+      const Type *OpPtrTy = 
+        PointerType::getUnqual(II->getOperand(2)->getType());
+      Value *Ptr = InsertBitCastBefore(II->getOperand(1), OpPtrTy, CI);
+      return new StoreInst(II->getOperand(2), Ptr);
+    }
+    break;
+    
+  case Intrinsic::x86_sse_cvttss2si: {
+    // These intrinsics only demands the 0th element of its input vector.  If
+    // we can simplify the input based on that, do so now.
+    unsigned VWidth =
+      cast<VectorType>(II->getOperand(1)->getType())->getNumElements();
+    APInt DemandedElts(VWidth, 1);
+    APInt UndefElts(VWidth, 0);
+    if (Value *V = SimplifyDemandedVectorElts(II->getOperand(1), DemandedElts,
+                                              UndefElts)) {
+      II->setOperand(1, V);
+      return II;
+    }
+    break;
+  }
+    
+  case Intrinsic::ppc_altivec_vperm:
+    // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
+    if (ConstantVector *Mask = dyn_cast<ConstantVector>(II->getOperand(3))) {
+      assert(Mask->getNumOperands() == 16 && "Bad type for intrinsic!");
+      
+      // Check that all of the elements are integer constants or undefs.
+      bool AllEltsOk = true;
+      for (unsigned i = 0; i != 16; ++i) {
+        if (!isa<ConstantInt>(Mask->getOperand(i)) && 
+            !isa<UndefValue>(Mask->getOperand(i))) {
+          AllEltsOk = false;
+          break;
+        }
+      }
+      
+      if (AllEltsOk) {
+        // Cast the input vectors to byte vectors.
+        Value *Op0 =InsertBitCastBefore(II->getOperand(1),Mask->getType(),CI);
+        Value *Op1 =InsertBitCastBefore(II->getOperand(2),Mask->getType(),CI);
+        Value *Result = UndefValue::get(Op0->getType());
+        
+        // Only extract each element once.
+        Value *ExtractedElts[32];
+        memset(ExtractedElts, 0, sizeof(ExtractedElts));
+        
+        for (unsigned i = 0; i != 16; ++i) {
+          if (isa<UndefValue>(Mask->getOperand(i)))
+            continue;
+          unsigned Idx=cast<ConstantInt>(Mask->getOperand(i))->getZExtValue();
+          Idx &= 31;  // Match the hardware behavior.
+          
+          if (ExtractedElts[Idx] == 0) {
+            Instruction *Elt = 
+              new ExtractElementInst(Idx < 16 ? Op0 : Op1, Idx&15, "tmp");
+            InsertNewInstBefore(Elt, CI);
+            ExtractedElts[Idx] = Elt;
+          }
+        
+          // Insert this value into the result vector.
+          Result = InsertElementInst::Create(Result, ExtractedElts[Idx],
+                                             i, "tmp");
+          InsertNewInstBefore(cast<Instruction>(Result), CI);
+        }
+        return CastInst::Create(Instruction::BitCast, Result, CI.getType());
+      }
+    }
+    break;
+
+  case Intrinsic::stackrestore: {
+    // If the save is right next to the restore, remove the restore.  This can
+    // happen when variable allocas are DCE'd.
+    if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getOperand(1))) {
+      if (SS->getIntrinsicID() == Intrinsic::stacksave) {
+        BasicBlock::iterator BI = SS;
+        if (&*++BI == II)
+          return EraseInstFromFunction(CI);
+      }
+    }
+    
+    // Scan down this block to see if there is another stack restore in the
+    // same block without an intervening call/alloca.
+    BasicBlock::iterator BI = II;
+    TerminatorInst *TI = II->getParent()->getTerminator();
+    bool CannotRemove = false;
+    for (++BI; &*BI != TI; ++BI) {
+      if (isa<AllocaInst>(BI)) {
+        CannotRemove = true;
+        break;
+      }
+      if (CallInst *BCI = dyn_cast<CallInst>(BI)) {
+        if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(BCI)) {
+          // If there is a stackrestore below this one, remove this one.
+          if (II->getIntrinsicID() == Intrinsic::stackrestore)
+            return EraseInstFromFunction(CI);
+          // Otherwise, ignore the intrinsic.
+        } else {
+          // If we found a non-intrinsic call, we can't remove the stack
+          // restore.
+          CannotRemove = true;
+          break;
+        }
+      }
+    }
+    
+    // If the stack restore is in a return/unwind block and if there are no
+    // allocas or calls between the restore and the return, nuke the restore.
+    if (!CannotRemove && (isa<ReturnInst>(TI) || isa<UnwindInst>(TI)))
+      return EraseInstFromFunction(CI);
+    break;
+  }
+  }
+
+  return visitCallSite(II);
+}
+
+// InvokeInst simplification
+//
+Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) {
+  return visitCallSite(&II);
+}
+
+/// isSafeToEliminateVarargsCast - If this cast does not affect the value 
+/// passed through the varargs area, we can eliminate the use of the cast.
+static bool isSafeToEliminateVarargsCast(const CallSite CS,
+                                         const CastInst * const CI,
+                                         const TargetData * const TD,
+                                         const int ix) {
+  if (!CI->isLosslessCast())
+    return false;
+
+  // The size of ByVal arguments is derived from the type, so we
+  // can't change to a type with a different size.  If the size were
+  // passed explicitly we could avoid this check.
+  if (!CS.paramHasAttr(ix, Attribute::ByVal))
+    return true;
+
+  const Type* SrcTy = 
+            cast<PointerType>(CI->getOperand(0)->getType())->getElementType();
+  const Type* DstTy = cast<PointerType>(CI->getType())->getElementType();
+  if (!SrcTy->isSized() || !DstTy->isSized())
+    return false;
+  if (TD->getTypeAllocSize(SrcTy) != TD->getTypeAllocSize(DstTy))
+    return false;
+  return true;
+}
+
+// visitCallSite - Improvements for call and invoke instructions.
+//
+Instruction *InstCombiner::visitCallSite(CallSite CS) {
+  bool Changed = false;
+
+  // If the callee is a constexpr cast of a function, attempt to move the cast
+  // to the arguments of the call/invoke.
+  if (transformConstExprCastCall(CS)) return 0;
+
+  Value *Callee = CS.getCalledValue();
+
+  if (Function *CalleeF = dyn_cast<Function>(Callee))
+    if (CalleeF->getCallingConv() != CS.getCallingConv()) {
+      Instruction *OldCall = CS.getInstruction();
+      // If the call and callee calling conventions don't match, this call must
+      // be unreachable, as the call is undefined.
+      new StoreInst(ConstantInt::getTrue(),
+                    UndefValue::get(PointerType::getUnqual(Type::Int1Ty)), 
+                                    OldCall);
+      if (!OldCall->use_empty())
+        OldCall->replaceAllUsesWith(UndefValue::get(OldCall->getType()));
+      if (isa<CallInst>(OldCall))   // Not worth removing an invoke here.
+        return EraseInstFromFunction(*OldCall);
+      return 0;
+    }
+
+  if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
+    // This instruction is not reachable, just remove it.  We insert a store to
+    // undef so that we know that this code is not reachable, despite the fact
+    // that we can't modify the CFG here.
+    new StoreInst(ConstantInt::getTrue(),
+                  UndefValue::get(PointerType::getUnqual(Type::Int1Ty)),
+                  CS.getInstruction());
+
+    if (!CS.getInstruction()->use_empty())
+      CS.getInstruction()->
+        replaceAllUsesWith(UndefValue::get(CS.getInstruction()->getType()));
+
+    if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) {
+      // Don't break the CFG, insert a dummy cond branch.
+      BranchInst::Create(II->getNormalDest(), II->getUnwindDest(),
+                         ConstantInt::getTrue(), II);
+    }
+    return EraseInstFromFunction(*CS.getInstruction());
+  }
+
+  if (BitCastInst *BC = dyn_cast<BitCastInst>(Callee))
+    if (IntrinsicInst *In = dyn_cast<IntrinsicInst>(BC->getOperand(0)))
+      if (In->getIntrinsicID() == Intrinsic::init_trampoline)
+        return transformCallThroughTrampoline(CS);
+
+  const PointerType *PTy = cast<PointerType>(Callee->getType());
+  const FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
+  if (FTy->isVarArg()) {
+    int ix = FTy->getNumParams() + (isa<InvokeInst>(Callee) ? 3 : 1);
+    // See if we can optimize any arguments passed through the varargs area of
+    // the call.
+    for (CallSite::arg_iterator I = CS.arg_begin()+FTy->getNumParams(),
+           E = CS.arg_end(); I != E; ++I, ++ix) {
+      CastInst *CI = dyn_cast<CastInst>(*I);
+      if (CI && isSafeToEliminateVarargsCast(CS, CI, TD, ix)) {
+        *I = CI->getOperand(0);
+        Changed = true;
+      }
+    }
+  }
+
+  if (isa<InlineAsm>(Callee) && !CS.doesNotThrow()) {
+    // Inline asm calls cannot throw - mark them 'nounwind'.
+    CS.setDoesNotThrow();
+    Changed = true;
+  }
+
+  return Changed ? CS.getInstruction() : 0;
+}
+
+// transformConstExprCastCall - If the callee is a constexpr cast of a function,
+// attempt to move the cast to the arguments of the call/invoke.
+//
+bool InstCombiner::transformConstExprCastCall(CallSite CS) {
+  if (!isa<ConstantExpr>(CS.getCalledValue())) return false;
+  ConstantExpr *CE = cast<ConstantExpr>(CS.getCalledValue());
+  if (CE->getOpcode() != Instruction::BitCast || 
+      !isa<Function>(CE->getOperand(0)))
+    return false;
+  Function *Callee = cast<Function>(CE->getOperand(0));
+  Instruction *Caller = CS.getInstruction();
+  const AttrListPtr &CallerPAL = CS.getAttributes();
+
+  // Okay, this is a cast from a function to a different type.  Unless doing so
+  // would cause a type conversion of one of our arguments, change this call to
+  // be a direct call with arguments casted to the appropriate types.
+  //
+  const FunctionType *FT = Callee->getFunctionType();
+  const Type *OldRetTy = Caller->getType();
+  const Type *NewRetTy = FT->getReturnType();
+
+  if (isa<StructType>(NewRetTy))
+    return false; // TODO: Handle multiple return values.
+
+  // Check to see if we are changing the return type...
+  if (OldRetTy != NewRetTy) {
+    if (Callee->isDeclaration() &&
+        // Conversion is ok if changing from one pointer type to another or from
+        // a pointer to an integer of the same size.
+        !((isa<PointerType>(OldRetTy) || OldRetTy == TD->getIntPtrType()) &&
+          (isa<PointerType>(NewRetTy) || NewRetTy == TD->getIntPtrType())))
+      return false;   // Cannot transform this return value.
+
+    if (!Caller->use_empty() &&
+        // void -> non-void is handled specially
+        NewRetTy != Type::VoidTy && !CastInst::isCastable(NewRetTy, OldRetTy))
+      return false;   // Cannot transform this return value.
+
+    if (!CallerPAL.isEmpty() && !Caller->use_empty()) {
+      Attributes RAttrs = CallerPAL.getRetAttributes();
+      if (RAttrs & Attribute::typeIncompatible(NewRetTy))
+        return false;   // Attribute not compatible with transformed value.
+    }
+
+    // If the callsite is an invoke instruction, and the return value is used by
+    // a PHI node in a successor, we cannot change the return type of the call
+    // because there is no place to put the cast instruction (without breaking
+    // the critical edge).  Bail out in this case.
+    if (!Caller->use_empty())
+      if (InvokeInst *II = dyn_cast<InvokeInst>(Caller))
+        for (Value::use_iterator UI = II->use_begin(), E = II->use_end();
+             UI != E; ++UI)
+          if (PHINode *PN = dyn_cast<PHINode>(*UI))
+            if (PN->getParent() == II->getNormalDest() ||
+                PN->getParent() == II->getUnwindDest())
+              return false;
+  }
+
+  unsigned NumActualArgs = unsigned(CS.arg_end()-CS.arg_begin());
+  unsigned NumCommonArgs = std::min(FT->getNumParams(), NumActualArgs);
+
+  CallSite::arg_iterator AI = CS.arg_begin();
+  for (unsigned i = 0, e = NumCommonArgs; i != e; ++i, ++AI) {
+    const Type *ParamTy = FT->getParamType(i);
+    const Type *ActTy = (*AI)->getType();
+
+    if (!CastInst::isCastable(ActTy, ParamTy))
+      return false;   // Cannot transform this parameter value.
+
+    if (CallerPAL.getParamAttributes(i + 1) 
+        & Attribute::typeIncompatible(ParamTy))
+      return false;   // Attribute not compatible with transformed value.
+
+    // Converting from one pointer type to another or between a pointer and an
+    // integer of the same size is safe even if we do not have a body.
+    bool isConvertible = ActTy == ParamTy ||
+      ((isa<PointerType>(ParamTy) || ParamTy == TD->getIntPtrType()) &&
+       (isa<PointerType>(ActTy) || ActTy == TD->getIntPtrType()));
+    if (Callee->isDeclaration() && !isConvertible) return false;
+  }
+
+  if (FT->getNumParams() < NumActualArgs && !FT->isVarArg() &&
+      Callee->isDeclaration())
+    return false;   // Do not delete arguments unless we have a function body.
+
+  if (FT->getNumParams() < NumActualArgs && FT->isVarArg() &&
+      !CallerPAL.isEmpty())
+    // In this case we have more arguments than the new function type, but we
+    // won't be dropping them.  Check that these extra arguments have attributes
+    // that are compatible with being a vararg call argument.
+    for (unsigned i = CallerPAL.getNumSlots(); i; --i) {
+      if (CallerPAL.getSlot(i - 1).Index <= FT->getNumParams())
+        break;
+      Attributes PAttrs = CallerPAL.getSlot(i - 1).Attrs;
+      if (PAttrs & Attribute::VarArgsIncompatible)
+        return false;
+    }
+
+  // Okay, we decided that this is a safe thing to do: go ahead and start
+  // inserting cast instructions as necessary...
+  std::vector<Value*> Args;
+  Args.reserve(NumActualArgs);
+  SmallVector<AttributeWithIndex, 8> attrVec;
+  attrVec.reserve(NumCommonArgs);
+
+  // Get any return attributes.
+  Attributes RAttrs = CallerPAL.getRetAttributes();
+
+  // If the return value is not being used, the type may not be compatible
+  // with the existing attributes.  Wipe out any problematic attributes.
+  RAttrs &= ~Attribute::typeIncompatible(NewRetTy);
+
+  // Add the new return attributes.
+  if (RAttrs)
+    attrVec.push_back(AttributeWithIndex::get(0, RAttrs));
+
+  AI = CS.arg_begin();
+  for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) {
+    const Type *ParamTy = FT->getParamType(i);
+    if ((*AI)->getType() == ParamTy) {
+      Args.push_back(*AI);
+    } else {
+      Instruction::CastOps opcode = CastInst::getCastOpcode(*AI,
+          false, ParamTy, false);
+      CastInst *NewCast = CastInst::Create(opcode, *AI, ParamTy, "tmp");
+      Args.push_back(InsertNewInstBefore(NewCast, *Caller));
+    }
+
+    // Add any parameter attributes.
+    if (Attributes PAttrs = CallerPAL.getParamAttributes(i + 1))
+      attrVec.push_back(AttributeWithIndex::get(i + 1, PAttrs));
+  }
+
+  // If the function takes more arguments than the call was taking, add them
+  // now...
+  for (unsigned i = NumCommonArgs; i != FT->getNumParams(); ++i)
+    Args.push_back(Constant::getNullValue(FT->getParamType(i)));
+
+  // If we are removing arguments to the function, emit an obnoxious warning...
+  if (FT->getNumParams() < NumActualArgs) {
+    if (!FT->isVarArg()) {
+      cerr << "WARNING: While resolving call to function '"
+           << Callee->getName() << "' arguments were dropped!\n";
+    } else {
+      // Add all of the arguments in their promoted form to the arg list...
+      for (unsigned i = FT->getNumParams(); i != NumActualArgs; ++i, ++AI) {
+        const Type *PTy = getPromotedType((*AI)->getType());
+        if (PTy != (*AI)->getType()) {
+          // Must promote to pass through va_arg area!
+          Instruction::CastOps opcode = CastInst::getCastOpcode(*AI, false, 
+                                                                PTy, false);
+          Instruction *Cast = CastInst::Create(opcode, *AI, PTy, "tmp");
+          InsertNewInstBefore(Cast, *Caller);
+          Args.push_back(Cast);
+        } else {
+          Args.push_back(*AI);
+        }
+
+        // Add any parameter attributes.
+        if (Attributes PAttrs = CallerPAL.getParamAttributes(i + 1))
+          attrVec.push_back(AttributeWithIndex::get(i + 1, PAttrs));
+      }
+    }
+  }
+
+  if (Attributes FnAttrs =  CallerPAL.getFnAttributes())
+    attrVec.push_back(AttributeWithIndex::get(~0, FnAttrs));
+
+  if (NewRetTy == Type::VoidTy)
+    Caller->setName("");   // Void type should not have a name.
+
+  const AttrListPtr &NewCallerPAL = AttrListPtr::get(attrVec.begin(),attrVec.end());
+
+  Instruction *NC;
+  if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
+    NC = InvokeInst::Create(Callee, II->getNormalDest(), II->getUnwindDest(),
+                            Args.begin(), Args.end(),
+                            Caller->getName(), Caller);
+    cast<InvokeInst>(NC)->setCallingConv(II->getCallingConv());
+    cast<InvokeInst>(NC)->setAttributes(NewCallerPAL);
+  } else {
+    NC = CallInst::Create(Callee, Args.begin(), Args.end(),
+                          Caller->getName(), Caller);
+    CallInst *CI = cast<CallInst>(Caller);
+    if (CI->isTailCall())
+      cast<CallInst>(NC)->setTailCall();
+    cast<CallInst>(NC)->setCallingConv(CI->getCallingConv());
+    cast<CallInst>(NC)->setAttributes(NewCallerPAL);
+  }
+
+  // Insert a cast of the return type as necessary.
+  Value *NV = NC;
+  if (OldRetTy != NV->getType() && !Caller->use_empty()) {
+    if (NV->getType() != Type::VoidTy) {
+      Instruction::CastOps opcode = CastInst::getCastOpcode(NC, false, 
+                                                            OldRetTy, false);
+      NV = NC = CastInst::Create(opcode, NC, OldRetTy, "tmp");
+
+      // If this is an invoke instruction, we should insert it after the first
+      // non-phi, instruction in the normal successor block.
+      if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
+        BasicBlock::iterator I = II->getNormalDest()->getFirstNonPHI();
+        InsertNewInstBefore(NC, *I);
+      } else {
+        // Otherwise, it's a call, just insert cast right after the call instr
+        InsertNewInstBefore(NC, *Caller);
+      }
+      AddUsersToWorkList(*Caller);
+    } else {
+      NV = UndefValue::get(Caller->getType());
+    }
+  }
+
+  if (Caller->getType() != Type::VoidTy && !Caller->use_empty())
+    Caller->replaceAllUsesWith(NV);
+  Caller->eraseFromParent();
+  RemoveFromWorkList(Caller);
+  return true;
+}
+
+// transformCallThroughTrampoline - Turn a call to a function created by the
+// init_trampoline intrinsic into a direct call to the underlying function.
+//
+Instruction *InstCombiner::transformCallThroughTrampoline(CallSite CS) {
+  Value *Callee = CS.getCalledValue();
+  const PointerType *PTy = cast<PointerType>(Callee->getType());
+  const FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
+  const AttrListPtr &Attrs = CS.getAttributes();
+
+  // If the call already has the 'nest' attribute somewhere then give up -
+  // otherwise 'nest' would occur twice after splicing in the chain.
+  if (Attrs.hasAttrSomewhere(Attribute::Nest))
+    return 0;
+
+  IntrinsicInst *Tramp =
+    cast<IntrinsicInst>(cast<BitCastInst>(Callee)->getOperand(0));
+
+  Function *NestF = cast<Function>(Tramp->getOperand(2)->stripPointerCasts());
+  const PointerType *NestFPTy = cast<PointerType>(NestF->getType());
+  const FunctionType *NestFTy = cast<FunctionType>(NestFPTy->getElementType());
+
+  const AttrListPtr &NestAttrs = NestF->getAttributes();
+  if (!NestAttrs.isEmpty()) {
+    unsigned NestIdx = 1;
+    const Type *NestTy = 0;
+    Attributes NestAttr = Attribute::None;
+
+    // Look for a parameter marked with the 'nest' attribute.
+    for (FunctionType::param_iterator I = NestFTy->param_begin(),
+         E = NestFTy->param_end(); I != E; ++NestIdx, ++I)
+      if (NestAttrs.paramHasAttr(NestIdx, Attribute::Nest)) {
+        // Record the parameter type and any other attributes.
+        NestTy = *I;
+        NestAttr = NestAttrs.getParamAttributes(NestIdx);
+        break;
+      }
+
+    if (NestTy) {
+      Instruction *Caller = CS.getInstruction();
+      std::vector<Value*> NewArgs;
+      NewArgs.reserve(unsigned(CS.arg_end()-CS.arg_begin())+1);
+
+      SmallVector<AttributeWithIndex, 8> NewAttrs;
+      NewAttrs.reserve(Attrs.getNumSlots() + 1);
+
+      // Insert the nest argument into the call argument list, which may
+      // mean appending it.  Likewise for attributes.
+
+      // Add any result attributes.
+      if (Attributes Attr = Attrs.getRetAttributes())
+        NewAttrs.push_back(AttributeWithIndex::get(0, Attr));
+
+      {
+        unsigned Idx = 1;
+        CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
+        do {
+          if (Idx == NestIdx) {
+            // Add the chain argument and attributes.
+            Value *NestVal = Tramp->getOperand(3);
+            if (NestVal->getType() != NestTy)
+              NestVal = new BitCastInst(NestVal, NestTy, "nest", Caller);
+            NewArgs.push_back(NestVal);
+            NewAttrs.push_back(AttributeWithIndex::get(NestIdx, NestAttr));
+          }
+
+          if (I == E)
+            break;
+
+          // Add the original argument and attributes.
+          NewArgs.push_back(*I);
+          if (Attributes Attr = Attrs.getParamAttributes(Idx))
+            NewAttrs.push_back
+              (AttributeWithIndex::get(Idx + (Idx >= NestIdx), Attr));
+
+          ++Idx, ++I;
+        } while (1);
+      }
+
+      // Add any function attributes.
+      if (Attributes Attr = Attrs.getFnAttributes())
+        NewAttrs.push_back(AttributeWithIndex::get(~0, Attr));
+
+      // The trampoline may have been bitcast to a bogus type (FTy).
+      // Handle this by synthesizing a new function type, equal to FTy
+      // with the chain parameter inserted.
+
+      std::vector<const Type*> NewTypes;
+      NewTypes.reserve(FTy->getNumParams()+1);
+
+      // Insert the chain's type into the list of parameter types, which may
+      // mean appending it.
+      {
+        unsigned Idx = 1;
+        FunctionType::param_iterator I = FTy->param_begin(),
+          E = FTy->param_end();
+
+        do {
+          if (Idx == NestIdx)
+            // Add the chain's type.
+            NewTypes.push_back(NestTy);
+
+          if (I == E)
+            break;
+
+          // Add the original type.
+          NewTypes.push_back(*I);
+
+          ++Idx, ++I;
+        } while (1);
+      }
+
+      // Replace the trampoline call with a direct call.  Let the generic
+      // code sort out any function type mismatches.
+      FunctionType *NewFTy =
+        FunctionType::get(FTy->getReturnType(), NewTypes, FTy->isVarArg());
+      Constant *NewCallee = NestF->getType() == PointerType::getUnqual(NewFTy) ?
+        NestF : ConstantExpr::getBitCast(NestF, PointerType::getUnqual(NewFTy));
+      const AttrListPtr &NewPAL = AttrListPtr::get(NewAttrs.begin(),NewAttrs.end());
+
+      Instruction *NewCaller;
+      if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
+        NewCaller = InvokeInst::Create(NewCallee,
+                                       II->getNormalDest(), II->getUnwindDest(),
+                                       NewArgs.begin(), NewArgs.end(),
+                                       Caller->getName(), Caller);
+        cast<InvokeInst>(NewCaller)->setCallingConv(II->getCallingConv());
+        cast<InvokeInst>(NewCaller)->setAttributes(NewPAL);
+      } else {
+        NewCaller = CallInst::Create(NewCallee, NewArgs.begin(), NewArgs.end(),
+                                     Caller->getName(), Caller);
+        if (cast<CallInst>(Caller)->isTailCall())
+          cast<CallInst>(NewCaller)->setTailCall();
+        cast<CallInst>(NewCaller)->
+          setCallingConv(cast<CallInst>(Caller)->getCallingConv());
+        cast<CallInst>(NewCaller)->setAttributes(NewPAL);
+      }
+      if (Caller->getType() != Type::VoidTy && !Caller->use_empty())
+        Caller->replaceAllUsesWith(NewCaller);
+      Caller->eraseFromParent();
+      RemoveFromWorkList(Caller);
+      return 0;
+    }
+  }
+
+  // Replace the trampoline call with a direct call.  Since there is no 'nest'
+  // parameter, there is no need to adjust the argument list.  Let the generic
+  // code sort out any function type mismatches.
+  Constant *NewCallee =
+    NestF->getType() == PTy ? NestF : ConstantExpr::getBitCast(NestF, PTy);
+  CS.setCalledFunction(NewCallee);
+  return CS.getInstruction();
+}
+
+/// FoldPHIArgBinOpIntoPHI - If we have something like phi [add (a,b), add(c,d)]
+/// and if a/b/c/d and the add's all have a single use, turn this into two phi's
+/// and a single binop.
+Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) {
+  Instruction *FirstInst = cast<Instruction>(PN.getIncomingValue(0));
+  assert(isa<BinaryOperator>(FirstInst) || isa<CmpInst>(FirstInst));
+  unsigned Opc = FirstInst->getOpcode();
+  Value *LHSVal = FirstInst->getOperand(0);
+  Value *RHSVal = FirstInst->getOperand(1);
+    
+  const Type *LHSType = LHSVal->getType();
+  const Type *RHSType = RHSVal->getType();
+  
+  // Scan to see if all operands are the same opcode, all have one use, and all
+  // kill their operands (i.e. the operands have one use).
+  for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) {
+    Instruction *I = dyn_cast<Instruction>(PN.getIncomingValue(i));
+    if (!I || I->getOpcode() != Opc || !I->hasOneUse() ||
+        // Verify type of the LHS matches so we don't fold cmp's of different
+        // types or GEP's with different index types.
+        I->getOperand(0)->getType() != LHSType ||
+        I->getOperand(1)->getType() != RHSType)
+      return 0;
+
+    // If they are CmpInst instructions, check their predicates
+    if (Opc == Instruction::ICmp || Opc == Instruction::FCmp)
+      if (cast<CmpInst>(I)->getPredicate() !=
+          cast<CmpInst>(FirstInst)->getPredicate())
+        return 0;
+    
+    // Keep track of which operand needs a phi node.
+    if (I->getOperand(0) != LHSVal) LHSVal = 0;
+    if (I->getOperand(1) != RHSVal) RHSVal = 0;
+  }
+  
+  // Otherwise, this is safe to transform!
+  
+  Value *InLHS = FirstInst->getOperand(0);
+  Value *InRHS = FirstInst->getOperand(1);
+  PHINode *NewLHS = 0, *NewRHS = 0;
+  if (LHSVal == 0) {
+    NewLHS = PHINode::Create(LHSType,
+                             FirstInst->getOperand(0)->getName() + ".pn");
+    NewLHS->reserveOperandSpace(PN.getNumOperands()/2);
+    NewLHS->addIncoming(InLHS, PN.getIncomingBlock(0));
+    InsertNewInstBefore(NewLHS, PN);
+    LHSVal = NewLHS;
+  }
+  
+  if (RHSVal == 0) {
+    NewRHS = PHINode::Create(RHSType,
+                             FirstInst->getOperand(1)->getName() + ".pn");
+    NewRHS->reserveOperandSpace(PN.getNumOperands()/2);
+    NewRHS->addIncoming(InRHS, PN.getIncomingBlock(0));
+    InsertNewInstBefore(NewRHS, PN);
+    RHSVal = NewRHS;
+  }
+  
+  // Add all operands to the new PHIs.
+  if (NewLHS || NewRHS) {
+    for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+      Instruction *InInst = cast<Instruction>(PN.getIncomingValue(i));
+      if (NewLHS) {
+        Value *NewInLHS = InInst->getOperand(0);
+        NewLHS->addIncoming(NewInLHS, PN.getIncomingBlock(i));
+      }
+      if (NewRHS) {
+        Value *NewInRHS = InInst->getOperand(1);
+        NewRHS->addIncoming(NewInRHS, PN.getIncomingBlock(i));
+      }
+    }
+  }
+    
+  if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(FirstInst))
+    return BinaryOperator::Create(BinOp->getOpcode(), LHSVal, RHSVal);
+  CmpInst *CIOp = cast<CmpInst>(FirstInst);
+  return CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(), LHSVal,
+                         RHSVal);
+}
+
+Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) {
+  GetElementPtrInst *FirstInst =cast<GetElementPtrInst>(PN.getIncomingValue(0));
+  
+  SmallVector<Value*, 16> FixedOperands(FirstInst->op_begin(), 
+                                        FirstInst->op_end());
+  // This is true if all GEP bases are allocas and if all indices into them are
+  // constants.
+  bool AllBasePointersAreAllocas = true;
+  
+  // Scan to see if all operands are the same opcode, all have one use, and all
+  // kill their operands (i.e. the operands have one use).
+  for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) {
+    GetElementPtrInst *GEP= dyn_cast<GetElementPtrInst>(PN.getIncomingValue(i));
+    if (!GEP || !GEP->hasOneUse() || GEP->getType() != FirstInst->getType() ||
+      GEP->getNumOperands() != FirstInst->getNumOperands())
+      return 0;
+
+    // Keep track of whether or not all GEPs are of alloca pointers.
+    if (AllBasePointersAreAllocas &&
+        (!isa<AllocaInst>(GEP->getOperand(0)) ||
+         !GEP->hasAllConstantIndices()))
+      AllBasePointersAreAllocas = false;
+    
+    // Compare the operand lists.
+    for (unsigned op = 0, e = FirstInst->getNumOperands(); op != e; ++op) {
+      if (FirstInst->getOperand(op) == GEP->getOperand(op))
+        continue;
+      
+      // Don't merge two GEPs when two operands differ (introducing phi nodes)
+      // if one of the PHIs has a constant for the index.  The index may be
+      // substantially cheaper to compute for the constants, so making it a
+      // variable index could pessimize the path.  This also handles the case
+      // for struct indices, which must always be constant.
+      if (isa<ConstantInt>(FirstInst->getOperand(op)) ||
+          isa<ConstantInt>(GEP->getOperand(op)))
+        return 0;
+      
+      if (FirstInst->getOperand(op)->getType() !=GEP->getOperand(op)->getType())
+        return 0;
+      FixedOperands[op] = 0;  // Needs a PHI.
+    }
+  }
+  
+  // If all of the base pointers of the PHI'd GEPs are from allocas, don't
+  // bother doing this transformation.  At best, this will just save a bit of
+  // offset calculation, but all the predecessors will have to materialize the
+  // stack address into a register anyway.  We'd actually rather *clone* the
+  // load up into the predecessors so that we have a load of a gep of an alloca,
+  // which can usually all be folded into the load.
+  if (AllBasePointersAreAllocas)
+    return 0;
+  
+  // Otherwise, this is safe to transform.  Insert PHI nodes for each operand
+  // that is variable.
+  SmallVector<PHINode*, 16> OperandPhis(FixedOperands.size());
+  
+  bool HasAnyPHIs = false;
+  for (unsigned i = 0, e = FixedOperands.size(); i != e; ++i) {
+    if (FixedOperands[i]) continue;  // operand doesn't need a phi.
+    Value *FirstOp = FirstInst->getOperand(i);
+    PHINode *NewPN = PHINode::Create(FirstOp->getType(),
+                                     FirstOp->getName()+".pn");
+    InsertNewInstBefore(NewPN, PN);
+    
+    NewPN->reserveOperandSpace(e);
+    NewPN->addIncoming(FirstOp, PN.getIncomingBlock(0));
+    OperandPhis[i] = NewPN;
+    FixedOperands[i] = NewPN;
+    HasAnyPHIs = true;
+  }
+
+  
+  // Add all operands to the new PHIs.
+  if (HasAnyPHIs) {
+    for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+      GetElementPtrInst *InGEP =cast<GetElementPtrInst>(PN.getIncomingValue(i));
+      BasicBlock *InBB = PN.getIncomingBlock(i);
+      
+      for (unsigned op = 0, e = OperandPhis.size(); op != e; ++op)
+        if (PHINode *OpPhi = OperandPhis[op])
+          OpPhi->addIncoming(InGEP->getOperand(op), InBB);
+    }
+  }
+  
+  Value *Base = FixedOperands[0];
+  return GetElementPtrInst::Create(Base, FixedOperands.begin()+1,
+                                   FixedOperands.end());
+}
+
+
+/// isSafeAndProfitableToSinkLoad - Return true if we know that it is safe to
+/// sink the load out of the block that defines it.  This means that it must be
+/// obvious the value of the load is not changed from the point of the load to
+/// the end of the block it is in.
+///
+/// Finally, it is safe, but not profitable, to sink a load targetting a
+/// non-address-taken alloca.  Doing so will cause us to not promote the alloca
+/// to a register.
+static bool isSafeAndProfitableToSinkLoad(LoadInst *L) {
+  BasicBlock::iterator BBI = L, E = L->getParent()->end();
+  
+  for (++BBI; BBI != E; ++BBI)
+    if (BBI->mayWriteToMemory())
+      return false;
+  
+  // Check for non-address taken alloca.  If not address-taken already, it isn't
+  // profitable to do this xform.
+  if (AllocaInst *AI = dyn_cast<AllocaInst>(L->getOperand(0))) {
+    bool isAddressTaken = false;
+    for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end();
+         UI != E; ++UI) {
+      if (isa<LoadInst>(UI)) continue;
+      if (StoreInst *SI = dyn_cast<StoreInst>(*UI)) {
+        // If storing TO the alloca, then the address isn't taken.
+        if (SI->getOperand(1) == AI) continue;
+      }
+      isAddressTaken = true;
+      break;
+    }
+    
+    if (!isAddressTaken && AI->isStaticAlloca())
+      return false;
+  }
+  
+  // If this load is a load from a GEP with a constant offset from an alloca,
+  // then we don't want to sink it.  In its present form, it will be
+  // load [constant stack offset].  Sinking it will cause us to have to
+  // materialize the stack addresses in each predecessor in a register only to
+  // do a shared load from register in the successor.
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(L->getOperand(0)))
+    if (AllocaInst *AI = dyn_cast<AllocaInst>(GEP->getOperand(0)))
+      if (AI->isStaticAlloca() && GEP->hasAllConstantIndices())
+        return false;
+  
+  return true;
+}
+
+
+// FoldPHIArgOpIntoPHI - If all operands to a PHI node are the same "unary"
+// operator and they all are only used by the PHI, PHI together their
+// inputs, and do the operation once, to the result of the PHI.
+Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
+  Instruction *FirstInst = cast<Instruction>(PN.getIncomingValue(0));
+
+  // Scan the instruction, looking for input operations that can be folded away.
+  // If all input operands to the phi are the same instruction (e.g. a cast from
+  // the same type or "+42") we can pull the operation through the PHI, reducing
+  // code size and simplifying code.
+  Constant *ConstantOp = 0;
+  const Type *CastSrcTy = 0;
+  bool isVolatile = false;
+  if (isa<CastInst>(FirstInst)) {
+    CastSrcTy = FirstInst->getOperand(0)->getType();
+  } else if (isa<BinaryOperator>(FirstInst) || isa<CmpInst>(FirstInst)) {
+    // Can fold binop, compare or shift here if the RHS is a constant, 
+    // otherwise call FoldPHIArgBinOpIntoPHI.
+    ConstantOp = dyn_cast<Constant>(FirstInst->getOperand(1));
+    if (ConstantOp == 0)
+      return FoldPHIArgBinOpIntoPHI(PN);
+  } else if (LoadInst *LI = dyn_cast<LoadInst>(FirstInst)) {
+    isVolatile = LI->isVolatile();
+    // We can't sink the load if the loaded value could be modified between the
+    // load and the PHI.
+    if (LI->getParent() != PN.getIncomingBlock(0) ||
+        !isSafeAndProfitableToSinkLoad(LI))
+      return 0;
+    
+    // If the PHI is of volatile loads and the load block has multiple
+    // successors, sinking it would remove a load of the volatile value from
+    // the path through the other successor.
+    if (isVolatile &&
+        LI->getParent()->getTerminator()->getNumSuccessors() != 1)
+      return 0;
+    
+  } else if (isa<GetElementPtrInst>(FirstInst)) {
+    return FoldPHIArgGEPIntoPHI(PN);
+  } else {
+    return 0;  // Cannot fold this operation.
+  }
+
+  // Check to see if all arguments are the same operation.
+  for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+    if (!isa<Instruction>(PN.getIncomingValue(i))) return 0;
+    Instruction *I = cast<Instruction>(PN.getIncomingValue(i));
+    if (!I->hasOneUse() || !I->isSameOperationAs(FirstInst))
+      return 0;
+    if (CastSrcTy) {
+      if (I->getOperand(0)->getType() != CastSrcTy)
+        return 0;  // Cast operation must match.
+    } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+      // We can't sink the load if the loaded value could be modified between 
+      // the load and the PHI.
+      if (LI->isVolatile() != isVolatile ||
+          LI->getParent() != PN.getIncomingBlock(i) ||
+          !isSafeAndProfitableToSinkLoad(LI))
+        return 0;
+      
+      // If the PHI is of volatile loads and the load block has multiple
+      // successors, sinking it would remove a load of the volatile value from
+      // the path through the other successor.
+      if (isVolatile &&
+          LI->getParent()->getTerminator()->getNumSuccessors() != 1)
+        return 0;
+      
+    } else if (I->getOperand(1) != ConstantOp) {
+      return 0;
+    }
+  }
+
+  // Okay, they are all the same operation.  Create a new PHI node of the
+  // correct type, and PHI together all of the LHS's of the instructions.
+  PHINode *NewPN = PHINode::Create(FirstInst->getOperand(0)->getType(),
+                                   PN.getName()+".in");
+  NewPN->reserveOperandSpace(PN.getNumOperands()/2);
+
+  Value *InVal = FirstInst->getOperand(0);
+  NewPN->addIncoming(InVal, PN.getIncomingBlock(0));
+
+  // Add all operands to the new PHI.
+  for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+    Value *NewInVal = cast<Instruction>(PN.getIncomingValue(i))->getOperand(0);
+    if (NewInVal != InVal)
+      InVal = 0;
+    NewPN->addIncoming(NewInVal, PN.getIncomingBlock(i));
+  }
+
+  Value *PhiVal;
+  if (InVal) {
+    // The new PHI unions all of the same values together.  This is really
+    // common, so we handle it intelligently here for compile-time speed.
+    PhiVal = InVal;
+    delete NewPN;
+  } else {
+    InsertNewInstBefore(NewPN, PN);
+    PhiVal = NewPN;
+  }
+
+  // Insert and return the new operation.
+  if (CastInst* FirstCI = dyn_cast<CastInst>(FirstInst))
+    return CastInst::Create(FirstCI->getOpcode(), PhiVal, PN.getType());
+  if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(FirstInst))
+    return BinaryOperator::Create(BinOp->getOpcode(), PhiVal, ConstantOp);
+  if (CmpInst *CIOp = dyn_cast<CmpInst>(FirstInst))
+    return CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(), 
+                           PhiVal, ConstantOp);
+  assert(isa<LoadInst>(FirstInst) && "Unknown operation");
+  
+  // If this was a volatile load that we are merging, make sure to loop through
+  // and mark all the input loads as non-volatile.  If we don't do this, we will
+  // insert a new volatile load and the old ones will not be deletable.
+  if (isVolatile)
+    for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
+      cast<LoadInst>(PN.getIncomingValue(i))->setVolatile(false);
+  
+  return new LoadInst(PhiVal, "", isVolatile);
+}
+
+/// DeadPHICycle - Return true if this PHI node is only used by a PHI node cycle
+/// that is dead.
+static bool DeadPHICycle(PHINode *PN,
+                         SmallPtrSet<PHINode*, 16> &PotentiallyDeadPHIs) {
+  if (PN->use_empty()) return true;
+  if (!PN->hasOneUse()) return false;
+
+  // Remember this node, and if we find the cycle, return.
+  if (!PotentiallyDeadPHIs.insert(PN))
+    return true;
+  
+  // Don't scan crazily complex things.
+  if (PotentiallyDeadPHIs.size() == 16)
+    return false;
+
+  if (PHINode *PU = dyn_cast<PHINode>(PN->use_back()))
+    return DeadPHICycle(PU, PotentiallyDeadPHIs);
+
+  return false;
+}
+
+/// PHIsEqualValue - Return true if this phi node is always equal to
+/// NonPhiInVal.  This happens with mutually cyclic phi nodes like:
+///   z = some value; x = phi (y, z); y = phi (x, z)
+static bool PHIsEqualValue(PHINode *PN, Value *NonPhiInVal, 
+                           SmallPtrSet<PHINode*, 16> &ValueEqualPHIs) {
+  // See if we already saw this PHI node.
+  if (!ValueEqualPHIs.insert(PN))
+    return true;
+  
+  // Don't scan crazily complex things.
+  if (ValueEqualPHIs.size() == 16)
+    return false;
+ 
+  // Scan the operands to see if they are either phi nodes or are equal to
+  // the value.
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+    Value *Op = PN->getIncomingValue(i);
+    if (PHINode *OpPN = dyn_cast<PHINode>(Op)) {
+      if (!PHIsEqualValue(OpPN, NonPhiInVal, ValueEqualPHIs))
+        return false;
+    } else if (Op != NonPhiInVal)
+      return false;
+  }
+  
+  return true;
+}
+
+
+// PHINode simplification
+//
+Instruction *InstCombiner::visitPHINode(PHINode &PN) {
+  // If LCSSA is around, don't mess with Phi nodes
+  if (MustPreserveLCSSA) return 0;
+  
+  if (Value *V = PN.hasConstantValue())
+    return ReplaceInstUsesWith(PN, V);
+
+  // If all PHI operands are the same operation, pull them through the PHI,
+  // reducing code size.
+  if (isa<Instruction>(PN.getIncomingValue(0)) &&
+      isa<Instruction>(PN.getIncomingValue(1)) &&
+      cast<Instruction>(PN.getIncomingValue(0))->getOpcode() ==
+      cast<Instruction>(PN.getIncomingValue(1))->getOpcode() &&
+      // FIXME: The hasOneUse check will fail for PHIs that use the value more
+      // than themselves more than once.
+      PN.getIncomingValue(0)->hasOneUse())
+    if (Instruction *Result = FoldPHIArgOpIntoPHI(PN))
+      return Result;
+
+  // If this is a trivial cycle in the PHI node graph, remove it.  Basically, if
+  // this PHI only has a single use (a PHI), and if that PHI only has one use (a
+  // PHI)... break the cycle.
+  if (PN.hasOneUse()) {
+    Instruction *PHIUser = cast<Instruction>(PN.use_back());
+    if (PHINode *PU = dyn_cast<PHINode>(PHIUser)) {
+      SmallPtrSet<PHINode*, 16> PotentiallyDeadPHIs;
+      PotentiallyDeadPHIs.insert(&PN);
+      if (DeadPHICycle(PU, PotentiallyDeadPHIs))
+        return ReplaceInstUsesWith(PN, UndefValue::get(PN.getType()));
+    }
+   
+    // If this phi has a single use, and if that use just computes a value for
+    // the next iteration of a loop, delete the phi.  This occurs with unused
+    // induction variables, e.g. "for (int j = 0; ; ++j);".  Detecting this
+    // common case here is good because the only other things that catch this
+    // are induction variable analysis (sometimes) and ADCE, which is only run
+    // late.
+    if (PHIUser->hasOneUse() &&
+        (isa<BinaryOperator>(PHIUser) || isa<GetElementPtrInst>(PHIUser)) &&
+        PHIUser->use_back() == &PN) {
+      return ReplaceInstUsesWith(PN, UndefValue::get(PN.getType()));
+    }
+  }
+
+  // We sometimes end up with phi cycles that non-obviously end up being the
+  // same value, for example:
+  //   z = some value; x = phi (y, z); y = phi (x, z)
+  // where the phi nodes don't necessarily need to be in the same block.  Do a
+  // quick check to see if the PHI node only contains a single non-phi value, if
+  // so, scan to see if the phi cycle is actually equal to that value.
+  {
+    unsigned InValNo = 0, NumOperandVals = PN.getNumIncomingValues();
+    // Scan for the first non-phi operand.
+    while (InValNo != NumOperandVals && 
+           isa<PHINode>(PN.getIncomingValue(InValNo)))
+      ++InValNo;
+
+    if (InValNo != NumOperandVals) {
+      Value *NonPhiInVal = PN.getOperand(InValNo);
+      
+      // Scan the rest of the operands to see if there are any conflicts, if so
+      // there is no need to recursively scan other phis.
+      for (++InValNo; InValNo != NumOperandVals; ++InValNo) {
+        Value *OpVal = PN.getIncomingValue(InValNo);
+        if (OpVal != NonPhiInVal && !isa<PHINode>(OpVal))
+          break;
+      }
+      
+      // If we scanned over all operands, then we have one unique value plus
+      // phi values.  Scan PHI nodes to see if they all merge in each other or
+      // the value.
+      if (InValNo == NumOperandVals) {
+        SmallPtrSet<PHINode*, 16> ValueEqualPHIs;
+        if (PHIsEqualValue(&PN, NonPhiInVal, ValueEqualPHIs))
+          return ReplaceInstUsesWith(PN, NonPhiInVal);
+      }
+    }
+  }
+  return 0;
+}
+
+static Value *InsertCastToIntPtrTy(Value *V, const Type *DTy,
+                                   Instruction *InsertPoint,
+                                   InstCombiner *IC) {
+  unsigned PtrSize = DTy->getPrimitiveSizeInBits();
+  unsigned VTySize = V->getType()->getPrimitiveSizeInBits();
+  // We must cast correctly to the pointer type. Ensure that we
+  // sign extend the integer value if it is smaller as this is
+  // used for address computation.
+  Instruction::CastOps opcode = 
+     (VTySize < PtrSize ? Instruction::SExt :
+      (VTySize == PtrSize ? Instruction::BitCast : Instruction::Trunc));
+  return IC->InsertCastBefore(opcode, V, DTy, *InsertPoint);
+}
+
+
+Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
+  Value *PtrOp = GEP.getOperand(0);
+  // Is it 'getelementptr %P, i32 0'  or 'getelementptr %P'
+  // If so, eliminate the noop.
+  if (GEP.getNumOperands() == 1)
+    return ReplaceInstUsesWith(GEP, PtrOp);
+
+  if (isa<UndefValue>(GEP.getOperand(0)))
+    return ReplaceInstUsesWith(GEP, UndefValue::get(GEP.getType()));
+
+  bool HasZeroPointerIndex = false;
+  if (Constant *C = dyn_cast<Constant>(GEP.getOperand(1)))
+    HasZeroPointerIndex = C->isNullValue();
+
+  if (GEP.getNumOperands() == 2 && HasZeroPointerIndex)
+    return ReplaceInstUsesWith(GEP, PtrOp);
+
+  // Eliminate unneeded casts for indices.
+  bool MadeChange = false;
+  
+  gep_type_iterator GTI = gep_type_begin(GEP);
+  for (User::op_iterator i = GEP.op_begin() + 1, e = GEP.op_end();
+       i != e; ++i, ++GTI) {
+    if (isa<SequentialType>(*GTI)) {
+      if (CastInst *CI = dyn_cast<CastInst>(*i)) {
+        if (CI->getOpcode() == Instruction::ZExt ||
+            CI->getOpcode() == Instruction::SExt) {
+          const Type *SrcTy = CI->getOperand(0)->getType();
+          // We can eliminate a cast from i32 to i64 iff the target 
+          // is a 32-bit pointer target.
+          if (SrcTy->getPrimitiveSizeInBits() >= TD->getPointerSizeInBits()) {
+            MadeChange = true;
+            *i = CI->getOperand(0);
+          }
+        }
+      }
+      // If we are using a wider index than needed for this platform, shrink it
+      // to what we need.  If narrower, sign-extend it to what we need.
+      // If the incoming value needs a cast instruction,
+      // insert it.  This explicit cast can make subsequent optimizations more
+      // obvious.
+      Value *Op = *i;
+      if (TD->getTypeSizeInBits(Op->getType()) > TD->getPointerSizeInBits()) {
+        if (Constant *C = dyn_cast<Constant>(Op)) {
+          *i = ConstantExpr::getTrunc(C, TD->getIntPtrType());
+          MadeChange = true;
+        } else {
+          Op = InsertCastBefore(Instruction::Trunc, Op, TD->getIntPtrType(),
+                                GEP);
+          *i = Op;
+          MadeChange = true;
+        }
+      } else if (TD->getTypeSizeInBits(Op->getType()) < TD->getPointerSizeInBits()) {
+        if (Constant *C = dyn_cast<Constant>(Op)) {
+          *i = ConstantExpr::getSExt(C, TD->getIntPtrType());
+          MadeChange = true;
+        } else {
+          Op = InsertCastBefore(Instruction::SExt, Op, TD->getIntPtrType(),
+                                GEP);
+          *i = Op;
+          MadeChange = true;
+        }
+      }
+    }
+  }
+  if (MadeChange) return &GEP;
+
+  // Combine Indices - If the source pointer to this getelementptr instruction
+  // is a getelementptr instruction, combine the indices of the two
+  // getelementptr instructions into a single instruction.
+  //
+  SmallVector<Value*, 8> SrcGEPOperands;
+  if (User *Src = dyn_castGetElementPtr(PtrOp))
+    SrcGEPOperands.append(Src->op_begin(), Src->op_end());
+
+  if (!SrcGEPOperands.empty()) {
+    // Note that if our source is a gep chain itself that we wait for that
+    // chain to be resolved before we perform this transformation.  This
+    // avoids us creating a TON of code in some cases.
+    //
+    if (isa<GetElementPtrInst>(SrcGEPOperands[0]) &&
+        cast<Instruction>(SrcGEPOperands[0])->getNumOperands() == 2)
+      return 0;   // Wait until our source is folded to completion.
+
+    SmallVector<Value*, 8> Indices;
+
+    // Find out whether the last index in the source GEP is a sequential idx.
+    bool EndsWithSequential = false;
+    for (gep_type_iterator I = gep_type_begin(*cast<User>(PtrOp)),
+           E = gep_type_end(*cast<User>(PtrOp)); I != E; ++I)
+      EndsWithSequential = !isa<StructType>(*I);
+
+    // Can we combine the two pointer arithmetics offsets?
+    if (EndsWithSequential) {
+      // Replace: gep (gep %P, long B), long A, ...
+      // With:    T = long A+B; gep %P, T, ...
+      //
+      Value *Sum, *SO1 = SrcGEPOperands.back(), *GO1 = GEP.getOperand(1);
+      if (SO1 == Constant::getNullValue(SO1->getType())) {
+        Sum = GO1;
+      } else if (GO1 == Constant::getNullValue(GO1->getType())) {
+        Sum = SO1;
+      } else {
+        // If they aren't the same type, convert both to an integer of the
+        // target's pointer size.
+        if (SO1->getType() != GO1->getType()) {
+          if (Constant *SO1C = dyn_cast<Constant>(SO1)) {
+            SO1 = ConstantExpr::getIntegerCast(SO1C, GO1->getType(), true);
+          } else if (Constant *GO1C = dyn_cast<Constant>(GO1)) {
+            GO1 = ConstantExpr::getIntegerCast(GO1C, SO1->getType(), true);
+          } else {
+            unsigned PS = TD->getPointerSizeInBits();
+            if (TD->getTypeSizeInBits(SO1->getType()) == PS) {
+              // Convert GO1 to SO1's type.
+              GO1 = InsertCastToIntPtrTy(GO1, SO1->getType(), &GEP, this);
+
+            } else if (TD->getTypeSizeInBits(GO1->getType()) == PS) {
+              // Convert SO1 to GO1's type.
+              SO1 = InsertCastToIntPtrTy(SO1, GO1->getType(), &GEP, this);
+            } else {
+              const Type *PT = TD->getIntPtrType();
+              SO1 = InsertCastToIntPtrTy(SO1, PT, &GEP, this);
+              GO1 = InsertCastToIntPtrTy(GO1, PT, &GEP, this);
+            }
+          }
+        }
+        if (isa<Constant>(SO1) && isa<Constant>(GO1))
+          Sum = ConstantExpr::getAdd(cast<Constant>(SO1), cast<Constant>(GO1));
+        else {
+          Sum = BinaryOperator::CreateAdd(SO1, GO1, PtrOp->getName()+".sum");
+          InsertNewInstBefore(cast<Instruction>(Sum), GEP);
+        }
+      }
+
+      // Recycle the GEP we already have if possible.
+      if (SrcGEPOperands.size() == 2) {
+        GEP.setOperand(0, SrcGEPOperands[0]);
+        GEP.setOperand(1, Sum);
+        return &GEP;
+      } else {
+        Indices.insert(Indices.end(), SrcGEPOperands.begin()+1,
+                       SrcGEPOperands.end()-1);
+        Indices.push_back(Sum);
+        Indices.insert(Indices.end(), GEP.op_begin()+2, GEP.op_end());
+      }
+    } else if (isa<Constant>(*GEP.idx_begin()) &&
+               cast<Constant>(*GEP.idx_begin())->isNullValue() &&
+               SrcGEPOperands.size() != 1) {
+      // Otherwise we can do the fold if the first index of the GEP is a zero
+      Indices.insert(Indices.end(), SrcGEPOperands.begin()+1,
+                     SrcGEPOperands.end());
+      Indices.insert(Indices.end(), GEP.idx_begin()+1, GEP.idx_end());
+    }
+
+    if (!Indices.empty())
+      return GetElementPtrInst::Create(SrcGEPOperands[0], Indices.begin(),
+                                       Indices.end(), GEP.getName());
+
+  } else if (GlobalValue *GV = dyn_cast<GlobalValue>(PtrOp)) {
+    // GEP of global variable.  If all of the indices for this GEP are
+    // constants, we can promote this to a constexpr instead of an instruction.
+
+    // Scan for nonconstants...
+    SmallVector<Constant*, 8> Indices;
+    User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end();
+    for (; I != E && isa<Constant>(*I); ++I)
+      Indices.push_back(cast<Constant>(*I));
+
+    if (I == E) {  // If they are all constants...
+      Constant *CE = ConstantExpr::getGetElementPtr(GV,
+                                                    &Indices[0],Indices.size());
+
+      // Replace all uses of the GEP with the new constexpr...
+      return ReplaceInstUsesWith(GEP, CE);
+    }
+  } else if (Value *X = getBitCastOperand(PtrOp)) {  // Is the operand a cast?
+    if (!isa<PointerType>(X->getType())) {
+      // Not interesting.  Source pointer must be a cast from pointer.
+    } else if (HasZeroPointerIndex) {
+      // transform: GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ...
+      // into     : GEP [10 x i8]* X, i32 0, ...
+      //
+      // Likewise, transform: GEP (bitcast i8* X to [0 x i8]*), i32 0, ...
+      //           into     : GEP i8* X, ...
+      // 
+      // This occurs when the program declares an array extern like "int X[];"
+      const PointerType *CPTy = cast<PointerType>(PtrOp->getType());
+      const PointerType *XTy = cast<PointerType>(X->getType());
+      if (const ArrayType *CATy =
+          dyn_cast<ArrayType>(CPTy->getElementType())) {
+        // GEP (bitcast i8* X to [0 x i8]*), i32 0, ... ?
+        if (CATy->getElementType() == XTy->getElementType()) {
+          // -> GEP i8* X, ...
+          SmallVector<Value*, 8> Indices(GEP.idx_begin()+1, GEP.idx_end());
+          return GetElementPtrInst::Create(X, Indices.begin(), Indices.end(),
+                                           GEP.getName());
+        } else if (const ArrayType *XATy =
+                 dyn_cast<ArrayType>(XTy->getElementType())) {
+          // GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ... ?
+          if (CATy->getElementType() == XATy->getElementType()) {
+            // -> GEP [10 x i8]* X, i32 0, ...
+            // At this point, we know that the cast source type is a pointer
+            // to an array of the same type as the destination pointer
+            // array.  Because the array type is never stepped over (there
+            // is a leading zero) we can fold the cast into this GEP.
+            GEP.setOperand(0, X);
+            return &GEP;
+          }
+        }
+      }
+    } else if (GEP.getNumOperands() == 2) {
+      // Transform things like:
+      // %t = getelementptr i32* bitcast ([2 x i32]* %str to i32*), i32 %V
+      // into:  %t1 = getelementptr [2 x i32]* %str, i32 0, i32 %V; bitcast
+      const Type *SrcElTy = cast<PointerType>(X->getType())->getElementType();
+      const Type *ResElTy=cast<PointerType>(PtrOp->getType())->getElementType();
+      if (isa<ArrayType>(SrcElTy) &&
+          TD->getTypeAllocSize(cast<ArrayType>(SrcElTy)->getElementType()) ==
+          TD->getTypeAllocSize(ResElTy)) {
+        Value *Idx[2];
+        Idx[0] = Constant::getNullValue(Type::Int32Ty);
+        Idx[1] = GEP.getOperand(1);
+        Value *V = InsertNewInstBefore(
+               GetElementPtrInst::Create(X, Idx, Idx + 2, GEP.getName()), GEP);
+        // V and GEP are both pointer types --> BitCast
+        return new BitCastInst(V, GEP.getType());
+      }
+      
+      // Transform things like:
+      // getelementptr i8* bitcast ([100 x double]* X to i8*), i32 %tmp
+      //   (where tmp = 8*tmp2) into:
+      // getelementptr [100 x double]* %arr, i32 0, i32 %tmp2; bitcast
+      
+      if (isa<ArrayType>(SrcElTy) && ResElTy == Type::Int8Ty) {
+        uint64_t ArrayEltSize =
+            TD->getTypeAllocSize(cast<ArrayType>(SrcElTy)->getElementType());
+        
+        // Check to see if "tmp" is a scale by a multiple of ArrayEltSize.  We
+        // allow either a mul, shift, or constant here.
+        Value *NewIdx = 0;
+        ConstantInt *Scale = 0;
+        if (ArrayEltSize == 1) {
+          NewIdx = GEP.getOperand(1);
+          Scale = ConstantInt::get(NewIdx->getType(), 1);
+        } else if (ConstantInt *CI = dyn_cast<ConstantInt>(GEP.getOperand(1))) {
+          NewIdx = ConstantInt::get(CI->getType(), 1);
+          Scale = CI;
+        } else if (Instruction *Inst =dyn_cast<Instruction>(GEP.getOperand(1))){
+          if (Inst->getOpcode() == Instruction::Shl &&
+              isa<ConstantInt>(Inst->getOperand(1))) {
+            ConstantInt *ShAmt = cast<ConstantInt>(Inst->getOperand(1));
+            uint32_t ShAmtVal = ShAmt->getLimitedValue(64);
+            Scale = ConstantInt::get(Inst->getType(), 1ULL << ShAmtVal);
+            NewIdx = Inst->getOperand(0);
+          } else if (Inst->getOpcode() == Instruction::Mul &&
+                     isa<ConstantInt>(Inst->getOperand(1))) {
+            Scale = cast<ConstantInt>(Inst->getOperand(1));
+            NewIdx = Inst->getOperand(0);
+          }
+        }
+        
+        // If the index will be to exactly the right offset with the scale taken
+        // out, perform the transformation. Note, we don't know whether Scale is
+        // signed or not. We'll use unsigned version of division/modulo
+        // operation after making sure Scale doesn't have the sign bit set.
+        if (ArrayEltSize && Scale && Scale->getSExtValue() >= 0LL &&
+            Scale->getZExtValue() % ArrayEltSize == 0) {
+          Scale = ConstantInt::get(Scale->getType(),
+                                   Scale->getZExtValue() / ArrayEltSize);
+          if (Scale->getZExtValue() != 1) {
+            Constant *C = ConstantExpr::getIntegerCast(Scale, NewIdx->getType(),
+                                                       false /*ZExt*/);
+            Instruction *Sc = BinaryOperator::CreateMul(NewIdx, C, "idxscale");
+            NewIdx = InsertNewInstBefore(Sc, GEP);
+          }
+
+          // Insert the new GEP instruction.
+          Value *Idx[2];
+          Idx[0] = Constant::getNullValue(Type::Int32Ty);
+          Idx[1] = NewIdx;
+          Instruction *NewGEP =
+            GetElementPtrInst::Create(X, Idx, Idx + 2, GEP.getName());
+          NewGEP = InsertNewInstBefore(NewGEP, GEP);
+          // The NewGEP must be pointer typed, so must the old one -> BitCast
+          return new BitCastInst(NewGEP, GEP.getType());
+        }
+      }
+    }
+  }
+  
+  /// See if we can simplify:
+  ///   X = bitcast A to B*
+  ///   Y = gep X, <...constant indices...>
+  /// into a gep of the original struct.  This is important for SROA and alias
+  /// analysis of unions.  If "A" is also a bitcast, wait for A/X to be merged.
+  if (BitCastInst *BCI = dyn_cast<BitCastInst>(PtrOp)) {
+    if (!isa<BitCastInst>(BCI->getOperand(0)) && GEP.hasAllConstantIndices()) {
+      // Determine how much the GEP moves the pointer.  We are guaranteed to get
+      // a constant back from EmitGEPOffset.
+      ConstantInt *OffsetV = cast<ConstantInt>(EmitGEPOffset(&GEP, GEP, *this));
+      int64_t Offset = OffsetV->getSExtValue();
+      
+      // If this GEP instruction doesn't move the pointer, just replace the GEP
+      // with a bitcast of the real input to the dest type.
+      if (Offset == 0) {
+        // If the bitcast is of an allocation, and the allocation will be
+        // converted to match the type of the cast, don't touch this.
+        if (isa<AllocationInst>(BCI->getOperand(0))) {
+          // See if the bitcast simplifies, if so, don't nuke this GEP yet.
+          if (Instruction *I = visitBitCast(*BCI)) {
+            if (I != BCI) {
+              I->takeName(BCI);
+              BCI->getParent()->getInstList().insert(BCI, I);
+              ReplaceInstUsesWith(*BCI, I);
+            }
+            return &GEP;
+          }
+        }
+        return new BitCastInst(BCI->getOperand(0), GEP.getType());
+      }
+      
+      // Otherwise, if the offset is non-zero, we need to find out if there is a
+      // field at Offset in 'A's type.  If so, we can pull the cast through the
+      // GEP.
+      SmallVector<Value*, 8> NewIndices;
+      const Type *InTy =
+        cast<PointerType>(BCI->getOperand(0)->getType())->getElementType();
+      if (FindElementAtOffset(InTy, Offset, NewIndices, TD)) {
+        Instruction *NGEP =
+           GetElementPtrInst::Create(BCI->getOperand(0), NewIndices.begin(),
+                                     NewIndices.end());
+        if (NGEP->getType() == GEP.getType()) return NGEP;
+        InsertNewInstBefore(NGEP, GEP);
+        NGEP->takeName(&GEP);
+        return new BitCastInst(NGEP, GEP.getType());
+      }
+    }
+  }    
+    
+  return 0;
+}
+
+Instruction *InstCombiner::visitAllocationInst(AllocationInst &AI) {
+  // Convert: malloc Ty, C - where C is a constant != 1 into: malloc [C x Ty], 1
+  if (AI.isArrayAllocation()) {  // Check C != 1
+    if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) {
+      const Type *NewTy = 
+        ArrayType::get(AI.getAllocatedType(), C->getZExtValue());
+      AllocationInst *New = 0;
+
+      // Create and insert the replacement instruction...
+      if (isa<MallocInst>(AI))
+        New = new MallocInst(NewTy, 0, AI.getAlignment(), AI.getName());
+      else {
+        assert(isa<AllocaInst>(AI) && "Unknown type of allocation inst!");
+        New = new AllocaInst(NewTy, 0, AI.getAlignment(), AI.getName());
+      }
+
+      InsertNewInstBefore(New, AI);
+
+      // Scan to the end of the allocation instructions, to skip over a block of
+      // allocas if possible...also skip interleaved debug info
+      //
+      BasicBlock::iterator It = New;
+      while (isa<AllocationInst>(*It) || isa<DbgInfoIntrinsic>(*It)) ++It;
+
+      // Now that I is pointing to the first non-allocation-inst in the block,
+      // insert our getelementptr instruction...
+      //
+      Value *NullIdx = Constant::getNullValue(Type::Int32Ty);
+      Value *Idx[2];
+      Idx[0] = NullIdx;
+      Idx[1] = NullIdx;
+      Value *V = GetElementPtrInst::Create(New, Idx, Idx + 2,
+                                           New->getName()+".sub", It);
+
+      // Now make everything use the getelementptr instead of the original
+      // allocation.
+      return ReplaceInstUsesWith(AI, V);
+    } else if (isa<UndefValue>(AI.getArraySize())) {
+      return ReplaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));
+    }
+  }
+
+  if (isa<AllocaInst>(AI) && AI.getAllocatedType()->isSized()) {
+    // If alloca'ing a zero byte object, replace the alloca with a null pointer.
+    // Note that we only do this for alloca's, because malloc should allocate
+    // and return a unique pointer, even for a zero byte allocation.
+    if (TD->getTypeAllocSize(AI.getAllocatedType()) == 0)
+      return ReplaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));
+
+    // If the alignment is 0 (unspecified), assign it the preferred alignment.
+    if (AI.getAlignment() == 0)
+      AI.setAlignment(TD->getPrefTypeAlignment(AI.getAllocatedType()));
+  }
+
+  return 0;
+}
+
+Instruction *InstCombiner::visitFreeInst(FreeInst &FI) {
+  Value *Op = FI.getOperand(0);
+
+  // free undef -> unreachable.
+  if (isa<UndefValue>(Op)) {
+    // Insert a new store to null because we cannot modify the CFG here.
+    new StoreInst(ConstantInt::getTrue(),
+                  UndefValue::get(PointerType::getUnqual(Type::Int1Ty)), &FI);
+    return EraseInstFromFunction(FI);
+  }
+  
+  // If we have 'free null' delete the instruction.  This can happen in stl code
+  // when lots of inlining happens.
+  if (isa<ConstantPointerNull>(Op))
+    return EraseInstFromFunction(FI);
+  
+  // Change free <ty>* (cast <ty2>* X to <ty>*) into free <ty2>* X
+  if (BitCastInst *CI = dyn_cast<BitCastInst>(Op)) {
+    FI.setOperand(0, CI->getOperand(0));
+    return &FI;
+  }
+  
+  // Change free (gep X, 0,0,0,0) into free(X)
+  if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) {
+    if (GEPI->hasAllZeroIndices()) {
+      AddToWorkList(GEPI);
+      FI.setOperand(0, GEPI->getOperand(0));
+      return &FI;
+    }
+  }
+  
+  // Change free(malloc) into nothing, if the malloc has a single use.
+  if (MallocInst *MI = dyn_cast<MallocInst>(Op))
+    if (MI->hasOneUse()) {
+      EraseInstFromFunction(FI);
+      return EraseInstFromFunction(*MI);
+    }
+
+  return 0;
+}
+
+
+/// InstCombineLoadCast - Fold 'load (cast P)' -> cast (load P)' when possible.
+static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI,
+                                        const TargetData *TD) {
+  User *CI = cast<User>(LI.getOperand(0));
+  Value *CastOp = CI->getOperand(0);
+
+  if (TD) {
+    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(CI)) {
+      // Instead of loading constant c string, use corresponding integer value
+      // directly if string length is small enough.
+      std::string Str;
+      if (GetConstantStringInfo(CE->getOperand(0), Str) && !Str.empty()) {
+        unsigned len = Str.length();
+        const Type *Ty = cast<PointerType>(CE->getType())->getElementType();
+        unsigned numBits = Ty->getPrimitiveSizeInBits();
+        // Replace LI with immediate integer store.
+        if ((numBits >> 3) == len + 1) {
+          APInt StrVal(numBits, 0);
+          APInt SingleChar(numBits, 0);
+          if (TD->isLittleEndian()) {
+            for (signed i = len-1; i >= 0; i--) {
+              SingleChar = (uint64_t) Str[i] & UCHAR_MAX;
+              StrVal = (StrVal << 8) | SingleChar;
+            }
+          } else {
+            for (unsigned i = 0; i < len; i++) {
+              SingleChar = (uint64_t) Str[i] & UCHAR_MAX;
+              StrVal = (StrVal << 8) | SingleChar;
+            }
+            // Append NULL at the end.
+            SingleChar = 0;
+            StrVal = (StrVal << 8) | SingleChar;
+          }
+          Value *NL = ConstantInt::get(StrVal);
+          return IC.ReplaceInstUsesWith(LI, NL);
+        }
+      }
+    }
+  }
+
+  const PointerType *DestTy = cast<PointerType>(CI->getType());
+  const Type *DestPTy = DestTy->getElementType();
+  if (const PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType())) {
+
+    // If the address spaces don't match, don't eliminate the cast.
+    if (DestTy->getAddressSpace() != SrcTy->getAddressSpace())
+      return 0;
+
+    const Type *SrcPTy = SrcTy->getElementType();
+
+    if (DestPTy->isInteger() || isa<PointerType>(DestPTy) || 
+         isa<VectorType>(DestPTy)) {
+      // If the source is an array, the code below will not succeed.  Check to
+      // see if a trivial 'gep P, 0, 0' will help matters.  Only do this for
+      // constants.
+      if (const ArrayType *ASrcTy = dyn_cast<ArrayType>(SrcPTy))
+        if (Constant *CSrc = dyn_cast<Constant>(CastOp))
+          if (ASrcTy->getNumElements() != 0) {
+            Value *Idxs[2];
+            Idxs[0] = Idxs[1] = Constant::getNullValue(Type::Int32Ty);
+            CastOp = ConstantExpr::getGetElementPtr(CSrc, Idxs, 2);
+            SrcTy = cast<PointerType>(CastOp->getType());
+            SrcPTy = SrcTy->getElementType();
+          }
+
+      if ((SrcPTy->isInteger() || isa<PointerType>(SrcPTy) || 
+            isa<VectorType>(SrcPTy)) &&
+          // Do not allow turning this into a load of an integer, which is then
+          // casted to a pointer, this pessimizes pointer analysis a lot.
+          (isa<PointerType>(SrcPTy) == isa<PointerType>(LI.getType())) &&
+          IC.getTargetData().getTypeSizeInBits(SrcPTy) ==
+               IC.getTargetData().getTypeSizeInBits(DestPTy)) {
+
+        // Okay, we are casting from one integer or pointer type to another of
+        // the same size.  Instead of casting the pointer before the load, cast
+        // the result of the loaded value.
+        Value *NewLoad = IC.InsertNewInstBefore(new LoadInst(CastOp,
+                                                             CI->getName(),
+                                                         LI.isVolatile()),LI);
+        // Now cast the result of the load.
+        return new BitCastInst(NewLoad, LI.getType());
+      }
+    }
+  }
+  return 0;
+}
+
+/// isSafeToLoadUnconditionally - Return true if we know that executing a load
+/// from this value cannot trap.  If it is not obviously safe to load from the
+/// specified pointer, we do a quick local scan of the basic block containing
+/// ScanFrom, to determine if the address is already accessed.
+static bool isSafeToLoadUnconditionally(Value *V, Instruction *ScanFrom) {
+  // If it is an alloca it is always safe to load from.
+  if (isa<AllocaInst>(V)) return true;
+
+  // If it is a global variable it is mostly safe to load from.
+  if (const GlobalValue *GV = dyn_cast<GlobalVariable>(V))
+    // Don't try to evaluate aliases.  External weak GV can be null.
+    return !isa<GlobalAlias>(GV) && !GV->hasExternalWeakLinkage();
+
+  // Otherwise, be a little bit agressive by scanning the local block where we
+  // want to check to see if the pointer is already being loaded or stored
+  // from/to.  If so, the previous load or store would have already trapped,
+  // so there is no harm doing an extra load (also, CSE will later eliminate
+  // the load entirely).
+  BasicBlock::iterator BBI = ScanFrom, E = ScanFrom->getParent()->begin();
+
+  while (BBI != E) {
+    --BBI;
+
+    // If we see a free or a call (which might do a free) the pointer could be
+    // marked invalid.
+    if (isa<FreeInst>(BBI) || 
+        (isa<CallInst>(BBI) && !isa<DbgInfoIntrinsic>(BBI)))
+      return false;
+    
+    if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) {
+      if (LI->getOperand(0) == V) return true;
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) {
+      if (SI->getOperand(1) == V) return true;
+    }
+
+  }
+  return false;
+}
+
+Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
+  Value *Op = LI.getOperand(0);
+
+  // Attempt to improve the alignment.
+  unsigned KnownAlign =
+    GetOrEnforceKnownAlignment(Op, TD->getPrefTypeAlignment(LI.getType()));
+  if (KnownAlign >
+      (LI.getAlignment() == 0 ? TD->getABITypeAlignment(LI.getType()) :
+                                LI.getAlignment()))
+    LI.setAlignment(KnownAlign);
+
+  // load (cast X) --> cast (load X) iff safe
+  if (isa<CastInst>(Op))
+    if (Instruction *Res = InstCombineLoadCast(*this, LI, TD))
+      return Res;
+
+  // None of the following transforms are legal for volatile loads.
+  if (LI.isVolatile()) return 0;
+  
+  // Do really simple store-to-load forwarding and load CSE, to catch cases
+  // where there are several consequtive memory accesses to the same location,
+  // separated by a few arithmetic operations.
+  BasicBlock::iterator BBI = &LI;
+  if (Value *AvailableVal = FindAvailableLoadedValue(Op, LI.getParent(), BBI,6))
+    return ReplaceInstUsesWith(LI, AvailableVal);
+
+  if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) {
+    const Value *GEPI0 = GEPI->getOperand(0);
+    // TODO: Consider a target hook for valid address spaces for this xform.
+    if (isa<ConstantPointerNull>(GEPI0) &&
+        cast<PointerType>(GEPI0->getType())->getAddressSpace() == 0) {
+      // Insert a new store to null instruction before the load to indicate
+      // that this code is not reachable.  We do this instead of inserting
+      // an unreachable instruction directly because we cannot modify the
+      // CFG.
+      new StoreInst(UndefValue::get(LI.getType()),
+                    Constant::getNullValue(Op->getType()), &LI);
+      return ReplaceInstUsesWith(LI, UndefValue::get(LI.getType()));
+    }
+  } 
+
+  if (Constant *C = dyn_cast<Constant>(Op)) {
+    // load null/undef -> undef
+    // TODO: Consider a target hook for valid address spaces for this xform.
+    if (isa<UndefValue>(C) || (C->isNullValue() && 
+        cast<PointerType>(Op->getType())->getAddressSpace() == 0)) {
+      // Insert a new store to null instruction before the load to indicate that
+      // this code is not reachable.  We do this instead of inserting an
+      // unreachable instruction directly because we cannot modify the CFG.
+      new StoreInst(UndefValue::get(LI.getType()),
+                    Constant::getNullValue(Op->getType()), &LI);
+      return ReplaceInstUsesWith(LI, UndefValue::get(LI.getType()));
+    }
+
+    // Instcombine load (constant global) into the value loaded.
+    if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Op))
+      if (GV->isConstant() && GV->hasDefinitiveInitializer())
+        return ReplaceInstUsesWith(LI, GV->getInitializer());
+
+    // Instcombine load (constantexpr_GEP global, 0, ...) into the value loaded.
+    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Op)) {
+      if (CE->getOpcode() == Instruction::GetElementPtr) {
+        if (GlobalVariable *GV = dyn_cast<GlobalVariable>(CE->getOperand(0)))
+          if (GV->isConstant() && GV->hasDefinitiveInitializer())
+            if (Constant *V = 
+               ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE))
+              return ReplaceInstUsesWith(LI, V);
+        if (CE->getOperand(0)->isNullValue()) {
+          // Insert a new store to null instruction before the load to indicate
+          // that this code is not reachable.  We do this instead of inserting
+          // an unreachable instruction directly because we cannot modify the
+          // CFG.
+          new StoreInst(UndefValue::get(LI.getType()),
+                        Constant::getNullValue(Op->getType()), &LI);
+          return ReplaceInstUsesWith(LI, UndefValue::get(LI.getType()));
+        }
+
+      } else if (CE->isCast()) {
+        if (Instruction *Res = InstCombineLoadCast(*this, LI, TD))
+          return Res;
+      }
+    }
+  }
+    
+  // If this load comes from anywhere in a constant global, and if the global
+  // is all undef or zero, we know what it loads.
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Op->getUnderlyingObject())){
+    if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
+      if (GV->getInitializer()->isNullValue())
+        return ReplaceInstUsesWith(LI, Constant::getNullValue(LI.getType()));
+      else if (isa<UndefValue>(GV->getInitializer()))
+        return ReplaceInstUsesWith(LI, UndefValue::get(LI.getType()));
+    }
+  }
+
+  if (Op->hasOneUse()) {
+    // Change select and PHI nodes to select values instead of addresses: this
+    // helps alias analysis out a lot, allows many others simplifications, and
+    // exposes redundancy in the code.
+    //
+    // Note that we cannot do the transformation unless we know that the
+    // introduced loads cannot trap!  Something like this is valid as long as
+    // the condition is always false: load (select bool %C, int* null, int* %G),
+    // but it would not be valid if we transformed it to load from null
+    // unconditionally.
+    //
+    if (SelectInst *SI = dyn_cast<SelectInst>(Op)) {
+      // load (select (Cond, &V1, &V2))  --> select(Cond, load &V1, load &V2).
+      if (isSafeToLoadUnconditionally(SI->getOperand(1), SI) &&
+          isSafeToLoadUnconditionally(SI->getOperand(2), SI)) {
+        Value *V1 = InsertNewInstBefore(new LoadInst(SI->getOperand(1),
+                                     SI->getOperand(1)->getName()+".val"), LI);
+        Value *V2 = InsertNewInstBefore(new LoadInst(SI->getOperand(2),
+                                     SI->getOperand(2)->getName()+".val"), LI);
+        return SelectInst::Create(SI->getCondition(), V1, V2);
+      }
+
+      // load (select (cond, null, P)) -> load P
+      if (Constant *C = dyn_cast<Constant>(SI->getOperand(1)))
+        if (C->isNullValue()) {
+          LI.setOperand(0, SI->getOperand(2));
+          return &LI;
+        }
+
+      // load (select (cond, P, null)) -> load P
+      if (Constant *C = dyn_cast<Constant>(SI->getOperand(2)))
+        if (C->isNullValue()) {
+          LI.setOperand(0, SI->getOperand(1));
+          return &LI;
+        }
+    }
+  }
+  return 0;
+}
+
+/// InstCombineStoreToCast - Fold store V, (cast P) -> store (cast V), P
+/// when possible.  This makes it generally easy to do alias analysis and/or
+/// SROA/mem2reg of the memory object.
+static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) {
+  User *CI = cast<User>(SI.getOperand(1));
+  Value *CastOp = CI->getOperand(0);
+
+  const Type *DestPTy = cast<PointerType>(CI->getType())->getElementType();
+  const PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType());
+  if (SrcTy == 0) return 0;
+  
+  const Type *SrcPTy = SrcTy->getElementType();
+
+  if (!DestPTy->isInteger() && !isa<PointerType>(DestPTy))
+    return 0;
+  
+  /// NewGEPIndices - If SrcPTy is an aggregate type, we can emit a "noop gep"
+  /// to its first element.  This allows us to handle things like:
+  ///   store i32 xxx, (bitcast {foo*, float}* %P to i32*)
+  /// on 32-bit hosts.
+  SmallVector<Value*, 4> NewGEPIndices;
+  
+  // If the source is an array, the code below will not succeed.  Check to
+  // see if a trivial 'gep P, 0, 0' will help matters.  Only do this for
+  // constants.
+  if (isa<ArrayType>(SrcPTy) || isa<StructType>(SrcPTy)) {
+    // Index through pointer.
+    Constant *Zero = Constant::getNullValue(Type::Int32Ty);
+    NewGEPIndices.push_back(Zero);
+    
+    while (1) {
+      if (const StructType *STy = dyn_cast<StructType>(SrcPTy)) {
+        if (!STy->getNumElements()) /* Struct can be empty {} */
+          break;
+        NewGEPIndices.push_back(Zero);
+        SrcPTy = STy->getElementType(0);
+      } else if (const ArrayType *ATy = dyn_cast<ArrayType>(SrcPTy)) {
+        NewGEPIndices.push_back(Zero);
+        SrcPTy = ATy->getElementType();
+      } else {
+        break;
+      }
+    }
+    
+    SrcTy = PointerType::get(SrcPTy, SrcTy->getAddressSpace());
+  }
+
+  if (!SrcPTy->isInteger() && !isa<PointerType>(SrcPTy))
+    return 0;
+  
+  // If the pointers point into different address spaces or if they point to
+  // values with different sizes, we can't do the transformation.
+  if (SrcTy->getAddressSpace() != 
+        cast<PointerType>(CI->getType())->getAddressSpace() ||
+      IC.getTargetData().getTypeSizeInBits(SrcPTy) !=
+      IC.getTargetData().getTypeSizeInBits(DestPTy))
+    return 0;
+
+  // Okay, we are casting from one integer or pointer type to another of
+  // the same size.  Instead of casting the pointer before 
+  // the store, cast the value to be stored.
+  Value *NewCast;
+  Value *SIOp0 = SI.getOperand(0);
+  Instruction::CastOps opcode = Instruction::BitCast;
+  const Type* CastSrcTy = SIOp0->getType();
+  const Type* CastDstTy = SrcPTy;
+  if (isa<PointerType>(CastDstTy)) {
+    if (CastSrcTy->isInteger())
+      opcode = Instruction::IntToPtr;
+  } else if (isa<IntegerType>(CastDstTy)) {
+    if (isa<PointerType>(SIOp0->getType()))
+      opcode = Instruction::PtrToInt;
+  }
+  
+  // SIOp0 is a pointer to aggregate and this is a store to the first field,
+  // emit a GEP to index into its first field.
+  if (!NewGEPIndices.empty()) {
+    if (Constant *C = dyn_cast<Constant>(CastOp))
+      CastOp = ConstantExpr::getGetElementPtr(C, &NewGEPIndices[0], 
+                                              NewGEPIndices.size());
+    else
+      CastOp = IC.InsertNewInstBefore(
+              GetElementPtrInst::Create(CastOp, NewGEPIndices.begin(),
+                                        NewGEPIndices.end()), SI);
+  }
+  
+  if (Constant *C = dyn_cast<Constant>(SIOp0))
+    NewCast = ConstantExpr::getCast(opcode, C, CastDstTy);
+  else
+    NewCast = IC.InsertNewInstBefore(
+      CastInst::Create(opcode, SIOp0, CastDstTy, SIOp0->getName()+".c"), 
+      SI);
+  return new StoreInst(NewCast, CastOp);
+}
+
+/// equivalentAddressValues - Test if A and B will obviously have the same
+/// value. This includes recognizing that %t0 and %t1 will have the same
+/// value in code like this:
+///   %t0 = getelementptr \@a, 0, 3
+///   store i32 0, i32* %t0
+///   %t1 = getelementptr \@a, 0, 3
+///   %t2 = load i32* %t1
+///
+static bool equivalentAddressValues(Value *A, Value *B) {
+  // Test if the values are trivially equivalent.
+  if (A == B) return true;
+  
+  // Test if the values come form identical arithmetic instructions.
+  if (isa<BinaryOperator>(A) ||
+      isa<CastInst>(A) ||
+      isa<PHINode>(A) ||
+      isa<GetElementPtrInst>(A))
+    if (Instruction *BI = dyn_cast<Instruction>(B))
+      if (cast<Instruction>(A)->isIdenticalTo(BI))
+        return true;
+  
+  // Otherwise they may not be equivalent.
+  return false;
+}
+
+// If this instruction has two uses, one of which is a llvm.dbg.declare,
+// return the llvm.dbg.declare.
+DbgDeclareInst *InstCombiner::hasOneUsePlusDeclare(Value *V) {
+  if (!V->hasNUses(2))
+    return 0;
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end();
+       UI != E; ++UI) {
+    if (DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(UI))
+      return DI;
+    if (isa<BitCastInst>(UI) && UI->hasOneUse()) {
+      if (DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(UI->use_begin()))
+        return DI;
+      }
+  }
+  return 0;
+}
+
+Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
+  Value *Val = SI.getOperand(0);
+  Value *Ptr = SI.getOperand(1);
+
+  if (isa<UndefValue>(Ptr)) {     // store X, undef -> noop (even if volatile)
+    EraseInstFromFunction(SI);
+    ++NumCombined;
+    return 0;
+  }
+  
+  // If the RHS is an alloca with a single use, zapify the store, making the
+  // alloca dead.
+  // If the RHS is an alloca with a two uses, the other one being a 
+  // llvm.dbg.declare, zapify the store and the declare, making the
+  // alloca dead.  We must do this to prevent declare's from affecting
+  // codegen.
+  if (!SI.isVolatile()) {
+    if (Ptr->hasOneUse()) {
+      if (isa<AllocaInst>(Ptr)) {
+        EraseInstFromFunction(SI);
+        ++NumCombined;
+        return 0;
+      }
+      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
+        if (isa<AllocaInst>(GEP->getOperand(0))) {
+          if (GEP->getOperand(0)->hasOneUse()) {
+            EraseInstFromFunction(SI);
+            ++NumCombined;
+            return 0;
+          }
+          if (DbgDeclareInst *DI = hasOneUsePlusDeclare(GEP->getOperand(0))) {
+            EraseInstFromFunction(*DI);
+            EraseInstFromFunction(SI);
+            ++NumCombined;
+            return 0;
+          }
+        }
+      }
+    }
+    if (DbgDeclareInst *DI = hasOneUsePlusDeclare(Ptr)) {
+      EraseInstFromFunction(*DI);
+      EraseInstFromFunction(SI);
+      ++NumCombined;
+      return 0;
+    }
+  }
+
+  // Attempt to improve the alignment.
+  unsigned KnownAlign =
+    GetOrEnforceKnownAlignment(Ptr, TD->getPrefTypeAlignment(Val->getType()));
+  if (KnownAlign >
+      (SI.getAlignment() == 0 ? TD->getABITypeAlignment(Val->getType()) :
+                                SI.getAlignment()))
+    SI.setAlignment(KnownAlign);
+
+  // Do really simple DSE, to catch cases where there are several consecutive
+  // stores to the same location, separated by a few arithmetic operations. This
+  // situation often occurs with bitfield accesses.
+  BasicBlock::iterator BBI = &SI;
+  for (unsigned ScanInsts = 6; BBI != SI.getParent()->begin() && ScanInsts;
+       --ScanInsts) {
+    --BBI;
+    // Don't count debug info directives, lest they affect codegen,
+    // and we skip pointer-to-pointer bitcasts, which are NOPs.
+    // It is necessary for correctness to skip those that feed into a
+    // llvm.dbg.declare, as these are not present when debugging is off.
+    if (isa<DbgInfoIntrinsic>(BBI) ||
+        (isa<BitCastInst>(BBI) && isa<PointerType>(BBI->getType()))) {
+      ScanInsts++;
+      continue;
+    }    
+    
+    if (StoreInst *PrevSI = dyn_cast<StoreInst>(BBI)) {
+      // Prev store isn't volatile, and stores to the same location?
+      if (!PrevSI->isVolatile() &&equivalentAddressValues(PrevSI->getOperand(1),
+                                                          SI.getOperand(1))) {
+        ++NumDeadStore;
+        ++BBI;
+        EraseInstFromFunction(*PrevSI);
+        continue;
+      }
+      break;
+    }
+    
+    // If this is a load, we have to stop.  However, if the loaded value is from
+    // the pointer we're loading and is producing the pointer we're storing,
+    // then *this* store is dead (X = load P; store X -> P).
+    if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) {
+      if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr) &&
+          !SI.isVolatile()) {
+        EraseInstFromFunction(SI);
+        ++NumCombined;
+        return 0;
+      }
+      // Otherwise, this is a load from some other location.  Stores before it
+      // may not be dead.
+      break;
+    }
+    
+    // Don't skip over loads or things that can modify memory.
+    if (BBI->mayWriteToMemory() || BBI->mayReadFromMemory())
+      break;
+  }
+  
+  
+  if (SI.isVolatile()) return 0;  // Don't hack volatile stores.
+
+  // store X, null    -> turns into 'unreachable' in SimplifyCFG
+  if (isa<ConstantPointerNull>(Ptr)) {
+    if (!isa<UndefValue>(Val)) {
+      SI.setOperand(0, UndefValue::get(Val->getType()));
+      if (Instruction *U = dyn_cast<Instruction>(Val))
+        AddToWorkList(U);  // Dropped a use.
+      ++NumCombined;
+    }
+    return 0;  // Do not modify these!
+  }
+
+  // store undef, Ptr -> noop
+  if (isa<UndefValue>(Val)) {
+    EraseInstFromFunction(SI);
+    ++NumCombined;
+    return 0;
+  }
+
+  // If the pointer destination is a cast, see if we can fold the cast into the
+  // source instead.
+  if (isa<CastInst>(Ptr))
+    if (Instruction *Res = InstCombineStoreToCast(*this, SI))
+      return Res;
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
+    if (CE->isCast())
+      if (Instruction *Res = InstCombineStoreToCast(*this, SI))
+        return Res;
+
+  
+  // If this store is the last instruction in the basic block (possibly
+  // excepting debug info instructions and the pointer bitcasts that feed
+  // into them), and if the block ends with an unconditional branch, try
+  // to move it to the successor block.
+  BBI = &SI; 
+  do {
+    ++BBI;
+  } while (isa<DbgInfoIntrinsic>(BBI) ||
+           (isa<BitCastInst>(BBI) && isa<PointerType>(BBI->getType())));
+  if (BranchInst *BI = dyn_cast<BranchInst>(BBI))
+    if (BI->isUnconditional())
+      if (SimplifyStoreAtEndOfBlock(SI))
+        return 0;  // xform done!
+  
+  return 0;
+}
+
+/// SimplifyStoreAtEndOfBlock - Turn things like:
+///   if () { *P = v1; } else { *P = v2 }
+/// into a phi node with a store in the successor.
+///
+/// Simplify things like:
+///   *P = v1; if () { *P = v2; }
+/// into a phi node with a store in the successor.
+///
+bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
+  BasicBlock *StoreBB = SI.getParent();
+  
+  // Check to see if the successor block has exactly two incoming edges.  If
+  // so, see if the other predecessor contains a store to the same location.
+  // if so, insert a PHI node (if needed) and move the stores down.
+  BasicBlock *DestBB = StoreBB->getTerminator()->getSuccessor(0);
+  
+  // Determine whether Dest has exactly two predecessors and, if so, compute
+  // the other predecessor.
+  pred_iterator PI = pred_begin(DestBB);
+  BasicBlock *OtherBB = 0;
+  if (*PI != StoreBB)
+    OtherBB = *PI;
+  ++PI;
+  if (PI == pred_end(DestBB))
+    return false;
+  
+  if (*PI != StoreBB) {
+    if (OtherBB)
+      return false;
+    OtherBB = *PI;
+  }
+  if (++PI != pred_end(DestBB))
+    return false;
+
+  // Bail out if all the relevant blocks aren't distinct (this can happen,
+  // for example, if SI is in an infinite loop)
+  if (StoreBB == DestBB || OtherBB == DestBB)
+    return false;
+
+  // Verify that the other block ends in a branch and is not otherwise empty.
+  BasicBlock::iterator BBI = OtherBB->getTerminator();
+  BranchInst *OtherBr = dyn_cast<BranchInst>(BBI);
+  if (!OtherBr || BBI == OtherBB->begin())
+    return false;
+  
+  // If the other block ends in an unconditional branch, check for the 'if then
+  // else' case.  there is an instruction before the branch.
+  StoreInst *OtherStore = 0;
+  if (OtherBr->isUnconditional()) {
+    --BBI;
+    // Skip over debugging info.
+    while (isa<DbgInfoIntrinsic>(BBI) ||
+           (isa<BitCastInst>(BBI) && isa<PointerType>(BBI->getType()))) {
+      if (BBI==OtherBB->begin())
+        return false;
+      --BBI;
+    }
+    // If this isn't a store, or isn't a store to the same location, bail out.
+    OtherStore = dyn_cast<StoreInst>(BBI);
+    if (!OtherStore || OtherStore->getOperand(1) != SI.getOperand(1))
+      return false;
+  } else {
+    // Otherwise, the other block ended with a conditional branch. If one of the
+    // destinations is StoreBB, then we have the if/then case.
+    if (OtherBr->getSuccessor(0) != StoreBB && 
+        OtherBr->getSuccessor(1) != StoreBB)
+      return false;
+    
+    // Okay, we know that OtherBr now goes to Dest and StoreBB, so this is an
+    // if/then triangle.  See if there is a store to the same ptr as SI that
+    // lives in OtherBB.
+    for (;; --BBI) {
+      // Check to see if we find the matching store.
+      if ((OtherStore = dyn_cast<StoreInst>(BBI))) {
+        if (OtherStore->getOperand(1) != SI.getOperand(1))
+          return false;
+        break;
+      }
+      // If we find something that may be using or overwriting the stored
+      // value, or if we run out of instructions, we can't do the xform.
+      if (BBI->mayReadFromMemory() || BBI->mayWriteToMemory() ||
+          BBI == OtherBB->begin())
+        return false;
+    }
+    
+    // In order to eliminate the store in OtherBr, we have to
+    // make sure nothing reads or overwrites the stored value in
+    // StoreBB.
+    for (BasicBlock::iterator I = StoreBB->begin(); &*I != &SI; ++I) {
+      // FIXME: This should really be AA driven.
+      if (I->mayReadFromMemory() || I->mayWriteToMemory())
+        return false;
+    }
+  }
+  
+  // Insert a PHI node now if we need it.
+  Value *MergedVal = OtherStore->getOperand(0);
+  if (MergedVal != SI.getOperand(0)) {
+    PHINode *PN = PHINode::Create(MergedVal->getType(), "storemerge");
+    PN->reserveOperandSpace(2);
+    PN->addIncoming(SI.getOperand(0), SI.getParent());
+    PN->addIncoming(OtherStore->getOperand(0), OtherBB);
+    MergedVal = InsertNewInstBefore(PN, DestBB->front());
+  }
+  
+  // Advance to a place where it is safe to insert the new store and
+  // insert it.
+  BBI = DestBB->getFirstNonPHI();
+  InsertNewInstBefore(new StoreInst(MergedVal, SI.getOperand(1),
+                                    OtherStore->isVolatile()), *BBI);
+  
+  // Nuke the old stores.
+  EraseInstFromFunction(SI);
+  EraseInstFromFunction(*OtherStore);
+  ++NumCombined;
+  return true;
+}
+
+
+Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
+  // Change br (not X), label True, label False to: br X, label False, True
+  Value *X = 0;
+  BasicBlock *TrueDest;
+  BasicBlock *FalseDest;
+  if (match(&BI, m_Br(m_Not(m_Value(X)), TrueDest, FalseDest)) &&
+      !isa<Constant>(X)) {
+    // Swap Destinations and condition...
+    BI.setCondition(X);
+    BI.setSuccessor(0, FalseDest);
+    BI.setSuccessor(1, TrueDest);
+    return &BI;
+  }
+
+  // Cannonicalize fcmp_one -> fcmp_oeq
+  FCmpInst::Predicate FPred; Value *Y;
+  if (match(&BI, m_Br(m_FCmp(FPred, m_Value(X), m_Value(Y)), 
+                             TrueDest, FalseDest)))
+    if ((FPred == FCmpInst::FCMP_ONE || FPred == FCmpInst::FCMP_OLE ||
+         FPred == FCmpInst::FCMP_OGE) && BI.getCondition()->hasOneUse()) {
+      FCmpInst *I = cast<FCmpInst>(BI.getCondition());
+      FCmpInst::Predicate NewPred = FCmpInst::getInversePredicate(FPred);
+      Instruction *NewSCC = new FCmpInst(NewPred, X, Y, "", I);
+      NewSCC->takeName(I);
+      // Swap Destinations and condition...
+      BI.setCondition(NewSCC);
+      BI.setSuccessor(0, FalseDest);
+      BI.setSuccessor(1, TrueDest);
+      RemoveFromWorkList(I);
+      I->eraseFromParent();
+      AddToWorkList(NewSCC);
+      return &BI;
+    }
+
+  // Cannonicalize icmp_ne -> icmp_eq
+  ICmpInst::Predicate IPred;
+  if (match(&BI, m_Br(m_ICmp(IPred, m_Value(X), m_Value(Y)),
+                      TrueDest, FalseDest)))
+    if ((IPred == ICmpInst::ICMP_NE  || IPred == ICmpInst::ICMP_ULE ||
+         IPred == ICmpInst::ICMP_SLE || IPred == ICmpInst::ICMP_UGE ||
+         IPred == ICmpInst::ICMP_SGE) && BI.getCondition()->hasOneUse()) {
+      ICmpInst *I = cast<ICmpInst>(BI.getCondition());
+      ICmpInst::Predicate NewPred = ICmpInst::getInversePredicate(IPred);
+      Instruction *NewSCC = new ICmpInst(NewPred, X, Y, "", I);
+      NewSCC->takeName(I);
+      // Swap Destinations and condition...
+      BI.setCondition(NewSCC);
+      BI.setSuccessor(0, FalseDest);
+      BI.setSuccessor(1, TrueDest);
+      RemoveFromWorkList(I);
+      I->eraseFromParent();;
+      AddToWorkList(NewSCC);
+      return &BI;
+    }
+
+  return 0;
+}
+
+Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
+  Value *Cond = SI.getCondition();
+  if (Instruction *I = dyn_cast<Instruction>(Cond)) {
+    if (I->getOpcode() == Instruction::Add)
+      if (ConstantInt *AddRHS = dyn_cast<ConstantInt>(I->getOperand(1))) {
+        // change 'switch (X+4) case 1:' into 'switch (X) case -3'
+        for (unsigned i = 2, e = SI.getNumOperands(); i != e; i += 2)
+          SI.setOperand(i,ConstantExpr::getSub(cast<Constant>(SI.getOperand(i)),
+                                                AddRHS));
+        SI.setOperand(0, I->getOperand(0));
+        AddToWorkList(I);
+        return &SI;
+      }
+  }
+  return 0;
+}
+
+Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
+  Value *Agg = EV.getAggregateOperand();
+
+  if (!EV.hasIndices())
+    return ReplaceInstUsesWith(EV, Agg);
+
+  if (Constant *C = dyn_cast<Constant>(Agg)) {
+    if (isa<UndefValue>(C))
+      return ReplaceInstUsesWith(EV, UndefValue::get(EV.getType()));
+      
+    if (isa<ConstantAggregateZero>(C))
+      return ReplaceInstUsesWith(EV, Constant::getNullValue(EV.getType()));
+
+    if (isa<ConstantArray>(C) || isa<ConstantStruct>(C)) {
+      // Extract the element indexed by the first index out of the constant
+      Value *V = C->getOperand(*EV.idx_begin());
+      if (EV.getNumIndices() > 1)
+        // Extract the remaining indices out of the constant indexed by the
+        // first index
+        return ExtractValueInst::Create(V, EV.idx_begin() + 1, EV.idx_end());
+      else
+        return ReplaceInstUsesWith(EV, V);
+    }
+    return 0; // Can't handle other constants
+  } 
+  if (InsertValueInst *IV = dyn_cast<InsertValueInst>(Agg)) {
+    // We're extracting from an insertvalue instruction, compare the indices
+    const unsigned *exti, *exte, *insi, *inse;
+    for (exti = EV.idx_begin(), insi = IV->idx_begin(),
+         exte = EV.idx_end(), inse = IV->idx_end();
+         exti != exte && insi != inse;
+         ++exti, ++insi) {
+      if (*insi != *exti)
+        // The insert and extract both reference distinctly different elements.
+        // This means the extract is not influenced by the insert, and we can
+        // replace the aggregate operand of the extract with the aggregate
+        // operand of the insert. i.e., replace
+        // %I = insertvalue { i32, { i32 } } %A, { i32 } { i32 42 }, 1
+        // %E = extractvalue { i32, { i32 } } %I, 0
+        // with
+        // %E = extractvalue { i32, { i32 } } %A, 0
+        return ExtractValueInst::Create(IV->getAggregateOperand(),
+                                        EV.idx_begin(), EV.idx_end());
+    }
+    if (exti == exte && insi == inse)
+      // Both iterators are at the end: Index lists are identical. Replace
+      // %B = insertvalue { i32, { i32 } } %A, i32 42, 1, 0
+      // %C = extractvalue { i32, { i32 } } %B, 1, 0
+      // with "i32 42"
+      return ReplaceInstUsesWith(EV, IV->getInsertedValueOperand());
+    if (exti == exte) {
+      // The extract list is a prefix of the insert list. i.e. replace
+      // %I = insertvalue { i32, { i32 } } %A, i32 42, 1, 0
+      // %E = extractvalue { i32, { i32 } } %I, 1
+      // with
+      // %X = extractvalue { i32, { i32 } } %A, 1
+      // %E = insertvalue { i32 } %X, i32 42, 0
+      // by switching the order of the insert and extract (though the
+      // insertvalue should be left in, since it may have other uses).
+      Value *NewEV = InsertNewInstBefore(
+        ExtractValueInst::Create(IV->getAggregateOperand(),
+                                 EV.idx_begin(), EV.idx_end()),
+        EV);
+      return InsertValueInst::Create(NewEV, IV->getInsertedValueOperand(),
+                                     insi, inse);
+    }
+    if (insi == inse)
+      // The insert list is a prefix of the extract list
+      // We can simply remove the common indices from the extract and make it
+      // operate on the inserted value instead of the insertvalue result.
+      // i.e., replace
+      // %I = insertvalue { i32, { i32 } } %A, { i32 } { i32 42 }, 1
+      // %E = extractvalue { i32, { i32 } } %I, 1, 0
+      // with
+      // %E extractvalue { i32 } { i32 42 }, 0
+      return ExtractValueInst::Create(IV->getInsertedValueOperand(), 
+                                      exti, exte);
+  }
+  // Can't simplify extracts from other values. Note that nested extracts are
+  // already simplified implicitely by the above (extract ( extract (insert) )
+  // will be translated into extract ( insert ( extract ) ) first and then just
+  // the value inserted, if appropriate).
+  return 0;
+}
+
+/// CheapToScalarize - Return true if the value is cheaper to scalarize than it
+/// is to leave as a vector operation.
+static bool CheapToScalarize(Value *V, bool isConstant) {
+  if (isa<ConstantAggregateZero>(V)) 
+    return true;
+  if (ConstantVector *C = dyn_cast<ConstantVector>(V)) {
+    if (isConstant) return true;
+    // If all elts are the same, we can extract.
+    Constant *Op0 = C->getOperand(0);
+    for (unsigned i = 1; i < C->getNumOperands(); ++i)
+      if (C->getOperand(i) != Op0)
+        return false;
+    return true;
+  }
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) return false;
+  
+  // Insert element gets simplified to the inserted element or is deleted if
+  // this is constant idx extract element and its a constant idx insertelt.
+  if (I->getOpcode() == Instruction::InsertElement && isConstant &&
+      isa<ConstantInt>(I->getOperand(2)))
+    return true;
+  if (I->getOpcode() == Instruction::Load && I->hasOneUse())
+    return true;
+  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I))
+    if (BO->hasOneUse() &&
+        (CheapToScalarize(BO->getOperand(0), isConstant) ||
+         CheapToScalarize(BO->getOperand(1), isConstant)))
+      return true;
+  if (CmpInst *CI = dyn_cast<CmpInst>(I))
+    if (CI->hasOneUse() &&
+        (CheapToScalarize(CI->getOperand(0), isConstant) ||
+         CheapToScalarize(CI->getOperand(1), isConstant)))
+      return true;
+  
+  return false;
+}
+
+/// Read and decode a shufflevector mask.
+///
+/// It turns undef elements into values that are larger than the number of
+/// elements in the input.
+static std::vector<unsigned> getShuffleMask(const ShuffleVectorInst *SVI) {
+  unsigned NElts = SVI->getType()->getNumElements();
+  if (isa<ConstantAggregateZero>(SVI->getOperand(2)))
+    return std::vector<unsigned>(NElts, 0);
+  if (isa<UndefValue>(SVI->getOperand(2)))
+    return std::vector<unsigned>(NElts, 2*NElts);
+
+  std::vector<unsigned> Result;
+  const ConstantVector *CP = cast<ConstantVector>(SVI->getOperand(2));
+  for (User::const_op_iterator i = CP->op_begin(), e = CP->op_end(); i!=e; ++i)
+    if (isa<UndefValue>(*i))
+      Result.push_back(NElts*2);  // undef -> 8
+    else
+      Result.push_back(cast<ConstantInt>(*i)->getZExtValue());
+  return Result;
+}
+
+/// FindScalarElement - Given a vector and an element number, see if the scalar
+/// value is already around as a register, for example if it were inserted then
+/// extracted from the vector.
+static Value *FindScalarElement(Value *V, unsigned EltNo) {
+  assert(isa<VectorType>(V->getType()) && "Not looking at a vector?");
+  const VectorType *PTy = cast<VectorType>(V->getType());
+  unsigned Width = PTy->getNumElements();
+  if (EltNo >= Width)  // Out of range access.
+    return UndefValue::get(PTy->getElementType());
+  
+  if (isa<UndefValue>(V))
+    return UndefValue::get(PTy->getElementType());
+  else if (isa<ConstantAggregateZero>(V))
+    return Constant::getNullValue(PTy->getElementType());
+  else if (ConstantVector *CP = dyn_cast<ConstantVector>(V))
+    return CP->getOperand(EltNo);
+  else if (InsertElementInst *III = dyn_cast<InsertElementInst>(V)) {
+    // If this is an insert to a variable element, we don't know what it is.
+    if (!isa<ConstantInt>(III->getOperand(2))) 
+      return 0;
+    unsigned IIElt = cast<ConstantInt>(III->getOperand(2))->getZExtValue();
+    
+    // If this is an insert to the element we are looking for, return the
+    // inserted value.
+    if (EltNo == IIElt) 
+      return III->getOperand(1);
+    
+    // Otherwise, the insertelement doesn't modify the value, recurse on its
+    // vector input.
+    return FindScalarElement(III->getOperand(0), EltNo);
+  } else if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V)) {
+    unsigned LHSWidth =
+      cast<VectorType>(SVI->getOperand(0)->getType())->getNumElements();
+    unsigned InEl = getShuffleMask(SVI)[EltNo];
+    if (InEl < LHSWidth)
+      return FindScalarElement(SVI->getOperand(0), InEl);
+    else if (InEl < LHSWidth*2)
+      return FindScalarElement(SVI->getOperand(1), InEl - LHSWidth);
+    else
+      return UndefValue::get(PTy->getElementType());
+  }
+  
+  // Otherwise, we don't know.
+  return 0;
+}
+
+Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
+  // If vector val is undef, replace extract with scalar undef.
+  if (isa<UndefValue>(EI.getOperand(0)))
+    return ReplaceInstUsesWith(EI, UndefValue::get(EI.getType()));
+
+  // If vector val is constant 0, replace extract with scalar 0.
+  if (isa<ConstantAggregateZero>(EI.getOperand(0)))
+    return ReplaceInstUsesWith(EI, Constant::getNullValue(EI.getType()));
+  
+  if (ConstantVector *C = dyn_cast<ConstantVector>(EI.getOperand(0))) {
+    // If vector val is constant with all elements the same, replace EI with
+    // that element. When the elements are not identical, we cannot replace yet
+    // (we do that below, but only when the index is constant).
+    Constant *op0 = C->getOperand(0);
+    for (unsigned i = 1; i < C->getNumOperands(); ++i)
+      if (C->getOperand(i) != op0) {
+        op0 = 0; 
+        break;
+      }
+    if (op0)
+      return ReplaceInstUsesWith(EI, op0);
+  }
+  
+  // If extracting a specified index from the vector, see if we can recursively
+  // find a previously computed scalar that was inserted into the vector.
+  if (ConstantInt *IdxC = dyn_cast<ConstantInt>(EI.getOperand(1))) {
+    unsigned IndexVal = IdxC->getZExtValue();
+    unsigned VectorWidth = 
+      cast<VectorType>(EI.getOperand(0)->getType())->getNumElements();
+      
+    // If this is extracting an invalid index, turn this into undef, to avoid
+    // crashing the code below.
+    if (IndexVal >= VectorWidth)
+      return ReplaceInstUsesWith(EI, UndefValue::get(EI.getType()));
+    
+    // This instruction only demands the single element from the input vector.
+    // If the input vector has a single use, simplify it based on this use
+    // property.
+    if (EI.getOperand(0)->hasOneUse() && VectorWidth != 1) {
+      APInt UndefElts(VectorWidth, 0);
+      APInt DemandedMask(VectorWidth, 1 << IndexVal);
+      if (Value *V = SimplifyDemandedVectorElts(EI.getOperand(0),
+                                                DemandedMask, UndefElts)) {
+        EI.setOperand(0, V);
+        return &EI;
+      }
+    }
+    
+    if (Value *Elt = FindScalarElement(EI.getOperand(0), IndexVal))
+      return ReplaceInstUsesWith(EI, Elt);
+    
+    // If the this extractelement is directly using a bitcast from a vector of
+    // the same number of elements, see if we can find the source element from
+    // it.  In this case, we will end up needing to bitcast the scalars.
+    if (BitCastInst *BCI = dyn_cast<BitCastInst>(EI.getOperand(0))) {
+      if (const VectorType *VT = 
+              dyn_cast<VectorType>(BCI->getOperand(0)->getType()))
+        if (VT->getNumElements() == VectorWidth)
+          if (Value *Elt = FindScalarElement(BCI->getOperand(0), IndexVal))
+            return new BitCastInst(Elt, EI.getType());
+    }
+  }
+  
+  if (Instruction *I = dyn_cast<Instruction>(EI.getOperand(0))) {
+    if (I->hasOneUse()) {
+      // Push extractelement into predecessor operation if legal and
+      // profitable to do so
+      if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
+        bool isConstantElt = isa<ConstantInt>(EI.getOperand(1));
+        if (CheapToScalarize(BO, isConstantElt)) {
+          ExtractElementInst *newEI0 = 
+            new ExtractElementInst(BO->getOperand(0), EI.getOperand(1),
+                                   EI.getName()+".lhs");
+          ExtractElementInst *newEI1 =
+            new ExtractElementInst(BO->getOperand(1), EI.getOperand(1),
+                                   EI.getName()+".rhs");
+          InsertNewInstBefore(newEI0, EI);
+          InsertNewInstBefore(newEI1, EI);
+          return BinaryOperator::Create(BO->getOpcode(), newEI0, newEI1);
+        }
+      } else if (isa<LoadInst>(I)) {
+        unsigned AS = 
+          cast<PointerType>(I->getOperand(0)->getType())->getAddressSpace();
+        Value *Ptr = InsertBitCastBefore(I->getOperand(0),
+                                         PointerType::get(EI.getType(), AS),EI);
+        GetElementPtrInst *GEP =
+          GetElementPtrInst::Create(Ptr, EI.getOperand(1), I->getName()+".gep");
+        InsertNewInstBefore(GEP, EI);
+        return new LoadInst(GEP);
+      }
+    }
+    if (InsertElementInst *IE = dyn_cast<InsertElementInst>(I)) {
+      // Extracting the inserted element?
+      if (IE->getOperand(2) == EI.getOperand(1))
+        return ReplaceInstUsesWith(EI, IE->getOperand(1));
+      // If the inserted and extracted elements are constants, they must not
+      // be the same value, extract from the pre-inserted value instead.
+      if (isa<Constant>(IE->getOperand(2)) &&
+          isa<Constant>(EI.getOperand(1))) {
+        AddUsesToWorkList(EI);
+        EI.setOperand(0, IE->getOperand(0));
+        return &EI;
+      }
+    } else if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I)) {
+      // If this is extracting an element from a shufflevector, figure out where
+      // it came from and extract from the appropriate input element instead.
+      if (ConstantInt *Elt = dyn_cast<ConstantInt>(EI.getOperand(1))) {
+        unsigned SrcIdx = getShuffleMask(SVI)[Elt->getZExtValue()];
+        Value *Src;
+        unsigned LHSWidth =
+          cast<VectorType>(SVI->getOperand(0)->getType())->getNumElements();
+
+        if (SrcIdx < LHSWidth)
+          Src = SVI->getOperand(0);
+        else if (SrcIdx < LHSWidth*2) {
+          SrcIdx -= LHSWidth;
+          Src = SVI->getOperand(1);
+        } else {
+          return ReplaceInstUsesWith(EI, UndefValue::get(EI.getType()));
+        }
+        return new ExtractElementInst(Src, SrcIdx);
+      }
+    }
+  }
+  return 0;
+}
+
+/// CollectSingleShuffleElements - If V is a shuffle of values that ONLY returns
+/// elements from either LHS or RHS, return the shuffle mask and true. 
+/// Otherwise, return false.
+static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
+                                         std::vector<Constant*> &Mask) {
+  assert(V->getType() == LHS->getType() && V->getType() == RHS->getType() &&
+         "Invalid CollectSingleShuffleElements");
+  unsigned NumElts = cast<VectorType>(V->getType())->getNumElements();
+
+  if (isa<UndefValue>(V)) {
+    Mask.assign(NumElts, UndefValue::get(Type::Int32Ty));
+    return true;
+  } else if (V == LHS) {
+    for (unsigned i = 0; i != NumElts; ++i)
+      Mask.push_back(ConstantInt::get(Type::Int32Ty, i));
+    return true;
+  } else if (V == RHS) {
+    for (unsigned i = 0; i != NumElts; ++i)
+      Mask.push_back(ConstantInt::get(Type::Int32Ty, i+NumElts));
+    return true;
+  } else if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {
+    // If this is an insert of an extract from some other vector, include it.
+    Value *VecOp    = IEI->getOperand(0);
+    Value *ScalarOp = IEI->getOperand(1);
+    Value *IdxOp    = IEI->getOperand(2);
+    
+    if (!isa<ConstantInt>(IdxOp))
+      return false;
+    unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
+    
+    if (isa<UndefValue>(ScalarOp)) {  // inserting undef into vector.
+      // Okay, we can handle this if the vector we are insertinting into is
+      // transitively ok.
+      if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
+        // If so, update the mask to reflect the inserted undef.
+        Mask[InsertedIdx] = UndefValue::get(Type::Int32Ty);
+        return true;
+      }      
+    } else if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)){
+      if (isa<ConstantInt>(EI->getOperand(1)) &&
+          EI->getOperand(0)->getType() == V->getType()) {
+        unsigned ExtractedIdx =
+          cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
+        
+        // This must be extracting from either LHS or RHS.
+        if (EI->getOperand(0) == LHS || EI->getOperand(0) == RHS) {
+          // Okay, we can handle this if the vector we are insertinting into is
+          // transitively ok.
+          if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
+            // If so, update the mask to reflect the inserted value.
+            if (EI->getOperand(0) == LHS) {
+              Mask[InsertedIdx % NumElts] = 
+                 ConstantInt::get(Type::Int32Ty, ExtractedIdx);
+            } else {
+              assert(EI->getOperand(0) == RHS);
+              Mask[InsertedIdx % NumElts] = 
+                ConstantInt::get(Type::Int32Ty, ExtractedIdx+NumElts);
+              
+            }
+            return true;
+          }
+        }
+      }
+    }
+  }
+  // TODO: Handle shufflevector here!
+  
+  return false;
+}
+
+/// CollectShuffleElements - We are building a shuffle of V, using RHS as the
+/// RHS of the shuffle instruction, if it is not null.  Return a shuffle mask
+/// that computes V and the LHS value of the shuffle.
+static Value *CollectShuffleElements(Value *V, std::vector<Constant*> &Mask,
+                                     Value *&RHS) {
+  assert(isa<VectorType>(V->getType()) && 
+         (RHS == 0 || V->getType() == RHS->getType()) &&
+         "Invalid shuffle!");
+  unsigned NumElts = cast<VectorType>(V->getType())->getNumElements();
+
+  if (isa<UndefValue>(V)) {
+    Mask.assign(NumElts, UndefValue::get(Type::Int32Ty));
+    return V;
+  } else if (isa<ConstantAggregateZero>(V)) {
+    Mask.assign(NumElts, ConstantInt::get(Type::Int32Ty, 0));
+    return V;
+  } else if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {
+    // If this is an insert of an extract from some other vector, include it.
+    Value *VecOp    = IEI->getOperand(0);
+    Value *ScalarOp = IEI->getOperand(1);
+    Value *IdxOp    = IEI->getOperand(2);
+    
+    if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) {
+      if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp) &&
+          EI->getOperand(0)->getType() == V->getType()) {
+        unsigned ExtractedIdx =
+          cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
+        unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
+        
+        // Either the extracted from or inserted into vector must be RHSVec,
+        // otherwise we'd end up with a shuffle of three inputs.
+        if (EI->getOperand(0) == RHS || RHS == 0) {
+          RHS = EI->getOperand(0);
+          Value *V = CollectShuffleElements(VecOp, Mask, RHS);
+          Mask[InsertedIdx % NumElts] = 
+            ConstantInt::get(Type::Int32Ty, NumElts+ExtractedIdx);
+          return V;
+        }
+        
+        if (VecOp == RHS) {
+          Value *V = CollectShuffleElements(EI->getOperand(0), Mask, RHS);
+          // Everything but the extracted element is replaced with the RHS.
+          for (unsigned i = 0; i != NumElts; ++i) {
+            if (i != InsertedIdx)
+              Mask[i] = ConstantInt::get(Type::Int32Ty, NumElts+i);
+          }
+          return V;
+        }
+        
+        // If this insertelement is a chain that comes from exactly these two
+        // vectors, return the vector and the effective shuffle.
+        if (CollectSingleShuffleElements(IEI, EI->getOperand(0), RHS, Mask))
+          return EI->getOperand(0);
+        
+      }
+    }
+  }
+  // TODO: Handle shufflevector here!
+  
+  // Otherwise, can't do anything fancy.  Return an identity vector.
+  for (unsigned i = 0; i != NumElts; ++i)
+    Mask.push_back(ConstantInt::get(Type::Int32Ty, i));
+  return V;
+}
+
+Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
+  Value *VecOp    = IE.getOperand(0);
+  Value *ScalarOp = IE.getOperand(1);
+  Value *IdxOp    = IE.getOperand(2);
+  
+  // Inserting an undef or into an undefined place, remove this.
+  if (isa<UndefValue>(ScalarOp) || isa<UndefValue>(IdxOp))
+    ReplaceInstUsesWith(IE, VecOp);
+  
+  // If the inserted element was extracted from some other vector, and if the 
+  // indexes are constant, try to turn this into a shufflevector operation.
+  if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) {
+    if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp) &&
+        EI->getOperand(0)->getType() == IE.getType()) {
+      unsigned NumVectorElts = IE.getType()->getNumElements();
+      unsigned ExtractedIdx =
+        cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
+      unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
+      
+      if (ExtractedIdx >= NumVectorElts) // Out of range extract.
+        return ReplaceInstUsesWith(IE, VecOp);
+      
+      if (InsertedIdx >= NumVectorElts)  // Out of range insert.
+        return ReplaceInstUsesWith(IE, UndefValue::get(IE.getType()));
+      
+      // If we are extracting a value from a vector, then inserting it right
+      // back into the same place, just use the input vector.
+      if (EI->getOperand(0) == VecOp && ExtractedIdx == InsertedIdx)
+        return ReplaceInstUsesWith(IE, VecOp);      
+      
+      // We could theoretically do this for ANY input.  However, doing so could
+      // turn chains of insertelement instructions into a chain of shufflevector
+      // instructions, and right now we do not merge shufflevectors.  As such,
+      // only do this in a situation where it is clear that there is benefit.
+      if (isa<UndefValue>(VecOp) || isa<ConstantAggregateZero>(VecOp)) {
+        // Turn this into shuffle(EIOp0, VecOp, Mask).  The result has all of
+        // the values of VecOp, except then one read from EIOp0.
+        // Build a new shuffle mask.
+        std::vector<Constant*> Mask;
+        if (isa<UndefValue>(VecOp))
+          Mask.assign(NumVectorElts, UndefValue::get(Type::Int32Ty));
+        else {
+          assert(isa<ConstantAggregateZero>(VecOp) && "Unknown thing");
+          Mask.assign(NumVectorElts, ConstantInt::get(Type::Int32Ty,
+                                                       NumVectorElts));
+        } 
+        Mask[InsertedIdx] = ConstantInt::get(Type::Int32Ty, ExtractedIdx);
+        return new ShuffleVectorInst(EI->getOperand(0), VecOp,
+                                     ConstantVector::get(Mask));
+      }
+      
+      // If this insertelement isn't used by some other insertelement, turn it
+      // (and any insertelements it points to), into one big shuffle.
+      if (!IE.hasOneUse() || !isa<InsertElementInst>(IE.use_back())) {
+        std::vector<Constant*> Mask;
+        Value *RHS = 0;
+        Value *LHS = CollectShuffleElements(&IE, Mask, RHS);
+        if (RHS == 0) RHS = UndefValue::get(LHS->getType());
+        // We now have a shuffle of LHS, RHS, Mask.
+        return new ShuffleVectorInst(LHS, RHS, ConstantVector::get(Mask));
+      }
+    }
+  }
+
+  return 0;
+}
+
+
+Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
+  Value *LHS = SVI.getOperand(0);
+  Value *RHS = SVI.getOperand(1);
+  std::vector<unsigned> Mask = getShuffleMask(&SVI);
+
+  bool MadeChange = false;
+
+  // Undefined shuffle mask -> undefined value.
+  if (isa<UndefValue>(SVI.getOperand(2)))
+    return ReplaceInstUsesWith(SVI, UndefValue::get(SVI.getType()));
+
+  unsigned VWidth = cast<VectorType>(SVI.getType())->getNumElements();
+
+  if (VWidth != cast<VectorType>(LHS->getType())->getNumElements())
+    return 0;
+
+  APInt UndefElts(VWidth, 0);
+  APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+  if (SimplifyDemandedVectorElts(&SVI, AllOnesEltMask, UndefElts)) {
+    LHS = SVI.getOperand(0);
+    RHS = SVI.getOperand(1);
+    MadeChange = true;
+  }
+  
+  // Canonicalize shuffle(x    ,x,mask) -> shuffle(x, undef,mask')
+  // Canonicalize shuffle(undef,x,mask) -> shuffle(x, undef,mask').
+  if (LHS == RHS || isa<UndefValue>(LHS)) {
+    if (isa<UndefValue>(LHS) && LHS == RHS) {
+      // shuffle(undef,undef,mask) -> undef.
+      return ReplaceInstUsesWith(SVI, LHS);
+    }
+    
+    // Remap any references to RHS to use LHS.
+    std::vector<Constant*> Elts;
+    for (unsigned i = 0, e = Mask.size(); i != e; ++i) {
+      if (Mask[i] >= 2*e)
+        Elts.push_back(UndefValue::get(Type::Int32Ty));
+      else {
+        if ((Mask[i] >= e && isa<UndefValue>(RHS)) ||
+            (Mask[i] <  e && isa<UndefValue>(LHS))) {
+          Mask[i] = 2*e;     // Turn into undef.
+          Elts.push_back(UndefValue::get(Type::Int32Ty));
+        } else {
+          Mask[i] = Mask[i] % e;  // Force to LHS.
+          Elts.push_back(ConstantInt::get(Type::Int32Ty, Mask[i]));
+        }
+      }
+    }
+    SVI.setOperand(0, SVI.getOperand(1));
+    SVI.setOperand(1, UndefValue::get(RHS->getType()));
+    SVI.setOperand(2, ConstantVector::get(Elts));
+    LHS = SVI.getOperand(0);
+    RHS = SVI.getOperand(1);
+    MadeChange = true;
+  }
+  
+  // Analyze the shuffle, are the LHS or RHS and identity shuffles?
+  bool isLHSID = true, isRHSID = true;
+    
+  for (unsigned i = 0, e = Mask.size(); i != e; ++i) {
+    if (Mask[i] >= e*2) continue;  // Ignore undef values.
+    // Is this an identity shuffle of the LHS value?
+    isLHSID &= (Mask[i] == i);
+      
+    // Is this an identity shuffle of the RHS value?
+    isRHSID &= (Mask[i]-e == i);
+  }
+
+  // Eliminate identity shuffles.
+  if (isLHSID) return ReplaceInstUsesWith(SVI, LHS);
+  if (isRHSID) return ReplaceInstUsesWith(SVI, RHS);
+  
+  // If the LHS is a shufflevector itself, see if we can combine it with this
+  // one without producing an unusual shuffle.  Here we are really conservative:
+  // we are absolutely afraid of producing a shuffle mask not in the input
+  // program, because the code gen may not be smart enough to turn a merged
+  // shuffle into two specific shuffles: it may produce worse code.  As such,
+  // we only merge two shuffles if the result is one of the two input shuffle
+  // masks.  In this case, merging the shuffles just removes one instruction,
+  // which we know is safe.  This is good for things like turning:
+  // (splat(splat)) -> splat.
+  if (ShuffleVectorInst *LHSSVI = dyn_cast<ShuffleVectorInst>(LHS)) {
+    if (isa<UndefValue>(RHS)) {
+      std::vector<unsigned> LHSMask = getShuffleMask(LHSSVI);
+
+      std::vector<unsigned> NewMask;
+      for (unsigned i = 0, e = Mask.size(); i != e; ++i)
+        if (Mask[i] >= 2*e)
+          NewMask.push_back(2*e);
+        else
+          NewMask.push_back(LHSMask[Mask[i]]);
+      
+      // If the result mask is equal to the src shuffle or this shuffle mask, do
+      // the replacement.
+      if (NewMask == LHSMask || NewMask == Mask) {
+        unsigned LHSInNElts =
+          cast<VectorType>(LHSSVI->getOperand(0)->getType())->getNumElements();
+        std::vector<Constant*> Elts;
+        for (unsigned i = 0, e = NewMask.size(); i != e; ++i) {
+          if (NewMask[i] >= LHSInNElts*2) {
+            Elts.push_back(UndefValue::get(Type::Int32Ty));
+          } else {
+            Elts.push_back(ConstantInt::get(Type::Int32Ty, NewMask[i]));
+          }
+        }
+        return new ShuffleVectorInst(LHSSVI->getOperand(0),
+                                     LHSSVI->getOperand(1),
+                                     ConstantVector::get(Elts));
+      }
+    }
+  }
+
+  return MadeChange ? &SVI : 0;
+}
+
+
+
+
+/// TryToSinkInstruction - Try to move the specified instruction from its
+/// current block into the beginning of DestBlock, which can only happen if it's
+/// safe to move the instruction past all of the instructions between it and the
+/// end of its block.
+static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
+  assert(I->hasOneUse() && "Invariants didn't hold!");
+
+  // Cannot move control-flow-involving, volatile loads, vaarg, etc.
+  if (isa<PHINode>(I) || I->mayHaveSideEffects() || isa<TerminatorInst>(I))
+    return false;
+
+  // Do not sink alloca instructions out of the entry block.
+  if (isa<AllocaInst>(I) && I->getParent() ==
+        &DestBlock->getParent()->getEntryBlock())
+    return false;
+
+  // We can only sink load instructions if there is nothing between the load and
+  // the end of block that could change the value.
+  if (I->mayReadFromMemory()) {
+    for (BasicBlock::iterator Scan = I, E = I->getParent()->end();
+         Scan != E; ++Scan)
+      if (Scan->mayWriteToMemory())
+        return false;
+  }
+
+  BasicBlock::iterator InsertPos = DestBlock->getFirstNonPHI();
+
+  CopyPrecedingStopPoint(I, InsertPos);
+  I->moveBefore(InsertPos);
+  ++NumSunkInst;
+  return true;
+}
+
+
+/// AddReachableCodeToWorklist - Walk the function in depth-first order, adding
+/// all reachable code to the worklist.
+///
+/// This has a couple of tricks to make the code faster and more powerful.  In
+/// particular, we constant fold and DCE instructions as we go, to avoid adding
+/// them to the worklist (this significantly speeds up instcombine on code where
+/// many instructions are dead or constant).  Additionally, if we find a branch
+/// whose condition is a known constant, we only visit the reachable successors.
+///
+static void AddReachableCodeToWorklist(BasicBlock *BB, 
+                                       SmallPtrSet<BasicBlock*, 64> &Visited,
+                                       InstCombiner &IC,
+                                       const TargetData *TD) {
+  SmallVector<BasicBlock*, 256> Worklist;
+  Worklist.push_back(BB);
+
+  while (!Worklist.empty()) {
+    BB = Worklist.back();
+    Worklist.pop_back();
+    
+    // We have now visited this block!  If we've already been here, ignore it.
+    if (!Visited.insert(BB)) continue;
+
+    DbgInfoIntrinsic *DBI_Prev = NULL;
+    for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {
+      Instruction *Inst = BBI++;
+      
+      // DCE instruction if trivially dead.
+      if (isInstructionTriviallyDead(Inst)) {
+        ++NumDeadInst;
+        DOUT << "IC: DCE: " << *Inst;
+        Inst->eraseFromParent();
+        continue;
+      }
+      
+      // ConstantProp instruction if trivially constant.
+      if (Constant *C = ConstantFoldInstruction(Inst, TD)) {
+        DOUT << "IC: ConstFold to: " << *C << " from: " << *Inst;
+        Inst->replaceAllUsesWith(C);
+        ++NumConstProp;
+        Inst->eraseFromParent();
+        continue;
+      }
+     
+      // If there are two consecutive llvm.dbg.stoppoint calls then
+      // it is likely that the optimizer deleted code in between these
+      // two intrinsics. 
+      DbgInfoIntrinsic *DBI_Next = dyn_cast<DbgInfoIntrinsic>(Inst);
+      if (DBI_Next) {
+        if (DBI_Prev
+            && DBI_Prev->getIntrinsicID() == llvm::Intrinsic::dbg_stoppoint
+            && DBI_Next->getIntrinsicID() == llvm::Intrinsic::dbg_stoppoint) {
+          IC.RemoveFromWorkList(DBI_Prev);
+          DBI_Prev->eraseFromParent();
+        }
+        DBI_Prev = DBI_Next;
+      } else {
+        DBI_Prev = 0;
+      }
+
+      IC.AddToWorkList(Inst);
+    }
+
+    // Recursively visit successors.  If this is a branch or switch on a
+    // constant, only visit the reachable successor.
+    TerminatorInst *TI = BB->getTerminator();
+    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+      if (BI->isConditional() && isa<ConstantInt>(BI->getCondition())) {
+        bool CondVal = cast<ConstantInt>(BI->getCondition())->getZExtValue();
+        BasicBlock *ReachableBB = BI->getSuccessor(!CondVal);
+        Worklist.push_back(ReachableBB);
+        continue;
+      }
+    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+      if (ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition())) {
+        // See if this is an explicit destination.
+        for (unsigned i = 1, e = SI->getNumSuccessors(); i != e; ++i)
+          if (SI->getCaseValue(i) == Cond) {
+            BasicBlock *ReachableBB = SI->getSuccessor(i);
+            Worklist.push_back(ReachableBB);
+            continue;
+          }
+        
+        // Otherwise it is the default destination.
+        Worklist.push_back(SI->getSuccessor(0));
+        continue;
+      }
+    }
+    
+    for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+      Worklist.push_back(TI->getSuccessor(i));
+  }
+}
+
+bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
+  bool Changed = false;
+  TD = &getAnalysis<TargetData>();
+  
+  DEBUG(DOUT << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
+             << F.getNameStr() << "\n");
+
+  {
+    // Do a depth-first traversal of the function, populate the worklist with
+    // the reachable instructions.  Ignore blocks that are not reachable.  Keep
+    // track of which blocks we visit.
+    SmallPtrSet<BasicBlock*, 64> Visited;
+    AddReachableCodeToWorklist(F.begin(), Visited, *this, TD);
+
+    // Do a quick scan over the function.  If we find any blocks that are
+    // unreachable, remove any instructions inside of them.  This prevents
+    // the instcombine code from having to deal with some bad special cases.
+    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+      if (!Visited.count(BB)) {
+        Instruction *Term = BB->getTerminator();
+        while (Term != BB->begin()) {   // Remove instrs bottom-up
+          BasicBlock::iterator I = Term; --I;
+
+          DOUT << "IC: DCE: " << *I;
+          // A debug intrinsic shouldn't force another iteration if we weren't
+          // going to do one without it.
+          if (!isa<DbgInfoIntrinsic>(I)) {
+            ++NumDeadInst;
+            Changed = true;
+          }
+          if (!I->use_empty())
+            I->replaceAllUsesWith(UndefValue::get(I->getType()));
+          I->eraseFromParent();
+        }
+      }
+  }
+
+  while (!Worklist.empty()) {
+    Instruction *I = RemoveOneFromWorkList();
+    if (I == 0) continue;  // skip null values.
+
+    // Check to see if we can DCE the instruction.
+    if (isInstructionTriviallyDead(I)) {
+      // Add operands to the worklist.
+      if (I->getNumOperands() < 4)
+        AddUsesToWorkList(*I);
+      ++NumDeadInst;
+
+      DOUT << "IC: DCE: " << *I;
+
+      I->eraseFromParent();
+      RemoveFromWorkList(I);
+      Changed = true;
+      continue;
+    }
+
+    // Instruction isn't dead, see if we can constant propagate it.
+    if (Constant *C = ConstantFoldInstruction(I, TD)) {
+      DOUT << "IC: ConstFold to: " << *C << " from: " << *I;
+
+      // Add operands to the worklist.
+      AddUsesToWorkList(*I);
+      ReplaceInstUsesWith(*I, C);
+
+      ++NumConstProp;
+      I->eraseFromParent();
+      RemoveFromWorkList(I);
+      Changed = true;
+      continue;
+    }
+
+    if (TD &&
+        (I->getType()->getTypeID() == Type::VoidTyID ||
+         I->isTrapping())) {
+      // See if we can constant fold its operands.
+      for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i)
+        if (ConstantExpr *CE = dyn_cast<ConstantExpr>(i))
+          if (Constant *NewC = ConstantFoldConstantExpression(CE, TD))
+            if (NewC != CE) {
+              i->set(NewC);
+              Changed = true;
+            }
+    }
+
+    // See if we can trivially sink this instruction to a successor basic block.
+    if (I->hasOneUse()) {
+      BasicBlock *BB = I->getParent();
+      BasicBlock *UserParent = cast<Instruction>(I->use_back())->getParent();
+      if (UserParent != BB) {
+        bool UserIsSuccessor = false;
+        // See if the user is one of our successors.
+        for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI)
+          if (*SI == UserParent) {
+            UserIsSuccessor = true;
+            break;
+          }
+
+        // If the user is one of our immediate successors, and if that successor
+        // only has us as a predecessors (we'd have to split the critical edge
+        // otherwise), we can keep going.
+        if (UserIsSuccessor && !isa<PHINode>(I->use_back()) &&
+            next(pred_begin(UserParent)) == pred_end(UserParent))
+          // Okay, the CFG is simple enough, try to sink this instruction.
+          Changed |= TryToSinkInstruction(I, UserParent);
+      }
+    }
+
+    // Now that we have an instruction, try combining it to simplify it...
+#ifndef NDEBUG
+    std::string OrigI;
+#endif
+    DEBUG(std::ostringstream SS; I->print(SS); OrigI = SS.str(););
+    if (Instruction *Result = visit(*I)) {
+      ++NumCombined;
+      // Should we replace the old instruction with a new one?
+      if (Result != I) {
+        DOUT << "IC: Old = " << *I
+             << "    New = " << *Result;
+
+        // Everything uses the new instruction now.
+        I->replaceAllUsesWith(Result);
+
+        // Push the new instruction and any users onto the worklist.
+        AddToWorkList(Result);
+        AddUsersToWorkList(*Result);
+
+        // Move the name to the new instruction first.
+        Result->takeName(I);
+
+        // Insert the new instruction into the basic block...
+        BasicBlock *InstParent = I->getParent();
+        BasicBlock::iterator InsertPos = I;
+
+        if (!isa<PHINode>(Result))        // If combining a PHI, don't insert
+          while (isa<PHINode>(InsertPos)) // middle of a block of PHIs.
+            ++InsertPos;
+
+        InstParent->getInstList().insert(InsertPos, Result);
+
+        // Make sure that we reprocess all operands now that we reduced their
+        // use counts.
+        AddUsesToWorkList(*I);
+
+        // Instructions can end up on the worklist more than once.  Make sure
+        // we do not process an instruction that has been deleted.
+        RemoveFromWorkList(I);
+
+        // Erase the old instruction.
+        InstParent->getInstList().erase(I);
+      } else {
+#ifndef NDEBUG
+        DOUT << "IC: Mod = " << OrigI
+             << "    New = " << *I;
+#endif
+
+        // If the instruction was modified, it's possible that it is now dead.
+        // if so, remove it.
+        if (isInstructionTriviallyDead(I)) {
+          // Make sure we process all operands now that we are reducing their
+          // use counts.
+          AddUsesToWorkList(*I);
+
+          // Instructions may end up in the worklist more than once.  Erase all
+          // occurrences of this instruction.
+          RemoveFromWorkList(I);
+          I->eraseFromParent();
+        } else {
+          AddToWorkList(I);
+          AddUsersToWorkList(*I);
+        }
+      }
+      Changed = true;
+    }
+  }
+
+  assert(WorklistMap.empty() && "Worklist empty, but map not?");
+    
+  // Do an explicit clear, this shrinks the map if needed.
+  WorklistMap.clear();
+  return Changed;
+}
+
+
+bool InstCombiner::runOnFunction(Function &F) {
+  MustPreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+  
+  bool EverMadeChange = false;
+
+  // Iterate while there is work to do.
+  unsigned Iteration = 0;
+  while (DoOneIteration(F, Iteration++))
+    EverMadeChange = true;
+  return EverMadeChange;
+}
+
+FunctionPass *llvm::createInstructionCombiningPass() {
+  return new InstCombiner();
+}
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
new file mode 100644
index 0000000..c0ca2df
--- /dev/null
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -0,0 +1,954 @@
+//===- JumpThreading.cpp - Thread control through conditional blocks ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Jump Threading pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jump-threading"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ValueHandle.h"
+using namespace llvm;
+
+STATISTIC(NumThreads, "Number of jumps threaded");
+STATISTIC(NumFolds,   "Number of terminators folded");
+
+static cl::opt<unsigned>
+Threshold("jump-threading-threshold", 
+          cl::desc("Max block size to duplicate for jump threading"),
+          cl::init(6), cl::Hidden);
+
+namespace {
+  /// This pass performs 'jump threading', which looks at blocks that have
+  /// multiple predecessors and multiple successors.  If one or more of the
+  /// predecessors of the block can be proven to always jump to one of the
+  /// successors, we forward the edge from the predecessor to the successor by
+  /// duplicating the contents of this block.
+  ///
+  /// An example of when this can occur is code like this:
+  ///
+  ///   if () { ...
+  ///     X = 4;
+  ///   }
+  ///   if (X < 3) {
+  ///
+  /// In this case, the unconditional branch at the end of the first if can be
+  /// revectored to the false side of the second if.
+  ///
+  class VISIBILITY_HIDDEN JumpThreading : public FunctionPass {
+    TargetData *TD;
+#ifdef NDEBUG
+    SmallPtrSet<BasicBlock*, 16> LoopHeaders;
+#else
+    SmallSet<AssertingVH<BasicBlock>, 16> LoopHeaders;
+#endif
+  public:
+    static char ID; // Pass identification
+    JumpThreading() : FunctionPass(&ID) {}
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<TargetData>();
+    }
+
+    bool runOnFunction(Function &F);
+    void FindLoopHeaders(Function &F);
+    
+    bool ProcessBlock(BasicBlock *BB);
+    bool ThreadEdge(BasicBlock *BB, BasicBlock *PredBB, BasicBlock *SuccBB,
+                    unsigned JumpThreadCost);
+    BasicBlock *FactorCommonPHIPreds(PHINode *PN, Constant *CstVal);
+    bool ProcessBranchOnDuplicateCond(BasicBlock *PredBB, BasicBlock *DestBB);
+    bool ProcessSwitchOnDuplicateCond(BasicBlock *PredBB, BasicBlock *DestBB);
+
+    bool ProcessJumpOnPHI(PHINode *PN);
+    bool ProcessBranchOnLogical(Value *V, BasicBlock *BB, bool isAnd);
+    bool ProcessBranchOnCompare(CmpInst *Cmp, BasicBlock *BB);
+    
+    bool SimplifyPartiallyRedundantLoad(LoadInst *LI);
+  };
+}
+
+char JumpThreading::ID = 0;
+static RegisterPass<JumpThreading>
+X("jump-threading", "Jump Threading");
+
+// Public interface to the Jump Threading pass
+FunctionPass *llvm::createJumpThreadingPass() { return new JumpThreading(); }
+
+/// runOnFunction - Top level algorithm.
+///
+bool JumpThreading::runOnFunction(Function &F) {
+  DOUT << "Jump threading on function '" << F.getNameStart() << "'\n";
+  TD = &getAnalysis<TargetData>();
+  
+  FindLoopHeaders(F);
+  
+  bool AnotherIteration = true, EverChanged = false;
+  while (AnotherIteration) {
+    AnotherIteration = false;
+    bool Changed = false;
+    for (Function::iterator I = F.begin(), E = F.end(); I != E;) {
+      BasicBlock *BB = I;
+      while (ProcessBlock(BB))
+        Changed = true;
+      
+      ++I;
+      
+      // If the block is trivially dead, zap it.  This eliminates the successor
+      // edges which simplifies the CFG.
+      if (pred_begin(BB) == pred_end(BB) &&
+          BB != &BB->getParent()->getEntryBlock()) {
+        DOUT << "  JT: Deleting dead block '" << BB->getNameStart()
+             << "' with terminator: " << *BB->getTerminator();
+        LoopHeaders.erase(BB);
+        DeleteDeadBlock(BB);
+        Changed = true;
+      }
+    }
+    AnotherIteration = Changed;
+    EverChanged |= Changed;
+  }
+  
+  LoopHeaders.clear();
+  return EverChanged;
+}
+
+/// FindLoopHeaders - We do not want jump threading to turn proper loop
+/// structures into irreducible loops.  Doing this breaks up the loop nesting
+/// hierarchy and pessimizes later transformations.  To prevent this from
+/// happening, we first have to find the loop headers.  Here we approximate this
+/// by finding targets of backedges in the CFG.
+///
+/// Note that there definitely are cases when we want to allow threading of
+/// edges across a loop header.  For example, threading a jump from outside the
+/// loop (the preheader) to an exit block of the loop is definitely profitable.
+/// It is also almost always profitable to thread backedges from within the loop
+/// to exit blocks, and is often profitable to thread backedges to other blocks
+/// within the loop (forming a nested loop).  This simple analysis is not rich
+/// enough to track all of these properties and keep it up-to-date as the CFG
+/// mutates, so we don't allow any of these transformations.
+///
+void JumpThreading::FindLoopHeaders(Function &F) {
+  SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> Edges;
+  FindFunctionBackedges(F, Edges);
+  
+  for (unsigned i = 0, e = Edges.size(); i != e; ++i)
+    LoopHeaders.insert(const_cast<BasicBlock*>(Edges[i].second));
+}
+
+
+/// FactorCommonPHIPreds - If there are multiple preds with the same incoming
+/// value for the PHI, factor them together so we get one block to thread for
+/// the whole group.
+/// This is important for things like "phi i1 [true, true, false, true, x]"
+/// where we only need to clone the block for the true blocks once.
+///
+BasicBlock *JumpThreading::FactorCommonPHIPreds(PHINode *PN, Constant *CstVal) {
+  SmallVector<BasicBlock*, 16> CommonPreds;
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+    if (PN->getIncomingValue(i) == CstVal)
+      CommonPreds.push_back(PN->getIncomingBlock(i));
+  
+  if (CommonPreds.size() == 1)
+    return CommonPreds[0];
+    
+  DOUT << "  Factoring out " << CommonPreds.size()
+       << " common predecessors.\n";
+  return SplitBlockPredecessors(PN->getParent(),
+                                &CommonPreds[0], CommonPreds.size(),
+                                ".thr_comm", this);
+}
+  
+
+/// getJumpThreadDuplicationCost - Return the cost of duplicating this block to
+/// thread across it.
+static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB) {
+  /// Ignore PHI nodes, these will be flattened when duplication happens.
+  BasicBlock::const_iterator I = BB->getFirstNonPHI();
+
+  // Sum up the cost of each instruction until we get to the terminator.  Don't
+  // include the terminator because the copy won't include it.
+  unsigned Size = 0;
+  for (; !isa<TerminatorInst>(I); ++I) {
+    // Debugger intrinsics don't incur code size.
+    if (isa<DbgInfoIntrinsic>(I)) continue;
+    
+    // If this is a pointer->pointer bitcast, it is free.
+    if (isa<BitCastInst>(I) && isa<PointerType>(I->getType()))
+      continue;
+    
+    // All other instructions count for at least one unit.
+    ++Size;
+    
+    // Calls are more expensive.  If they are non-intrinsic calls, we model them
+    // as having cost of 4.  If they are a non-vector intrinsic, we model them
+    // as having cost of 2 total, and if they are a vector intrinsic, we model
+    // them as having cost 1.
+    if (const CallInst *CI = dyn_cast<CallInst>(I)) {
+      if (!isa<IntrinsicInst>(CI))
+        Size += 3;
+      else if (isa<VectorType>(CI->getType()))
+        Size += 1;
+    }
+  }
+  
+  // Threading through a switch statement is particularly profitable.  If this
+  // block ends in a switch, decrease its cost to make it more likely to happen.
+  if (isa<SwitchInst>(I))
+    Size = Size > 6 ? Size-6 : 0;
+  
+  return Size;
+}
+
+/// ProcessBlock - If there are any predecessors whose control can be threaded
+/// through to a successor, transform them now.
+bool JumpThreading::ProcessBlock(BasicBlock *BB) {
+  // If this block has a single predecessor, and if that pred has a single
+  // successor, merge the blocks.  This encourages recursive jump threading
+  // because now the condition in this block can be threaded through
+  // predecessors of our predecessor block.
+  if (BasicBlock *SinglePred = BB->getSinglePredecessor())
+    if (SinglePred->getTerminator()->getNumSuccessors() == 1 &&
+        SinglePred != BB) {
+      // If SinglePred was a loop header, BB becomes one.
+      if (LoopHeaders.erase(SinglePred))
+        LoopHeaders.insert(BB);
+      
+      // Remember if SinglePred was the entry block of the function.  If so, we
+      // will need to move BB back to the entry position.
+      bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
+      MergeBasicBlockIntoOnlyPred(BB);
+      
+      if (isEntry && BB != &BB->getParent()->getEntryBlock())
+        BB->moveBefore(&BB->getParent()->getEntryBlock());
+      return true;
+    }
+  
+  // See if this block ends with a branch or switch.  If so, see if the
+  // condition is a phi node.  If so, and if an entry of the phi node is a
+  // constant, we can thread the block.
+  Value *Condition;
+  if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
+    // Can't thread an unconditional jump.
+    if (BI->isUnconditional()) return false;
+    Condition = BI->getCondition();
+  } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator()))
+    Condition = SI->getCondition();
+  else
+    return false; // Must be an invoke.
+  
+  // If the terminator of this block is branching on a constant, simplify the
+  // terminator to an unconditional branch.  This can occur due to threading in
+  // other blocks.
+  if (isa<ConstantInt>(Condition)) {
+    DOUT << "  In block '" << BB->getNameStart()
+         << "' folding terminator: " << *BB->getTerminator();
+    ++NumFolds;
+    ConstantFoldTerminator(BB);
+    return true;
+  }
+  
+  // If the terminator is branching on an undef, we can pick any of the
+  // successors to branch to.  Since this is arbitrary, we pick the successor
+  // with the fewest predecessors.  This should reduce the in-degree of the
+  // others.
+  if (isa<UndefValue>(Condition)) {
+    TerminatorInst *BBTerm = BB->getTerminator();
+    unsigned MinSucc = 0;
+    BasicBlock *TestBB = BBTerm->getSuccessor(MinSucc);
+    // Compute the successor with the minimum number of predecessors.
+    unsigned MinNumPreds = std::distance(pred_begin(TestBB), pred_end(TestBB));
+    for (unsigned i = 1, e = BBTerm->getNumSuccessors(); i != e; ++i) {
+      TestBB = BBTerm->getSuccessor(i);
+      unsigned NumPreds = std::distance(pred_begin(TestBB), pred_end(TestBB));
+      if (NumPreds < MinNumPreds)
+        MinSucc = i;
+    }
+    
+    // Fold the branch/switch.
+    for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) {
+      if (i == MinSucc) continue;
+      BBTerm->getSuccessor(i)->removePredecessor(BB);
+    }
+    
+    DOUT << "  In block '" << BB->getNameStart()
+         << "' folding undef terminator: " << *BBTerm;
+    BranchInst::Create(BBTerm->getSuccessor(MinSucc), BBTerm);
+    BBTerm->eraseFromParent();
+    return true;
+  }
+  
+  Instruction *CondInst = dyn_cast<Instruction>(Condition);
+
+  // If the condition is an instruction defined in another block, see if a
+  // predecessor has the same condition:
+  //     br COND, BBX, BBY
+  //  BBX:
+  //     br COND, BBZ, BBW
+  if (!Condition->hasOneUse() && // Multiple uses.
+      (CondInst == 0 || CondInst->getParent() != BB)) { // Non-local definition.
+    pred_iterator PI = pred_begin(BB), E = pred_end(BB);
+    if (isa<BranchInst>(BB->getTerminator())) {
+      for (; PI != E; ++PI)
+        if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator()))
+          if (PBI->isConditional() && PBI->getCondition() == Condition &&
+              ProcessBranchOnDuplicateCond(*PI, BB))
+            return true;
+    } else {
+      assert(isa<SwitchInst>(BB->getTerminator()) && "Unknown jump terminator");
+      for (; PI != E; ++PI)
+        if (SwitchInst *PSI = dyn_cast<SwitchInst>((*PI)->getTerminator()))
+          if (PSI->getCondition() == Condition &&
+              ProcessSwitchOnDuplicateCond(*PI, BB))
+            return true;
+    }
+  }
+
+  // If there is only a single predecessor of this block, nothing to fold.
+  if (BB->getSinglePredecessor())
+    return false;
+  
+  // All the rest of our checks depend on the condition being an instruction.
+  if (CondInst == 0)
+    return false;
+  
+  // See if this is a phi node in the current block.
+  if (PHINode *PN = dyn_cast<PHINode>(CondInst))
+    if (PN->getParent() == BB)
+      return ProcessJumpOnPHI(PN);
+  
+  // If this is a conditional branch whose condition is and/or of a phi, try to
+  // simplify it.
+  if ((CondInst->getOpcode() == Instruction::And || 
+       CondInst->getOpcode() == Instruction::Or) &&
+      isa<BranchInst>(BB->getTerminator()) &&
+      ProcessBranchOnLogical(CondInst, BB,
+                             CondInst->getOpcode() == Instruction::And))
+    return true;
+  
+  // If we have "br (phi != 42)" and the phi node has any constant values as 
+  // operands, we can thread through this block.
+  if (CmpInst *CondCmp = dyn_cast<CmpInst>(CondInst))
+    if (isa<PHINode>(CondCmp->getOperand(0)) &&
+        isa<Constant>(CondCmp->getOperand(1)) &&
+        ProcessBranchOnCompare(CondCmp, BB))
+      return true;
+
+  // Check for some cases that are worth simplifying.  Right now we want to look
+  // for loads that are used by a switch or by the condition for the branch.  If
+  // we see one, check to see if it's partially redundant.  If so, insert a PHI
+  // which can then be used to thread the values.
+  //
+  // This is particularly important because reg2mem inserts loads and stores all
+  // over the place, and this blocks jump threading if we don't zap them.
+  Value *SimplifyValue = CondInst;
+  if (CmpInst *CondCmp = dyn_cast<CmpInst>(SimplifyValue))
+    if (isa<Constant>(CondCmp->getOperand(1)))
+      SimplifyValue = CondCmp->getOperand(0);
+  
+  if (LoadInst *LI = dyn_cast<LoadInst>(SimplifyValue))
+    if (SimplifyPartiallyRedundantLoad(LI))
+      return true;
+  
+  // TODO: If we have: "br (X > 0)"  and we have a predecessor where we know
+  // "(X == 4)" thread through this block.
+  
+  return false;
+}
+
+/// ProcessBranchOnDuplicateCond - We found a block and a predecessor of that
+/// block that jump on exactly the same condition.  This means that we almost
+/// always know the direction of the edge in the DESTBB:
+///  PREDBB:
+///     br COND, DESTBB, BBY
+///  DESTBB:
+///     br COND, BBZ, BBW
+///
+/// If DESTBB has multiple predecessors, we can't just constant fold the branch
+/// in DESTBB, we have to thread over it.
+bool JumpThreading::ProcessBranchOnDuplicateCond(BasicBlock *PredBB,
+                                                 BasicBlock *BB) {
+  BranchInst *PredBI = cast<BranchInst>(PredBB->getTerminator());
+  
+  // If both successors of PredBB go to DESTBB, we don't know anything.  We can
+  // fold the branch to an unconditional one, which allows other recursive
+  // simplifications.
+  bool BranchDir;
+  if (PredBI->getSuccessor(1) != BB)
+    BranchDir = true;
+  else if (PredBI->getSuccessor(0) != BB)
+    BranchDir = false;
+  else {
+    DOUT << "  In block '" << PredBB->getNameStart()
+         << "' folding terminator: " << *PredBB->getTerminator();
+    ++NumFolds;
+    ConstantFoldTerminator(PredBB);
+    return true;
+  }
+   
+  BranchInst *DestBI = cast<BranchInst>(BB->getTerminator());
+
+  // If the dest block has one predecessor, just fix the branch condition to a
+  // constant and fold it.
+  if (BB->getSinglePredecessor()) {
+    DOUT << "  In block '" << BB->getNameStart()
+         << "' folding condition to '" << BranchDir << "': "
+         << *BB->getTerminator();
+    ++NumFolds;
+    DestBI->setCondition(ConstantInt::get(Type::Int1Ty, BranchDir));
+    ConstantFoldTerminator(BB);
+    return true;
+  }
+  
+  // Otherwise we need to thread from PredBB to DestBB's successor which
+  // involves code duplication.  Check to see if it is worth it.
+  unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB);
+  if (JumpThreadCost > Threshold) {
+    DOUT << "  Not threading BB '" << BB->getNameStart()
+         << "' - Cost is too high: " << JumpThreadCost << "\n";
+    return false;
+  }
+  
+  // Next, figure out which successor we are threading to.
+  BasicBlock *SuccBB = DestBI->getSuccessor(!BranchDir);
+  
+  // Ok, try to thread it!
+  return ThreadEdge(BB, PredBB, SuccBB, JumpThreadCost);
+}
+
+/// ProcessSwitchOnDuplicateCond - We found a block and a predecessor of that
+/// block that switch on exactly the same condition.  This means that we almost
+/// always know the direction of the edge in the DESTBB:
+///  PREDBB:
+///     switch COND [... DESTBB, BBY ... ]
+///  DESTBB:
+///     switch COND [... BBZ, BBW ]
+///
+/// Optimizing switches like this is very important, because simplifycfg builds
+/// switches out of repeated 'if' conditions.
+bool JumpThreading::ProcessSwitchOnDuplicateCond(BasicBlock *PredBB,
+                                                 BasicBlock *DestBB) {
+  // Can't thread edge to self.
+  if (PredBB == DestBB)
+    return false;
+  
+  
+  SwitchInst *PredSI = cast<SwitchInst>(PredBB->getTerminator());
+  SwitchInst *DestSI = cast<SwitchInst>(DestBB->getTerminator());
+
+  // There are a variety of optimizations that we can potentially do on these
+  // blocks: we order them from most to least preferable.
+  
+  // If DESTBB *just* contains the switch, then we can forward edges from PREDBB
+  // directly to their destination.  This does not introduce *any* code size
+  // growth.  Skip debug info first.
+  BasicBlock::iterator BBI = DestBB->begin();
+  while (isa<DbgInfoIntrinsic>(BBI))
+    BBI++;
+  
+  // FIXME: Thread if it just contains a PHI.
+  if (isa<SwitchInst>(BBI)) {
+    bool MadeChange = false;
+    // Ignore the default edge for now.
+    for (unsigned i = 1, e = DestSI->getNumSuccessors(); i != e; ++i) {
+      ConstantInt *DestVal = DestSI->getCaseValue(i);
+      BasicBlock *DestSucc = DestSI->getSuccessor(i);
+      
+      // Okay, DestSI has a case for 'DestVal' that goes to 'DestSucc'.  See if
+      // PredSI has an explicit case for it.  If so, forward.  If it is covered
+      // by the default case, we can't update PredSI.
+      unsigned PredCase = PredSI->findCaseValue(DestVal);
+      if (PredCase == 0) continue;
+      
+      // If PredSI doesn't go to DestBB on this value, then it won't reach the
+      // case on this condition.
+      if (PredSI->getSuccessor(PredCase) != DestBB &&
+          DestSI->getSuccessor(i) != DestBB)
+        continue;
+
+      // Otherwise, we're safe to make the change.  Make sure that the edge from
+      // DestSI to DestSucc is not critical and has no PHI nodes.
+      DOUT << "FORWARDING EDGE " << *DestVal << "   FROM: " << *PredSI;
+      DOUT << "THROUGH: " << *DestSI;
+
+      // If the destination has PHI nodes, just split the edge for updating
+      // simplicity.
+      if (isa<PHINode>(DestSucc->begin()) && !DestSucc->getSinglePredecessor()){
+        SplitCriticalEdge(DestSI, i, this);
+        DestSucc = DestSI->getSuccessor(i);
+      }
+      FoldSingleEntryPHINodes(DestSucc);
+      PredSI->setSuccessor(PredCase, DestSucc);
+      MadeChange = true;
+    }
+    
+    if (MadeChange)
+      return true;
+  }
+  
+  return false;
+}
+
+
+/// SimplifyPartiallyRedundantLoad - If LI is an obviously partially redundant
+/// load instruction, eliminate it by replacing it with a PHI node.  This is an
+/// important optimization that encourages jump threading, and needs to be run
+/// interlaced with other jump threading tasks.
+bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
+  // Don't hack volatile loads.
+  if (LI->isVolatile()) return false;
+  
+  // If the load is defined in a block with exactly one predecessor, it can't be
+  // partially redundant.
+  BasicBlock *LoadBB = LI->getParent();
+  if (LoadBB->getSinglePredecessor())
+    return false;
+  
+  Value *LoadedPtr = LI->getOperand(0);
+
+  // If the loaded operand is defined in the LoadBB, it can't be available.
+  // FIXME: Could do PHI translation, that would be fun :)
+  if (Instruction *PtrOp = dyn_cast<Instruction>(LoadedPtr))
+    if (PtrOp->getParent() == LoadBB)
+      return false;
+  
+  // Scan a few instructions up from the load, to see if it is obviously live at
+  // the entry to its block.
+  BasicBlock::iterator BBIt = LI;
+
+  if (Value *AvailableVal = FindAvailableLoadedValue(LoadedPtr, LoadBB, 
+                                                     BBIt, 6)) {
+    // If the value if the load is locally available within the block, just use
+    // it.  This frequently occurs for reg2mem'd allocas.
+    //cerr << "LOAD ELIMINATED:\n" << *BBIt << *LI << "\n";
+    
+    // If the returned value is the load itself, replace with an undef. This can
+    // only happen in dead loops.
+    if (AvailableVal == LI) AvailableVal = UndefValue::get(LI->getType());
+    LI->replaceAllUsesWith(AvailableVal);
+    LI->eraseFromParent();
+    return true;
+  }
+
+  // Otherwise, if we scanned the whole block and got to the top of the block,
+  // we know the block is locally transparent to the load.  If not, something
+  // might clobber its value.
+  if (BBIt != LoadBB->begin())
+    return false;
+  
+  
+  SmallPtrSet<BasicBlock*, 8> PredsScanned;
+  typedef SmallVector<std::pair<BasicBlock*, Value*>, 8> AvailablePredsTy;
+  AvailablePredsTy AvailablePreds;
+  BasicBlock *OneUnavailablePred = 0;
+  
+  // If we got here, the loaded value is transparent through to the start of the
+  // block.  Check to see if it is available in any of the predecessor blocks.
+  for (pred_iterator PI = pred_begin(LoadBB), PE = pred_end(LoadBB);
+       PI != PE; ++PI) {
+    BasicBlock *PredBB = *PI;
+
+    // If we already scanned this predecessor, skip it.
+    if (!PredsScanned.insert(PredBB))
+      continue;
+
+    // Scan the predecessor to see if the value is available in the pred.
+    BBIt = PredBB->end();
+    Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt, 6);
+    if (!PredAvailable) {
+      OneUnavailablePred = PredBB;
+      continue;
+    }
+    
+    // If so, this load is partially redundant.  Remember this info so that we
+    // can create a PHI node.
+    AvailablePreds.push_back(std::make_pair(PredBB, PredAvailable));
+  }
+  
+  // If the loaded value isn't available in any predecessor, it isn't partially
+  // redundant.
+  if (AvailablePreds.empty()) return false;
+  
+  // Okay, the loaded value is available in at least one (and maybe all!)
+  // predecessors.  If the value is unavailable in more than one unique
+  // predecessor, we want to insert a merge block for those common predecessors.
+  // This ensures that we only have to insert one reload, thus not increasing
+  // code size.
+  BasicBlock *UnavailablePred = 0;
+  
+  // If there is exactly one predecessor where the value is unavailable, the
+  // already computed 'OneUnavailablePred' block is it.  If it ends in an
+  // unconditional branch, we know that it isn't a critical edge.
+  if (PredsScanned.size() == AvailablePreds.size()+1 &&
+      OneUnavailablePred->getTerminator()->getNumSuccessors() == 1) {
+    UnavailablePred = OneUnavailablePred;
+  } else if (PredsScanned.size() != AvailablePreds.size()) {
+    // Otherwise, we had multiple unavailable predecessors or we had a critical
+    // edge from the one.
+    SmallVector<BasicBlock*, 8> PredsToSplit;
+    SmallPtrSet<BasicBlock*, 8> AvailablePredSet;
+
+    for (unsigned i = 0, e = AvailablePreds.size(); i != e; ++i)
+      AvailablePredSet.insert(AvailablePreds[i].first);
+
+    // Add all the unavailable predecessors to the PredsToSplit list.
+    for (pred_iterator PI = pred_begin(LoadBB), PE = pred_end(LoadBB);
+         PI != PE; ++PI)
+      if (!AvailablePredSet.count(*PI))
+        PredsToSplit.push_back(*PI);
+    
+    // Split them out to their own block.
+    UnavailablePred =
+      SplitBlockPredecessors(LoadBB, &PredsToSplit[0], PredsToSplit.size(),
+                             "thread-split", this);
+  }
+  
+  // If the value isn't available in all predecessors, then there will be
+  // exactly one where it isn't available.  Insert a load on that edge and add
+  // it to the AvailablePreds list.
+  if (UnavailablePred) {
+    assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 &&
+           "Can't handle critical edge here!");
+    Value *NewVal = new LoadInst(LoadedPtr, LI->getName()+".pr",
+                                 UnavailablePred->getTerminator());
+    AvailablePreds.push_back(std::make_pair(UnavailablePred, NewVal));
+  }
+  
+  // Now we know that each predecessor of this block has a value in
+  // AvailablePreds, sort them for efficient access as we're walking the preds.
+  array_pod_sort(AvailablePreds.begin(), AvailablePreds.end());
+  
+  // Create a PHI node at the start of the block for the PRE'd load value.
+  PHINode *PN = PHINode::Create(LI->getType(), "", LoadBB->begin());
+  PN->takeName(LI);
+  
+  // Insert new entries into the PHI for each predecessor.  A single block may
+  // have multiple entries here.
+  for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB); PI != E;
+       ++PI) {
+    AvailablePredsTy::iterator I = 
+      std::lower_bound(AvailablePreds.begin(), AvailablePreds.end(),
+                       std::make_pair(*PI, (Value*)0));
+    
+    assert(I != AvailablePreds.end() && I->first == *PI &&
+           "Didn't find entry for predecessor!");
+    
+    PN->addIncoming(I->second, I->first);
+  }
+  
+  //cerr << "PRE: " << *LI << *PN << "\n";
+  
+  LI->replaceAllUsesWith(PN);
+  LI->eraseFromParent();
+  
+  return true;
+}
+
+
+/// ProcessJumpOnPHI - We have a conditional branch of switch on a PHI node in
+/// the current block.  See if there are any simplifications we can do based on
+/// inputs to the phi node.
+/// 
+bool JumpThreading::ProcessJumpOnPHI(PHINode *PN) {
+  // See if the phi node has any constant values.  If so, we can determine where
+  // the corresponding predecessor will branch.
+  ConstantInt *PredCst = 0;
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+    if ((PredCst = dyn_cast<ConstantInt>(PN->getIncomingValue(i))))
+      break;
+  
+  // If no incoming value has a constant, we don't know the destination of any
+  // predecessors.
+  if (PredCst == 0)
+    return false;
+  
+  // See if the cost of duplicating this block is low enough.
+  BasicBlock *BB = PN->getParent();
+  unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB);
+  if (JumpThreadCost > Threshold) {
+    DOUT << "  Not threading BB '" << BB->getNameStart()
+         << "' - Cost is too high: " << JumpThreadCost << "\n";
+    return false;
+  }
+  
+  // If so, we can actually do this threading.  Merge any common predecessors
+  // that will act the same.
+  BasicBlock *PredBB = FactorCommonPHIPreds(PN, PredCst);
+  
+  // Next, figure out which successor we are threading to.
+  BasicBlock *SuccBB;
+  if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()))
+    SuccBB = BI->getSuccessor(PredCst == ConstantInt::getFalse());
+  else {
+    SwitchInst *SI = cast<SwitchInst>(BB->getTerminator());
+    SuccBB = SI->getSuccessor(SI->findCaseValue(PredCst));
+  }
+  
+  // Ok, try to thread it!
+  return ThreadEdge(BB, PredBB, SuccBB, JumpThreadCost);
+}
+
+/// ProcessJumpOnLogicalPHI - PN's basic block contains a conditional branch
+/// whose condition is an AND/OR where one side is PN.  If PN has constant
+/// operands that permit us to evaluate the condition for some operand, thread
+/// through the block.  For example with:
+///   br (and X, phi(Y, Z, false))
+/// the predecessor corresponding to the 'false' will always jump to the false
+/// destination of the branch.
+///
+bool JumpThreading::ProcessBranchOnLogical(Value *V, BasicBlock *BB,
+                                           bool isAnd) {
+  // If this is a binary operator tree of the same AND/OR opcode, check the
+  // LHS/RHS.
+  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V))
+    if ((isAnd && BO->getOpcode() == Instruction::And) ||
+        (!isAnd && BO->getOpcode() == Instruction::Or)) {
+      if (ProcessBranchOnLogical(BO->getOperand(0), BB, isAnd))
+        return true;
+      if (ProcessBranchOnLogical(BO->getOperand(1), BB, isAnd))
+        return true;
+    }
+      
+  // If this isn't a PHI node, we can't handle it.
+  PHINode *PN = dyn_cast<PHINode>(V);
+  if (!PN || PN->getParent() != BB) return false;
+                                             
+  // We can only do the simplification for phi nodes of 'false' with AND or
+  // 'true' with OR.  See if we have any entries in the phi for this.
+  unsigned PredNo = ~0U;
+  ConstantInt *PredCst = ConstantInt::get(Type::Int1Ty, !isAnd);
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+    if (PN->getIncomingValue(i) == PredCst) {
+      PredNo = i;
+      break;
+    }
+  }
+  
+  // If no match, bail out.
+  if (PredNo == ~0U)
+    return false;
+  
+  // See if the cost of duplicating this block is low enough.
+  unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB);
+  if (JumpThreadCost > Threshold) {
+    DOUT << "  Not threading BB '" << BB->getNameStart()
+         << "' - Cost is too high: " << JumpThreadCost << "\n";
+    return false;
+  }
+
+  // If so, we can actually do this threading.  Merge any common predecessors
+  // that will act the same.
+  BasicBlock *PredBB = FactorCommonPHIPreds(PN, PredCst);
+  
+  // Next, figure out which successor we are threading to.  If this was an AND,
+  // the constant must be FALSE, and we must be targeting the 'false' block.
+  // If this is an OR, the constant must be TRUE, and we must be targeting the
+  // 'true' block.
+  BasicBlock *SuccBB = BB->getTerminator()->getSuccessor(isAnd);
+  
+  // Ok, try to thread it!
+  return ThreadEdge(BB, PredBB, SuccBB, JumpThreadCost);
+}
+
+/// ProcessBranchOnCompare - We found a branch on a comparison between a phi
+/// node and a constant.  If the PHI node contains any constants as inputs, we
+/// can fold the compare for that edge and thread through it.
+bool JumpThreading::ProcessBranchOnCompare(CmpInst *Cmp, BasicBlock *BB) {
+  PHINode *PN = cast<PHINode>(Cmp->getOperand(0));
+  Constant *RHS = cast<Constant>(Cmp->getOperand(1));
+  
+  // If the phi isn't in the current block, an incoming edge to this block
+  // doesn't control the destination.
+  if (PN->getParent() != BB)
+    return false;
+  
+  // We can do this simplification if any comparisons fold to true or false.
+  // See if any do.
+  Constant *PredCst = 0;
+  bool TrueDirection = false;
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+    PredCst = dyn_cast<Constant>(PN->getIncomingValue(i));
+    if (PredCst == 0) continue;
+    
+    Constant *Res;
+    if (ICmpInst *ICI = dyn_cast<ICmpInst>(Cmp))
+      Res = ConstantExpr::getICmp(ICI->getPredicate(), PredCst, RHS);
+    else
+      Res = ConstantExpr::getFCmp(cast<FCmpInst>(Cmp)->getPredicate(),
+                                  PredCst, RHS);
+    // If this folded to a constant expr, we can't do anything.
+    if (ConstantInt *ResC = dyn_cast<ConstantInt>(Res)) {
+      TrueDirection = ResC->getZExtValue();
+      break;
+    }
+    // If this folded to undef, just go the false way.
+    if (isa<UndefValue>(Res)) {
+      TrueDirection = false;
+      break;
+    }
+    
+    // Otherwise, we can't fold this input.
+    PredCst = 0;
+  }
+  
+  // If no match, bail out.
+  if (PredCst == 0)
+    return false;
+  
+  // See if the cost of duplicating this block is low enough.
+  unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB);
+  if (JumpThreadCost > Threshold) {
+    DOUT << "  Not threading BB '" << BB->getNameStart()
+         << "' - Cost is too high: " << JumpThreadCost << "\n";
+    return false;
+  }
+  
+  // If so, we can actually do this threading.  Merge any common predecessors
+  // that will act the same.
+  BasicBlock *PredBB = FactorCommonPHIPreds(PN, PredCst);
+  
+  // Next, get our successor.
+  BasicBlock *SuccBB = BB->getTerminator()->getSuccessor(!TrueDirection);
+  
+  // Ok, try to thread it!
+  return ThreadEdge(BB, PredBB, SuccBB, JumpThreadCost);
+}
+
+
+/// ThreadEdge - We have decided that it is safe and profitable to thread an
+/// edge from PredBB to SuccBB across BB.  Transform the IR to reflect this
+/// change.
+bool JumpThreading::ThreadEdge(BasicBlock *BB, BasicBlock *PredBB, 
+                               BasicBlock *SuccBB, unsigned JumpThreadCost) {
+
+  // If threading to the same block as we come from, we would infinite loop.
+  if (SuccBB == BB) {
+    DOUT << "  Not threading across BB '" << BB->getNameStart()
+         << "' - would thread to self!\n";
+    return false;
+  }
+  
+  // If threading this would thread across a loop header, don't thread the edge.
+  // See the comments above FindLoopHeaders for justifications and caveats.
+  if (LoopHeaders.count(BB)) {
+    DOUT << "  Not threading from '" << PredBB->getNameStart()
+         << "' across loop header BB '" << BB->getNameStart()
+         << "' to dest BB '" << SuccBB->getNameStart()
+         << "' - it might create an irreducible loop!\n";
+    return false;
+  }
+
+  // And finally, do it!
+  DOUT << "  Threading edge from '" << PredBB->getNameStart() << "' to '"
+       << SuccBB->getNameStart() << "' with cost: " << JumpThreadCost
+       << ", across block:\n    "
+       << *BB << "\n";
+  
+  // Jump Threading can not update SSA properties correctly if the values
+  // defined in the duplicated block are used outside of the block itself.  For
+  // this reason, we spill all values that are used outside of BB to the stack.
+  for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) {
+    if (!I->isUsedOutsideOfBlock(BB))
+      continue;
+    
+    // We found a use of I outside of BB.  Create a new stack slot to
+    // break this inter-block usage pattern.
+    DemoteRegToStack(*I);
+  }
+ 
+  // We are going to have to map operands from the original BB block to the new
+  // copy of the block 'NewBB'.  If there are PHI nodes in BB, evaluate them to
+  // account for entry from PredBB.
+  DenseMap<Instruction*, Value*> ValueMapping;
+  
+  BasicBlock *NewBB =
+    BasicBlock::Create(BB->getName()+".thread", BB->getParent(), BB);
+  NewBB->moveAfter(PredBB);
+  
+  BasicBlock::iterator BI = BB->begin();
+  for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+    ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
+  
+  // Clone the non-phi instructions of BB into NewBB, keeping track of the
+  // mapping and using it to remap operands in the cloned instructions.
+  for (; !isa<TerminatorInst>(BI); ++BI) {
+    Instruction *New = BI->clone();
+    New->setName(BI->getNameStart());
+    NewBB->getInstList().push_back(New);
+    ValueMapping[BI] = New;
+   
+    // Remap operands to patch up intra-block references.
+    for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+      if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i)))
+        if (Value *Remapped = ValueMapping[Inst])
+          New->setOperand(i, Remapped);
+  }
+  
+  // We didn't copy the terminator from BB over to NewBB, because there is now
+  // an unconditional jump to SuccBB.  Insert the unconditional jump.
+  BranchInst::Create(SuccBB, NewBB);
+  
+  // Check to see if SuccBB has PHI nodes. If so, we need to add entries to the
+  // PHI nodes for NewBB now.
+  for (BasicBlock::iterator PNI = SuccBB->begin(); isa<PHINode>(PNI); ++PNI) {
+    PHINode *PN = cast<PHINode>(PNI);
+    // Ok, we have a PHI node.  Figure out what the incoming value was for the
+    // DestBlock.
+    Value *IV = PN->getIncomingValueForBlock(BB);
+    
+    // Remap the value if necessary.
+    if (Instruction *Inst = dyn_cast<Instruction>(IV))
+      if (Value *MappedIV = ValueMapping[Inst])
+        IV = MappedIV;
+    PN->addIncoming(IV, NewBB);
+  }
+  
+  // Ok, NewBB is good to go.  Update the terminator of PredBB to jump to
+  // NewBB instead of BB.  This eliminates predecessors from BB, which requires
+  // us to simplify any PHI nodes in BB.
+  TerminatorInst *PredTerm = PredBB->getTerminator();
+  for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i)
+    if (PredTerm->getSuccessor(i) == BB) {
+      BB->removePredecessor(PredBB);
+      PredTerm->setSuccessor(i, NewBB);
+    }
+  
+  // At this point, the IR is fully up to date and consistent.  Do a quick scan
+  // over the new instructions and zap any that are constants or dead.  This
+  // frequently happens because of phi translation.
+  BI = NewBB->begin();
+  for (BasicBlock::iterator E = NewBB->end(); BI != E; ) {
+    Instruction *Inst = BI++;
+    if (Constant *C = ConstantFoldInstruction(Inst, TD)) {
+      Inst->replaceAllUsesWith(C);
+      Inst->eraseFromParent();
+      continue;
+    }
+    
+    RecursivelyDeleteTriviallyDeadInstructions(Inst);
+  }
+  
+  // Threaded an edge!
+  ++NumThreads;
+  return true;
+}
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
new file mode 100644
index 0000000..1021469
--- /dev/null
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -0,0 +1,885 @@
+//===-- LICM.cpp - Loop Invariant Code Motion Pass ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs loop invariant code motion, attempting to remove as much
+// code from the body of a loop as possible.  It does this by either hoisting
+// code into the preheader block, or by sinking code to the exit blocks if it is
+// safe.  This pass also promotes must-aliased memory locations in the loop to
+// live in registers, thus hoisting and sinking "invariant" loads and stores.
+//
+// This pass uses alias analysis for two purposes:
+//
+//  1. Moving loop invariant loads and calls out of loops.  If we can determine
+//     that a load or call inside of a loop never aliases anything stored to,
+//     we can hoist it or sink it like any other instruction.
+//  2. Scalar Promotion of Memory - If there is a store instruction inside of
+//     the loop, we try to move the store to happen AFTER the loop instead of
+//     inside of the loop.  This can only happen if a few conditions are true:
+//       A. The pointer stored through is loop invariant
+//       B. There are no stores or loads in the loop which _may_ alias the
+//          pointer.  There are no calls in the loop which mod/ref the pointer.
+//     If these conditions are true, we can promote the loads and stores in the
+//     loop of the pointer to use a temporary alloca'd variable.  We then use
+//     the mem2reg functionality to construct the appropriate SSA form for the
+//     variable.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "licm"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/Statistic.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumSunk      , "Number of instructions sunk out of loop");
+STATISTIC(NumHoisted   , "Number of instructions hoisted out of loop");
+STATISTIC(NumMovedLoads, "Number of load insts hoisted or sunk");
+STATISTIC(NumMovedCalls, "Number of call insts hoisted or sunk");
+STATISTIC(NumPromoted  , "Number of memory locations promoted to registers");
+
+static cl::opt<bool>
+DisablePromotion("disable-licm-promotion", cl::Hidden,
+                 cl::desc("Disable memory promotion in LICM pass"));
+
+// This feature is currently disabled by default because CodeGen is not yet
+// capable of rematerializing these constants in PIC mode, so it can lead to
+// degraded performance. Compile test/CodeGen/X86/remat-constant.ll with
+// -relocation-model=pic to see an example of this.
+static cl::opt<bool>
+EnableLICMConstantMotion("enable-licm-constant-variables", cl::Hidden,
+                         cl::desc("Enable hoisting/sinking of constant "
+                                  "global variables"));
+
+namespace {
+  struct VISIBILITY_HIDDEN LICM : public LoopPass {
+    static char ID; // Pass identification, replacement for typeid
+    LICM() : LoopPass(&ID) {}
+
+    virtual bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+    /// This transformation requires natural loop information & requires that
+    /// loop preheaders be inserted into the CFG...
+    ///
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequiredID(LoopSimplifyID);
+      AU.addRequired<LoopInfo>();
+      AU.addRequired<DominatorTree>();
+      AU.addRequired<DominanceFrontier>();  // For scalar promotion (mem2reg)
+      AU.addRequired<AliasAnalysis>();
+      AU.addPreserved<ScalarEvolution>();
+      AU.addPreserved<DominanceFrontier>();
+    }
+
+    bool doFinalization() {
+      // Free the values stored in the map
+      for (std::map<Loop *, AliasSetTracker *>::iterator
+             I = LoopToAliasMap.begin(), E = LoopToAliasMap.end(); I != E; ++I)
+        delete I->second;
+
+      LoopToAliasMap.clear();
+      return false;
+    }
+
+  private:
+    // Various analyses that we use...
+    AliasAnalysis *AA;       // Current AliasAnalysis information
+    LoopInfo      *LI;       // Current LoopInfo
+    DominatorTree *DT;       // Dominator Tree for the current Loop...
+    DominanceFrontier *DF;   // Current Dominance Frontier
+
+    // State that is updated as we process loops
+    bool Changed;            // Set to true when we change anything.
+    BasicBlock *Preheader;   // The preheader block of the current loop...
+    Loop *CurLoop;           // The current loop we are working on...
+    AliasSetTracker *CurAST; // AliasSet information for the current loop...
+    std::map<Loop *, AliasSetTracker *> LoopToAliasMap;
+
+    /// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info.
+    void cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L);
+
+    /// deleteAnalysisValue - Simple Analysis hook. Delete value V from alias
+    /// set.
+    void deleteAnalysisValue(Value *V, Loop *L);
+
+    /// SinkRegion - Walk the specified region of the CFG (defined by all blocks
+    /// dominated by the specified block, and that are in the current loop) in
+    /// reverse depth first order w.r.t the DominatorTree.  This allows us to
+    /// visit uses before definitions, allowing us to sink a loop body in one
+    /// pass without iteration.
+    ///
+    void SinkRegion(DomTreeNode *N);
+
+    /// HoistRegion - Walk the specified region of the CFG (defined by all
+    /// blocks dominated by the specified block, and that are in the current
+    /// loop) in depth first order w.r.t the DominatorTree.  This allows us to
+    /// visit definitions before uses, allowing us to hoist a loop body in one
+    /// pass without iteration.
+    ///
+    void HoistRegion(DomTreeNode *N);
+
+    /// inSubLoop - Little predicate that returns true if the specified basic
+    /// block is in a subloop of the current one, not the current one itself.
+    ///
+    bool inSubLoop(BasicBlock *BB) {
+      assert(CurLoop->contains(BB) && "Only valid if BB is IN the loop");
+      for (Loop::iterator I = CurLoop->begin(), E = CurLoop->end(); I != E; ++I)
+        if ((*I)->contains(BB))
+          return true;  // A subloop actually contains this block!
+      return false;
+    }
+
+    /// isExitBlockDominatedByBlockInLoop - This method checks to see if the
+    /// specified exit block of the loop is dominated by the specified block
+    /// that is in the body of the loop.  We use these constraints to
+    /// dramatically limit the amount of the dominator tree that needs to be
+    /// searched.
+    bool isExitBlockDominatedByBlockInLoop(BasicBlock *ExitBlock,
+                                           BasicBlock *BlockInLoop) const {
+      // If the block in the loop is the loop header, it must be dominated!
+      BasicBlock *LoopHeader = CurLoop->getHeader();
+      if (BlockInLoop == LoopHeader)
+        return true;
+
+      DomTreeNode *BlockInLoopNode = DT->getNode(BlockInLoop);
+      DomTreeNode *IDom            = DT->getNode(ExitBlock);
+
+      // Because the exit block is not in the loop, we know we have to get _at
+      // least_ its immediate dominator.
+      do {
+        // Get next Immediate Dominator.
+        IDom = IDom->getIDom();
+
+        // If we have got to the header of the loop, then the instructions block
+        // did not dominate the exit node, so we can't hoist it.
+        if (IDom->getBlock() == LoopHeader)
+          return false;
+
+      } while (IDom != BlockInLoopNode);
+
+      return true;
+    }
+
+    /// sink - When an instruction is found to only be used outside of the loop,
+    /// this function moves it to the exit blocks and patches up SSA form as
+    /// needed.
+    ///
+    void sink(Instruction &I);
+
+    /// hoist - When an instruction is found to only use loop invariant operands
+    /// that is safe to hoist, this instruction is called to do the dirty work.
+    ///
+    void hoist(Instruction &I);
+
+    /// isSafeToExecuteUnconditionally - Only sink or hoist an instruction if it
+    /// is not a trapping instruction or if it is a trapping instruction and is
+    /// guaranteed to execute.
+    ///
+    bool isSafeToExecuteUnconditionally(Instruction &I);
+
+    /// pointerInvalidatedByLoop - Return true if the body of this loop may
+    /// store into the memory location pointed to by V.
+    ///
+    bool pointerInvalidatedByLoop(Value *V, unsigned Size) {
+      // Check to see if any of the basic blocks in CurLoop invalidate *V.
+      return CurAST->getAliasSetForPointer(V, Size).isMod();
+    }
+
+    bool canSinkOrHoistInst(Instruction &I);
+    bool isLoopInvariantInst(Instruction &I);
+    bool isNotUsedInLoop(Instruction &I);
+
+    /// PromoteValuesInLoop - Look at the stores in the loop and promote as many
+    /// to scalars as we can.
+    ///
+    void PromoteValuesInLoop();
+
+    /// FindPromotableValuesInLoop - Check the current loop for stores to
+    /// definite pointers, which are not loaded and stored through may aliases.
+    /// If these are found, create an alloca for the value, add it to the
+    /// PromotedValues list, and keep track of the mapping from value to
+    /// alloca...
+    ///
+    void FindPromotableValuesInLoop(
+                   std::vector<std::pair<AllocaInst*, Value*> > &PromotedValues,
+                                    std::map<Value*, AllocaInst*> &Val2AlMap);
+  };
+}
+
+char LICM::ID = 0;
+static RegisterPass<LICM> X("licm", "Loop Invariant Code Motion");
+
+Pass *llvm::createLICMPass() { return new LICM(); }
+
+/// Hoist expressions out of the specified loop. Note, alias info for inner
+/// loop is not preserved so it is not a good idea to run LICM multiple 
+/// times on one loop.
+///
+bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
+  Changed = false;
+
+  // Get our Loop and Alias Analysis information...
+  LI = &getAnalysis<LoopInfo>();
+  AA = &getAnalysis<AliasAnalysis>();
+  DF = &getAnalysis<DominanceFrontier>();
+  DT = &getAnalysis<DominatorTree>();
+
+  CurAST = new AliasSetTracker(*AA);
+  // Collect Alias info from subloops
+  for (Loop::iterator LoopItr = L->begin(), LoopItrE = L->end();
+       LoopItr != LoopItrE; ++LoopItr) {
+    Loop *InnerL = *LoopItr;
+    AliasSetTracker *InnerAST = LoopToAliasMap[InnerL];
+    assert (InnerAST && "Where is my AST?");
+
+    // What if InnerLoop was modified by other passes ?
+    CurAST->add(*InnerAST);
+  }
+  
+  CurLoop = L;
+
+  // Get the preheader block to move instructions into...
+  Preheader = L->getLoopPreheader();
+  assert(Preheader&&"Preheader insertion pass guarantees we have a preheader!");
+
+  // Loop over the body of this loop, looking for calls, invokes, and stores.
+  // Because subloops have already been incorporated into AST, we skip blocks in
+  // subloops.
+  //
+  for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+       I != E; ++I) {
+    BasicBlock *BB = *I;
+    if (LI->getLoopFor(BB) == L)        // Ignore blocks in subloops...
+      CurAST->add(*BB);                 // Incorporate the specified basic block
+  }
+
+  // We want to visit all of the instructions in this loop... that are not parts
+  // of our subloops (they have already had their invariants hoisted out of
+  // their loop, into this loop, so there is no need to process the BODIES of
+  // the subloops).
+  //
+  // Traverse the body of the loop in depth first order on the dominator tree so
+  // that we are guaranteed to see definitions before we see uses.  This allows
+  // us to sink instructions in one pass, without iteration.  After sinking
+  // instructions, we perform another pass to hoist them out of the loop.
+  //
+  SinkRegion(DT->getNode(L->getHeader()));
+  HoistRegion(DT->getNode(L->getHeader()));
+
+  // Now that all loop invariants have been removed from the loop, promote any
+  // memory references to scalars that we can...
+  if (!DisablePromotion)
+    PromoteValuesInLoop();
+
+  // Clear out loops state information for the next iteration
+  CurLoop = 0;
+  Preheader = 0;
+
+  LoopToAliasMap[L] = CurAST;
+  return Changed;
+}
+
+/// SinkRegion - Walk the specified region of the CFG (defined by all blocks
+/// dominated by the specified block, and that are in the current loop) in
+/// reverse depth first order w.r.t the DominatorTree.  This allows us to visit
+/// uses before definitions, allowing us to sink a loop body in one pass without
+/// iteration.
+///
+void LICM::SinkRegion(DomTreeNode *N) {
+  assert(N != 0 && "Null dominator tree node?");
+  BasicBlock *BB = N->getBlock();
+
+  // If this subregion is not in the top level loop at all, exit.
+  if (!CurLoop->contains(BB)) return;
+
+  // We are processing blocks in reverse dfo, so process children first...
+  const std::vector<DomTreeNode*> &Children = N->getChildren();
+  for (unsigned i = 0, e = Children.size(); i != e; ++i)
+    SinkRegion(Children[i]);
+
+  // Only need to process the contents of this block if it is not part of a
+  // subloop (which would already have been processed).
+  if (inSubLoop(BB)) return;
+
+  for (BasicBlock::iterator II = BB->end(); II != BB->begin(); ) {
+    Instruction &I = *--II;
+
+    // Check to see if we can sink this instruction to the exit blocks
+    // of the loop.  We can do this if the all users of the instruction are
+    // outside of the loop.  In this case, it doesn't even matter if the
+    // operands of the instruction are loop invariant.
+    //
+    if (isNotUsedInLoop(I) && canSinkOrHoistInst(I)) {
+      ++II;
+      sink(I);
+    }
+  }
+}
+
+
+/// HoistRegion - Walk the specified region of the CFG (defined by all blocks
+/// dominated by the specified block, and that are in the current loop) in depth
+/// first order w.r.t the DominatorTree.  This allows us to visit definitions
+/// before uses, allowing us to hoist a loop body in one pass without iteration.
+///
+void LICM::HoistRegion(DomTreeNode *N) {
+  assert(N != 0 && "Null dominator tree node?");
+  BasicBlock *BB = N->getBlock();
+
+  // If this subregion is not in the top level loop at all, exit.
+  if (!CurLoop->contains(BB)) return;
+
+  // Only need to process the contents of this block if it is not part of a
+  // subloop (which would already have been processed).
+  if (!inSubLoop(BB))
+    for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ) {
+      Instruction &I = *II++;
+
+      // Try hoisting the instruction out to the preheader.  We can only do this
+      // if all of the operands of the instruction are loop invariant and if it
+      // is safe to hoist the instruction.
+      //
+      if (isLoopInvariantInst(I) && canSinkOrHoistInst(I) &&
+          isSafeToExecuteUnconditionally(I))
+        hoist(I);
+      }
+
+  const std::vector<DomTreeNode*> &Children = N->getChildren();
+  for (unsigned i = 0, e = Children.size(); i != e; ++i)
+    HoistRegion(Children[i]);
+}
+
+/// canSinkOrHoistInst - Return true if the hoister and sinker can handle this
+/// instruction.
+///
+bool LICM::canSinkOrHoistInst(Instruction &I) {
+  // Loads have extra constraints we have to verify before we can hoist them.
+  if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+    if (LI->isVolatile())
+      return false;        // Don't hoist volatile loads!
+
+    // Loads from constant memory are always safe to move, even if they end up
+    // in the same alias set as something that ends up being modified.
+    if (EnableLICMConstantMotion &&
+        AA->pointsToConstantMemory(LI->getOperand(0)))
+      return true;
+    
+    // Don't hoist loads which have may-aliased stores in loop.
+    unsigned Size = 0;
+    if (LI->getType()->isSized())
+      Size = AA->getTargetData().getTypeStoreSize(LI->getType());
+    return !pointerInvalidatedByLoop(LI->getOperand(0), Size);
+  } else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+    // Handle obvious cases efficiently.
+    AliasAnalysis::ModRefBehavior Behavior = AA->getModRefBehavior(CI);
+    if (Behavior == AliasAnalysis::DoesNotAccessMemory)
+      return true;
+    else if (Behavior == AliasAnalysis::OnlyReadsMemory) {
+      // If this call only reads from memory and there are no writes to memory
+      // in the loop, we can hoist or sink the call as appropriate.
+      bool FoundMod = false;
+      for (AliasSetTracker::iterator I = CurAST->begin(), E = CurAST->end();
+           I != E; ++I) {
+        AliasSet &AS = *I;
+        if (!AS.isForwardingAliasSet() && AS.isMod()) {
+          FoundMod = true;
+          break;
+        }
+      }
+      if (!FoundMod) return true;
+    }
+
+    // FIXME: This should use mod/ref information to see if we can hoist or sink
+    // the call.
+
+    return false;
+  }
+
+  // Otherwise these instructions are hoistable/sinkable
+  return isa<BinaryOperator>(I) || isa<CastInst>(I) ||
+         isa<SelectInst>(I) || isa<GetElementPtrInst>(I) || isa<CmpInst>(I) ||
+         isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
+         isa<ShuffleVectorInst>(I);
+}
+
+/// isNotUsedInLoop - Return true if the only users of this instruction are
+/// outside of the loop.  If this is true, we can sink the instruction to the
+/// exit blocks of the loop.
+///
+bool LICM::isNotUsedInLoop(Instruction &I) {
+  for (Value::use_iterator UI = I.use_begin(), E = I.use_end(); UI != E; ++UI) {
+    Instruction *User = cast<Instruction>(*UI);
+    if (PHINode *PN = dyn_cast<PHINode>(User)) {
+      // PHI node uses occur in predecessor blocks!
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+        if (PN->getIncomingValue(i) == &I)
+          if (CurLoop->contains(PN->getIncomingBlock(i)))
+            return false;
+    } else if (CurLoop->contains(User->getParent())) {
+      return false;
+    }
+  }
+  return true;
+}
+
+
+/// isLoopInvariantInst - Return true if all operands of this instruction are
+/// loop invariant.  We also filter out non-hoistable instructions here just for
+/// efficiency.
+///
+bool LICM::isLoopInvariantInst(Instruction &I) {
+  // The instruction is loop invariant if all of its operands are loop-invariant
+  for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i)
+    if (!CurLoop->isLoopInvariant(I.getOperand(i)))
+      return false;
+
+  // If we got this far, the instruction is loop invariant!
+  return true;
+}
+
+/// sink - When an instruction is found to only be used outside of the loop,
+/// this function moves it to the exit blocks and patches up SSA form as needed.
+/// This method is guaranteed to remove the original instruction from its
+/// position, and may either delete it or move it to outside of the loop.
+///
+void LICM::sink(Instruction &I) {
+  DOUT << "LICM sinking instruction: " << I;
+
+  SmallVector<BasicBlock*, 8> ExitBlocks;
+  CurLoop->getExitBlocks(ExitBlocks);
+
+  if (isa<LoadInst>(I)) ++NumMovedLoads;
+  else if (isa<CallInst>(I)) ++NumMovedCalls;
+  ++NumSunk;
+  Changed = true;
+
+  // The case where there is only a single exit node of this loop is common
+  // enough that we handle it as a special (more efficient) case.  It is more
+  // efficient to handle because there are no PHI nodes that need to be placed.
+  if (ExitBlocks.size() == 1) {
+    if (!isExitBlockDominatedByBlockInLoop(ExitBlocks[0], I.getParent())) {
+      // Instruction is not used, just delete it.
+      CurAST->deleteValue(&I);
+      if (!I.use_empty())  // If I has users in unreachable blocks, eliminate.
+        I.replaceAllUsesWith(UndefValue::get(I.getType()));
+      I.eraseFromParent();
+    } else {
+      // Move the instruction to the start of the exit block, after any PHI
+      // nodes in it.
+      I.removeFromParent();
+
+      BasicBlock::iterator InsertPt = ExitBlocks[0]->getFirstNonPHI();
+      ExitBlocks[0]->getInstList().insert(InsertPt, &I);
+    }
+  } else if (ExitBlocks.empty()) {
+    // The instruction is actually dead if there ARE NO exit blocks.
+    CurAST->deleteValue(&I);
+    if (!I.use_empty())  // If I has users in unreachable blocks, eliminate.
+      I.replaceAllUsesWith(UndefValue::get(I.getType()));
+    I.eraseFromParent();
+  } else {
+    // Otherwise, if we have multiple exits, use the PromoteMem2Reg function to
+    // do all of the hard work of inserting PHI nodes as necessary.  We convert
+    // the value into a stack object to get it to do this.
+
+    // Firstly, we create a stack object to hold the value...
+    AllocaInst *AI = 0;
+
+    if (I.getType() != Type::VoidTy) {
+      AI = new AllocaInst(I.getType(), 0, I.getName(),
+                          I.getParent()->getParent()->getEntryBlock().begin());
+      CurAST->add(AI);
+    }
+
+    // Secondly, insert load instructions for each use of the instruction
+    // outside of the loop.
+    while (!I.use_empty()) {
+      Instruction *U = cast<Instruction>(I.use_back());
+
+      // If the user is a PHI Node, we actually have to insert load instructions
+      // in all predecessor blocks, not in the PHI block itself!
+      if (PHINode *UPN = dyn_cast<PHINode>(U)) {
+        // Only insert into each predecessor once, so that we don't have
+        // different incoming values from the same block!
+        std::map<BasicBlock*, Value*> InsertedBlocks;
+        for (unsigned i = 0, e = UPN->getNumIncomingValues(); i != e; ++i)
+          if (UPN->getIncomingValue(i) == &I) {
+            BasicBlock *Pred = UPN->getIncomingBlock(i);
+            Value *&PredVal = InsertedBlocks[Pred];
+            if (!PredVal) {
+              // Insert a new load instruction right before the terminator in
+              // the predecessor block.
+              PredVal = new LoadInst(AI, "", Pred->getTerminator());
+              CurAST->add(cast<LoadInst>(PredVal));
+            }
+
+            UPN->setIncomingValue(i, PredVal);
+          }
+
+      } else {
+        LoadInst *L = new LoadInst(AI, "", U);
+        U->replaceUsesOfWith(&I, L);
+        CurAST->add(L);
+      }
+    }
+
+    // Thirdly, insert a copy of the instruction in each exit block of the loop
+    // that is dominated by the instruction, storing the result into the memory
+    // location.  Be careful not to insert the instruction into any particular
+    // basic block more than once.
+    std::set<BasicBlock*> InsertedBlocks;
+    BasicBlock *InstOrigBB = I.getParent();
+
+    for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+      BasicBlock *ExitBlock = ExitBlocks[i];
+
+      if (isExitBlockDominatedByBlockInLoop(ExitBlock, InstOrigBB)) {
+        // If we haven't already processed this exit block, do so now.
+        if (InsertedBlocks.insert(ExitBlock).second) {
+          // Insert the code after the last PHI node...
+          BasicBlock::iterator InsertPt = ExitBlock->getFirstNonPHI();
+
+          // If this is the first exit block processed, just move the original
+          // instruction, otherwise clone the original instruction and insert
+          // the copy.
+          Instruction *New;
+          if (InsertedBlocks.size() == 1) {
+            I.removeFromParent();
+            ExitBlock->getInstList().insert(InsertPt, &I);
+            New = &I;
+          } else {
+            New = I.clone();
+            CurAST->copyValue(&I, New);
+            if (!I.getName().empty())
+              New->setName(I.getName()+".le");
+            ExitBlock->getInstList().insert(InsertPt, New);
+          }
+
+          // Now that we have inserted the instruction, store it into the alloca
+          if (AI) new StoreInst(New, AI, InsertPt);
+        }
+      }
+    }
+
+    // If the instruction doesn't dominate any exit blocks, it must be dead.
+    if (InsertedBlocks.empty()) {
+      CurAST->deleteValue(&I);
+      I.eraseFromParent();
+    }
+
+    // Finally, promote the fine value to SSA form.
+    if (AI) {
+      std::vector<AllocaInst*> Allocas;
+      Allocas.push_back(AI);
+      PromoteMemToReg(Allocas, *DT, *DF, CurAST);
+    }
+  }
+}
+
+/// hoist - When an instruction is found to only use loop invariant operands
+/// that is safe to hoist, this instruction is called to do the dirty work.
+///
+void LICM::hoist(Instruction &I) {
+  DOUT << "LICM hoisting to " << Preheader->getName() << ": " << I;
+
+  // Remove the instruction from its current basic block... but don't delete the
+  // instruction.
+  I.removeFromParent();
+
+  // Insert the new node in Preheader, before the terminator.
+  Preheader->getInstList().insert(Preheader->getTerminator(), &I);
+
+  if (isa<LoadInst>(I)) ++NumMovedLoads;
+  else if (isa<CallInst>(I)) ++NumMovedCalls;
+  ++NumHoisted;
+  Changed = true;
+}
+
+/// isSafeToExecuteUnconditionally - Only sink or hoist an instruction if it is
+/// not a trapping instruction or if it is a trapping instruction and is
+/// guaranteed to execute.
+///
+bool LICM::isSafeToExecuteUnconditionally(Instruction &Inst) {
+  // If it is not a trapping instruction, it is always safe to hoist.
+  if (!Inst.isTrapping()) return true;
+
+  // Otherwise we have to check to make sure that the instruction dominates all
+  // of the exit blocks.  If it doesn't, then there is a path out of the loop
+  // which does not execute this instruction, so we can't hoist it.
+
+  // If the instruction is in the header block for the loop (which is very
+  // common), it is always guaranteed to dominate the exit blocks.  Since this
+  // is a common case, and can save some work, check it now.
+  if (Inst.getParent() == CurLoop->getHeader())
+    return true;
+
+  // It's always safe to load from a global or alloca.
+  if (isa<LoadInst>(Inst))
+    if (isa<AllocationInst>(Inst.getOperand(0)) ||
+        isa<GlobalVariable>(Inst.getOperand(0)))
+      return true;
+
+  // Get the exit blocks for the current loop.
+  SmallVector<BasicBlock*, 8> ExitBlocks;
+  CurLoop->getExitBlocks(ExitBlocks);
+
+  // For each exit block, get the DT node and walk up the DT until the
+  // instruction's basic block is found or we exit the loop.
+  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
+    if (!isExitBlockDominatedByBlockInLoop(ExitBlocks[i], Inst.getParent()))
+      return false;
+
+  return true;
+}
+
+
+/// PromoteValuesInLoop - Try to promote memory values to scalars by sinking
+/// stores out of the loop and moving loads to before the loop.  We do this by
+/// looping over the stores in the loop, looking for stores to Must pointers
+/// which are loop invariant.  We promote these memory locations to use allocas
+/// instead.  These allocas can easily be raised to register values by the
+/// PromoteMem2Reg functionality.
+///
+void LICM::PromoteValuesInLoop() {
+  // PromotedValues - List of values that are promoted out of the loop.  Each
+  // value has an alloca instruction for it, and a canonical version of the
+  // pointer.
+  std::vector<std::pair<AllocaInst*, Value*> > PromotedValues;
+  std::map<Value*, AllocaInst*> ValueToAllocaMap; // Map of ptr to alloca
+
+  FindPromotableValuesInLoop(PromotedValues, ValueToAllocaMap);
+  if (ValueToAllocaMap.empty()) return;   // If there are values to promote.
+
+  Changed = true;
+  NumPromoted += PromotedValues.size();
+
+  std::vector<Value*> PointerValueNumbers;
+
+  // Emit a copy from the value into the alloca'd value in the loop preheader
+  TerminatorInst *LoopPredInst = Preheader->getTerminator();
+  for (unsigned i = 0, e = PromotedValues.size(); i != e; ++i) {
+    Value *Ptr = PromotedValues[i].second;
+
+    // If we are promoting a pointer value, update alias information for the
+    // inserted load.
+    Value *LoadValue = 0;
+    if (isa<PointerType>(cast<PointerType>(Ptr->getType())->getElementType())) {
+      // Locate a load or store through the pointer, and assign the same value
+      // to LI as we are loading or storing.  Since we know that the value is
+      // stored in this loop, this will always succeed.
+      for (Value::use_iterator UI = Ptr->use_begin(), E = Ptr->use_end();
+           UI != E; ++UI)
+        if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) {
+          LoadValue = LI;
+          break;
+        } else if (StoreInst *SI = dyn_cast<StoreInst>(*UI)) {
+          if (SI->getOperand(1) == Ptr) {
+            LoadValue = SI->getOperand(0);
+            break;
+          }
+        }
+      assert(LoadValue && "No store through the pointer found!");
+      PointerValueNumbers.push_back(LoadValue);  // Remember this for later.
+    }
+
+    // Load from the memory we are promoting.
+    LoadInst *LI = new LoadInst(Ptr, Ptr->getName()+".promoted", LoopPredInst);
+
+    if (LoadValue) CurAST->copyValue(LoadValue, LI);
+
+    // Store into the temporary alloca.
+    new StoreInst(LI, PromotedValues[i].first, LoopPredInst);
+  }
+
+  // Scan the basic blocks in the loop, replacing uses of our pointers with
+  // uses of the allocas in question.
+  //
+  for (Loop::block_iterator I = CurLoop->block_begin(),
+         E = CurLoop->block_end(); I != E; ++I) {
+    BasicBlock *BB = *I;
+    // Rewrite all loads and stores in the block of the pointer...
+    for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ++II) {
+      if (LoadInst *L = dyn_cast<LoadInst>(II)) {
+        std::map<Value*, AllocaInst*>::iterator
+          I = ValueToAllocaMap.find(L->getOperand(0));
+        if (I != ValueToAllocaMap.end())
+          L->setOperand(0, I->second);    // Rewrite load instruction...
+      } else if (StoreInst *S = dyn_cast<StoreInst>(II)) {
+        std::map<Value*, AllocaInst*>::iterator
+          I = ValueToAllocaMap.find(S->getOperand(1));
+        if (I != ValueToAllocaMap.end())
+          S->setOperand(1, I->second);    // Rewrite store instruction...
+      }
+    }
+  }
+
+  // Now that the body of the loop uses the allocas instead of the original
+  // memory locations, insert code to copy the alloca value back into the
+  // original memory location on all exits from the loop.  Note that we only
+  // want to insert one copy of the code in each exit block, though the loop may
+  // exit to the same block more than once.
+  //
+  SmallPtrSet<BasicBlock*, 16> ProcessedBlocks;
+
+  SmallVector<BasicBlock*, 8> ExitBlocks;
+  CurLoop->getExitBlocks(ExitBlocks);
+  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+    if (!ProcessedBlocks.insert(ExitBlocks[i]))
+      continue;
+  
+    // Copy all of the allocas into their memory locations.
+    BasicBlock::iterator BI = ExitBlocks[i]->getFirstNonPHI();
+    Instruction *InsertPos = BI;
+    unsigned PVN = 0;
+    for (unsigned i = 0, e = PromotedValues.size(); i != e; ++i) {
+      // Load from the alloca.
+      LoadInst *LI = new LoadInst(PromotedValues[i].first, "", InsertPos);
+
+      // If this is a pointer type, update alias info appropriately.
+      if (isa<PointerType>(LI->getType()))
+        CurAST->copyValue(PointerValueNumbers[PVN++], LI);
+
+      // Store into the memory we promoted.
+      new StoreInst(LI, PromotedValues[i].second, InsertPos);
+    }
+  }
+
+  // Now that we have done the deed, use the mem2reg functionality to promote
+  // all of the new allocas we just created into real SSA registers.
+  //
+  std::vector<AllocaInst*> PromotedAllocas;
+  PromotedAllocas.reserve(PromotedValues.size());
+  for (unsigned i = 0, e = PromotedValues.size(); i != e; ++i)
+    PromotedAllocas.push_back(PromotedValues[i].first);
+  PromoteMemToReg(PromotedAllocas, *DT, *DF, CurAST);
+}
+
+/// FindPromotableValuesInLoop - Check the current loop for stores to definite
+/// pointers, which are not loaded and stored through may aliases and are safe
+/// for promotion.  If these are found, create an alloca for the value, add it 
+/// to the PromotedValues list, and keep track of the mapping from value to 
+/// alloca. 
+void LICM::FindPromotableValuesInLoop(
+                   std::vector<std::pair<AllocaInst*, Value*> > &PromotedValues,
+                             std::map<Value*, AllocaInst*> &ValueToAllocaMap) {
+  Instruction *FnStart = CurLoop->getHeader()->getParent()->begin()->begin();
+
+  // Loop over all of the alias sets in the tracker object.
+  for (AliasSetTracker::iterator I = CurAST->begin(), E = CurAST->end();
+       I != E; ++I) {
+    AliasSet &AS = *I;
+    // We can promote this alias set if it has a store, if it is a "Must" alias
+    // set, if the pointer is loop invariant, and if we are not eliminating any
+    // volatile loads or stores.
+    if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() ||
+        AS.isVolatile() || !CurLoop->isLoopInvariant(AS.begin()->getValue()))
+      continue;
+    
+    assert(!AS.empty() &&
+           "Must alias set should have at least one pointer element in it!");
+    Value *V = AS.begin()->getValue();
+
+    // Check that all of the pointers in the alias set have the same type.  We
+    // cannot (yet) promote a memory location that is loaded and stored in
+    // different sizes.
+    {
+      bool PointerOk = true;
+      for (AliasSet::iterator I = AS.begin(), E = AS.end(); I != E; ++I)
+        if (V->getType() != I->getValue()->getType()) {
+          PointerOk = false;
+          break;
+        }
+      if (!PointerOk)
+        continue;
+    }
+
+    // It isn't safe to promote a load/store from the loop if the load/store is
+    // conditional.  For example, turning:
+    //
+    //    for () { if (c) *P += 1; }
+    //
+    // into:
+    //
+    //    tmp = *P;  for () { if (c) tmp +=1; } *P = tmp;
+    //
+    // is not safe, because *P may only be valid to access if 'c' is true.
+    // 
+    // It is safe to promote P if all uses are direct load/stores and if at
+    // least one is guaranteed to be executed.
+    bool GuaranteedToExecute = false;
+    bool InvalidInst = false;
+    for (Value::use_iterator UI = V->use_begin(), UE = V->use_end();
+         UI != UE; ++UI) {
+      // Ignore instructions not in this loop.
+      Instruction *Use = dyn_cast<Instruction>(*UI);
+      if (!Use || !CurLoop->contains(Use->getParent()))
+        continue;
+
+      if (!isa<LoadInst>(Use) && !isa<StoreInst>(Use)) {
+        InvalidInst = true;
+        break;
+      }
+      
+      if (!GuaranteedToExecute)
+        GuaranteedToExecute = isSafeToExecuteUnconditionally(*Use);
+    }
+
+    // If there is an non-load/store instruction in the loop, we can't promote
+    // it.  If there isn't a guaranteed-to-execute instruction, we can't
+    // promote.
+    if (InvalidInst || !GuaranteedToExecute)
+      continue;
+    
+    const Type *Ty = cast<PointerType>(V->getType())->getElementType();
+    AllocaInst *AI = new AllocaInst(Ty, 0, V->getName()+".tmp", FnStart);
+    PromotedValues.push_back(std::make_pair(AI, V));
+
+    // Update the AST and alias analysis.
+    CurAST->copyValue(V, AI);
+
+    for (AliasSet::iterator I = AS.begin(), E = AS.end(); I != E; ++I)
+      ValueToAllocaMap.insert(std::make_pair(I->getValue(), AI));
+
+    DOUT << "LICM: Promoting value: " << *V << "\n";
+  }
+}
+
+/// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info.
+void LICM::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L) {
+  AliasSetTracker *AST = LoopToAliasMap[L];
+  if (!AST)
+    return;
+
+  AST->copyValue(From, To);
+}
+
+/// deleteAnalysisValue - Simple Analysis hook. Delete value V from alias
+/// set.
+void LICM::deleteAnalysisValue(Value *V, Loop *L) {
+  AliasSetTracker *AST = LoopToAliasMap[L];
+  if (!AST)
+    return;
+
+  AST->deleteValue(V);
+}
diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp
new file mode 100644
index 0000000..6512672
--- /dev/null
+++ b/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -0,0 +1,280 @@
+//===- LoopDeletion.cpp - Dead Loop Deletion Pass ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Dead Loop Deletion Pass. This pass is responsible
+// for eliminating loops with non-infinite computable trip counts that have no
+// side effects or volatile instructions, and do not contribute to the
+// computation of the function's return value.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-delete"
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/SmallVector.h"
+
+using namespace llvm;
+
+STATISTIC(NumDeleted, "Number of loops deleted");
+
+namespace {
+  class VISIBILITY_HIDDEN LoopDeletion : public LoopPass {
+  public:
+    static char ID; // Pass ID, replacement for typeid
+    LoopDeletion() : LoopPass(&ID) {}
+    
+    // Possibly eliminate loop L if it is dead.
+    bool runOnLoop(Loop* L, LPPassManager& LPM);
+    
+    bool SingleDominatingExit(Loop* L,
+                              SmallVector<BasicBlock*, 4>& exitingBlocks);
+    bool IsLoopDead(Loop* L, SmallVector<BasicBlock*, 4>& exitingBlocks,
+                    SmallVector<BasicBlock*, 4>& exitBlocks);
+    bool IsLoopInvariantInst(Instruction *I, Loop* L);
+    
+    virtual void getAnalysisUsage(AnalysisUsage& AU) const {
+      AU.addRequired<ScalarEvolution>();
+      AU.addRequired<DominatorTree>();
+      AU.addRequired<LoopInfo>();
+      AU.addRequiredID(LoopSimplifyID);
+      AU.addRequiredID(LCSSAID);
+      
+      AU.addPreserved<ScalarEvolution>();
+      AU.addPreserved<DominatorTree>();
+      AU.addPreserved<LoopInfo>();
+      AU.addPreservedID(LoopSimplifyID);
+      AU.addPreservedID(LCSSAID);
+      AU.addPreserved<DominanceFrontier>();
+    }
+  };
+}
+  
+char LoopDeletion::ID = 0;
+static RegisterPass<LoopDeletion> X("loop-deletion", "Delete dead loops");
+
+Pass* llvm::createLoopDeletionPass() {
+  return new LoopDeletion();
+}
+
+/// SingleDominatingExit - Checks that there is only a single blocks that 
+/// branches out of the loop, and that it also g the latch block.  Loops
+/// with multiple or non-latch-dominating exiting blocks could be dead, but we'd
+/// have to do more extensive analysis to make sure, for instance, that the 
+/// control flow logic involved was or could be made loop-invariant.
+bool LoopDeletion::SingleDominatingExit(Loop* L,
+                                   SmallVector<BasicBlock*, 4>& exitingBlocks) {
+  
+  if (exitingBlocks.size() != 1)
+    return false;
+  
+  BasicBlock* latch = L->getLoopLatch();
+  if (!latch)
+    return false;
+  
+  DominatorTree& DT = getAnalysis<DominatorTree>();
+  return DT.dominates(exitingBlocks[0], latch);
+}
+
+/// IsLoopInvariantInst - Checks if an instruction is invariant with respect to
+/// a loop, which is defined as being true if all of its operands are defined
+/// outside of the loop.  These instructions can be hoisted out of the loop
+/// if their results are needed.  This could be made more aggressive by
+/// recursively checking the operands for invariance, but it's not clear that
+/// it's worth it.
+bool LoopDeletion::IsLoopInvariantInst(Instruction *I, Loop* L)  {
+  // PHI nodes are not loop invariant if defined in  the loop.
+  if (isa<PHINode>(I) && L->contains(I->getParent()))
+    return false;
+    
+  // The instruction is loop invariant if all of its operands are loop-invariant
+  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+    if (!L->isLoopInvariant(I->getOperand(i)))
+      return false;
+  
+  // If we got this far, the instruction is loop invariant!
+  return true;
+}
+
+/// IsLoopDead - Determined if a loop is dead.  This assumes that we've already
+/// checked for unique exit and exiting blocks, and that the code is in LCSSA
+/// form.
+bool LoopDeletion::IsLoopDead(Loop* L,
+                              SmallVector<BasicBlock*, 4>& exitingBlocks,
+                              SmallVector<BasicBlock*, 4>& exitBlocks) {
+  BasicBlock* exitingBlock = exitingBlocks[0];
+  BasicBlock* exitBlock = exitBlocks[0];
+  
+  // Make sure that all PHI entries coming from the loop are loop invariant.
+  // Because the code is in LCSSA form, any values used outside of the loop
+  // must pass through a PHI in the exit block, meaning that this check is
+  // sufficient to guarantee that no loop-variant values are used outside
+  // of the loop.
+  BasicBlock::iterator BI = exitBlock->begin();
+  while (PHINode* P = dyn_cast<PHINode>(BI)) {
+    Value* incoming = P->getIncomingValueForBlock(exitingBlock);
+    if (Instruction* I = dyn_cast<Instruction>(incoming))
+      if (!IsLoopInvariantInst(I, L))
+        return false;
+      
+    BI++;
+  }
+  
+  // Make sure that no instructions in the block have potential side-effects.
+  // This includes instructions that could write to memory, and loads that are
+  // marked volatile.  This could be made more aggressive by using aliasing
+  // information to identify readonly and readnone calls.
+  for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
+       LI != LE; ++LI) {
+    for (BasicBlock::iterator BI = (*LI)->begin(), BE = (*LI)->end();
+         BI != BE; ++BI) {
+      if (BI->mayHaveSideEffects())
+        return false;
+    }
+  }
+  
+  return true;
+}
+
+/// runOnLoop - Remove dead loops, by which we mean loops that do not impact the
+/// observable behavior of the program other than finite running time.  Note 
+/// we do ensure that this never remove a loop that might be infinite, as doing
+/// so could change the halting/non-halting nature of a program.
+/// NOTE: This entire process relies pretty heavily on LoopSimplify and LCSSA
+/// in order to make various safety checks work.
+bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) {
+  // We can only remove the loop if there is a preheader that we can 
+  // branch from after removing it.
+  BasicBlock* preheader = L->getLoopPreheader();
+  if (!preheader)
+    return false;
+  
+  // We can't remove loops that contain subloops.  If the subloops were dead,
+  // they would already have been removed in earlier executions of this pass.
+  if (L->begin() != L->end())
+    return false;
+  
+  SmallVector<BasicBlock*, 4> exitingBlocks;
+  L->getExitingBlocks(exitingBlocks);
+  
+  SmallVector<BasicBlock*, 4> exitBlocks;
+  L->getUniqueExitBlocks(exitBlocks);
+  
+  // We require that the loop only have a single exit block.  Otherwise, we'd
+  // be in the situation of needing to be able to solve statically which exit
+  // block will be branched to, or trying to preserve the branching logic in
+  // a loop invariant manner.
+  if (exitBlocks.size() != 1)
+    return false;
+  
+  // Loops with multiple exits or exits that don't dominate the latch
+  // are too complicated to handle correctly.
+  if (!SingleDominatingExit(L, exitingBlocks))
+    return false;
+  
+  // Finally, we have to check that the loop really is dead.
+  if (!IsLoopDead(L, exitingBlocks, exitBlocks))
+    return false;
+  
+  // Don't remove loops for which we can't solve the trip count.
+  // They could be infinite, in which case we'd be changing program behavior.
+  ScalarEvolution& SE = getAnalysis<ScalarEvolution>();
+  SCEVHandle S = SE.getBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(S))
+    return false;
+  
+  // Now that we know the removal is safe, remove the loop by changing the
+  // branch from the preheader to go to the single exit block.  
+  BasicBlock* exitBlock = exitBlocks[0];
+  BasicBlock* exitingBlock = exitingBlocks[0];
+  
+  // Because we're deleting a large chunk of code at once, the sequence in which
+  // we remove things is very important to avoid invalidation issues.  Don't
+  // mess with this unless you have good reason and know what you're doing.
+  
+  // Move simple loop-invariant expressions out of the loop, since they
+  // might be needed by the exit phis.
+  for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
+       LI != LE; ++LI)
+    for (BasicBlock::iterator BI = (*LI)->begin(), BE = (*LI)->end();
+         BI != BE; ) {
+      Instruction* I = BI++;
+      if (!I->use_empty() && IsLoopInvariantInst(I, L))
+        I->moveBefore(preheader->getTerminator());
+    }
+  
+  // Connect the preheader directly to the exit block.
+  TerminatorInst* TI = preheader->getTerminator();
+  TI->replaceUsesOfWith(L->getHeader(), exitBlock);
+
+  // Rewrite phis in the exit block to get their inputs from
+  // the preheader instead of the exiting block.
+  BasicBlock::iterator BI = exitBlock->begin();
+  while (PHINode* P = dyn_cast<PHINode>(BI)) {
+    P->replaceUsesOfWith(exitingBlock, preheader);
+    BI++;
+  }
+  
+  // Update the dominator tree and remove the instructions and blocks that will
+  // be deleted from the reference counting scheme.
+  DominatorTree& DT = getAnalysis<DominatorTree>();
+  DominanceFrontier* DF = getAnalysisIfAvailable<DominanceFrontier>();
+  SmallPtrSet<DomTreeNode*, 8> ChildNodes;
+  for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
+       LI != LE; ++LI) {
+    // Move all of the block's children to be children of the preheader, which
+    // allows us to remove the domtree entry for the block.
+    ChildNodes.insert(DT[*LI]->begin(), DT[*LI]->end());
+    for (SmallPtrSet<DomTreeNode*, 8>::iterator DI = ChildNodes.begin(),
+         DE = ChildNodes.end(); DI != DE; ++DI) {
+      DT.changeImmediateDominator(*DI, DT[preheader]);
+      if (DF) DF->changeImmediateDominator((*DI)->getBlock(), preheader, &DT);
+    }
+    
+    ChildNodes.clear();
+    DT.eraseNode(*LI);
+    if (DF) DF->removeBlock(*LI);
+
+    // Remove the block from the reference counting scheme, so that we can
+    // delete it freely later.
+    (*LI)->dropAllReferences();
+  }
+  
+  // Tell ScalarEvolution that the loop is deleted. Do this before
+  // deleting the loop so that ScalarEvolution can look at the loop
+  // to determine what it needs to clean up.
+  SE.forgetLoopBackedgeTakenCount(L);
+
+  // Erase the instructions and the blocks without having to worry
+  // about ordering because we already dropped the references.
+  // NOTE: This iteration is safe because erasing the block does not remove its
+  // entry from the loop's block list.  We do that in the next section.
+  for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
+       LI != LE; ++LI)
+    (*LI)->eraseFromParent();
+
+  // Finally, the blocks from loopinfo.  This has to happen late because
+  // otherwise our loop iterators won't work.
+  LoopInfo& loopInfo = getAnalysis<LoopInfo>();
+  SmallPtrSet<BasicBlock*, 8> blocks;
+  blocks.insert(L->block_begin(), L->block_end());
+  for (SmallPtrSet<BasicBlock*,8>::iterator I = blocks.begin(),
+       E = blocks.end(); I != E; ++I)
+    loopInfo.removeBlock(*I);
+  
+  // The last step is to inform the loop pass manager that we've
+  // eliminated this loop.
+  LPM.deleteLoopFromQueue(L);
+  
+  NumDeleted++;
+  
+  return true;
+}
diff --git a/lib/Transforms/Scalar/LoopIndexSplit.cpp b/lib/Transforms/Scalar/LoopIndexSplit.cpp
new file mode 100644
index 0000000..9c78596
--- /dev/null
+++ b/lib/Transforms/Scalar/LoopIndexSplit.cpp
@@ -0,0 +1,1237 @@
+//===- LoopIndexSplit.cpp - Loop Index Splitting Pass ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements Loop Index Splitting Pass. This pass handles three
+// kinds of loops.
+//
+// [1] A loop may be eliminated if the body is executed exactly once.
+//     For example,
+//
+// for (i = 0; i < N; ++i) {
+//   if (i == X) {
+//     body;
+//   }
+// }
+//
+// is transformed to
+//
+// i = X;
+// body;
+//
+// [2] A loop's iteration space may be shrunk if the loop body is executed
+//     for a proper sub-range of the loop's iteration space. For example,
+//
+// for (i = 0; i < N; ++i) {
+//   if (i > A && i < B) {
+//     ...
+//   }
+// }
+//
+// is transformed to iterators from A to B, if A > 0 and B < N.
+//
+// [3] A loop may be split if the loop body is dominated by a branch.
+//     For example,
+//
+// for (i = LB; i < UB; ++i) { if (i < SV) A; else B; }
+//
+// is transformed into
+//
+// AEV = BSV = SV
+// for (i = LB; i < min(UB, AEV); ++i)
+//    A;
+// for (i = max(LB, BSV); i < UB; ++i);
+//    B;
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-index-split"
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Statistic.h"
+
+using namespace llvm;
+
+STATISTIC(NumIndexSplit, "Number of loop index split");
+STATISTIC(NumIndexSplitRemoved, "Number of loops eliminated by loop index split");
+STATISTIC(NumRestrictBounds, "Number of loop iteration space restricted");
+
+namespace {
+
+  class VISIBILITY_HIDDEN LoopIndexSplit : public LoopPass {
+
+  public:
+    static char ID; // Pass ID, replacement for typeid
+    LoopIndexSplit() : LoopPass(&ID) {}
+
+    // Index split Loop L. Return true if loop is split.
+    bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addPreserved<ScalarEvolution>();
+      AU.addRequiredID(LCSSAID);
+      AU.addPreservedID(LCSSAID);
+      AU.addRequired<LoopInfo>();
+      AU.addPreserved<LoopInfo>();
+      AU.addRequiredID(LoopSimplifyID);
+      AU.addPreservedID(LoopSimplifyID);
+      AU.addRequired<DominatorTree>();
+      AU.addRequired<DominanceFrontier>();
+      AU.addPreserved<DominatorTree>();
+      AU.addPreserved<DominanceFrontier>();
+    }
+
+  private:
+    /// processOneIterationLoop -- Eliminate loop if loop body is executed 
+    /// only once. For example,
+    /// for (i = 0; i < N; ++i) {
+    ///   if ( i == X) {
+    ///     ...
+    ///   }
+    /// }
+    ///
+    bool processOneIterationLoop();
+
+    // -- Routines used by updateLoopIterationSpace();
+
+    /// updateLoopIterationSpace -- Update loop's iteration space if loop 
+    /// body is executed for certain IV range only. For example,
+    /// 
+    /// for (i = 0; i < N; ++i) {
+    ///   if ( i > A && i < B) {
+    ///     ...
+    ///   }
+    /// }
+    /// is transformed to iterators from A to B, if A > 0 and B < N.
+    ///
+    bool updateLoopIterationSpace();
+
+    /// restrictLoopBound - Op dominates loop body. Op compares an IV based value
+    /// with a loop invariant value. Update loop's lower and upper bound based on
+    /// the loop invariant value.
+    bool restrictLoopBound(ICmpInst &Op);
+
+    // --- Routines used by splitLoop(). --- /
+
+    bool splitLoop();
+
+    /// removeBlocks - Remove basic block DeadBB and all blocks dominated by 
+    /// DeadBB. This routine is used to remove split condition's dead branch, 
+    /// dominated by DeadBB. LiveBB dominates split conidition's other branch.
+    void removeBlocks(BasicBlock *DeadBB, Loop *LP, BasicBlock *LiveBB);
+    
+    /// moveExitCondition - Move exit condition EC into split condition block.
+    void moveExitCondition(BasicBlock *CondBB, BasicBlock *ActiveBB,
+                           BasicBlock *ExitBB, ICmpInst *EC, ICmpInst *SC,
+                           PHINode *IV, Instruction *IVAdd, Loop *LP,
+                           unsigned);
+    
+    /// updatePHINodes - CFG has been changed. 
+    /// Before 
+    ///   - ExitBB's single predecessor was Latch
+    ///   - Latch's second successor was Header
+    /// Now
+    ///   - ExitBB's single predecessor was Header
+    ///   - Latch's one and only successor was Header
+    ///
+    /// Update ExitBB PHINodes' to reflect this change.
+    void updatePHINodes(BasicBlock *ExitBB, BasicBlock *Latch, 
+                        BasicBlock *Header,
+                        PHINode *IV, Instruction *IVIncrement, Loop *LP);
+
+    // --- Utility routines --- /
+
+    /// cleanBlock - A block is considered clean if all non terminal 
+    /// instructions are either PHINodes or IV based values.
+    bool cleanBlock(BasicBlock *BB);
+
+    /// IVisLT - If Op is comparing IV based value with an loop invariant and 
+    /// IV based value is less than  the loop invariant then return the loop 
+    /// invariant. Otherwise return NULL.
+    Value * IVisLT(ICmpInst &Op);
+
+    /// IVisLE - If Op is comparing IV based value with an loop invariant and 
+    /// IV based value is less than or equal to the loop invariant then 
+    /// return the loop invariant. Otherwise return NULL.
+    Value * IVisLE(ICmpInst &Op);
+
+    /// IVisGT - If Op is comparing IV based value with an loop invariant and 
+    /// IV based value is greater than  the loop invariant then return the loop 
+    /// invariant. Otherwise return NULL.
+    Value * IVisGT(ICmpInst &Op);
+
+    /// IVisGE - If Op is comparing IV based value with an loop invariant and 
+    /// IV based value is greater than or equal to the loop invariant then 
+    /// return the loop invariant. Otherwise return NULL.
+    Value * IVisGE(ICmpInst &Op);
+
+  private:
+
+    // Current Loop information.
+    Loop *L;
+    LPPassManager *LPM;
+    LoopInfo *LI;
+    DominatorTree *DT;
+    DominanceFrontier *DF;
+
+    PHINode *IndVar;
+    ICmpInst *ExitCondition;
+    ICmpInst *SplitCondition;
+    Value *IVStartValue;
+    Value *IVExitValue;
+    Instruction *IVIncrement;
+    SmallPtrSet<Value *, 4> IVBasedValues;
+  };
+}
+
+char LoopIndexSplit::ID = 0;
+static RegisterPass<LoopIndexSplit>
+X("loop-index-split", "Index Split Loops");
+
+Pass *llvm::createLoopIndexSplitPass() {
+  return new LoopIndexSplit();
+}
+
+// Index split Loop L. Return true if loop is split.
+bool LoopIndexSplit::runOnLoop(Loop *IncomingLoop, LPPassManager &LPM_Ref) {
+  L = IncomingLoop;
+  LPM = &LPM_Ref;
+
+  // FIXME - Nested loops make dominator info updates tricky. 
+  if (!L->getSubLoops().empty())
+    return false;
+
+  DT = &getAnalysis<DominatorTree>();
+  LI = &getAnalysis<LoopInfo>();
+  DF = &getAnalysis<DominanceFrontier>();
+
+  // Initialize loop data.
+  IndVar = L->getCanonicalInductionVariable();
+  if (!IndVar) return false;
+
+  bool P1InLoop = L->contains(IndVar->getIncomingBlock(1));
+  IVStartValue = IndVar->getIncomingValue(!P1InLoop);
+  IVIncrement = dyn_cast<Instruction>(IndVar->getIncomingValue(P1InLoop));
+  if (!IVIncrement) return false;
+  
+  IVBasedValues.clear();
+  IVBasedValues.insert(IndVar);
+  IVBasedValues.insert(IVIncrement);
+  for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+       I != E; ++I) 
+    for(BasicBlock::iterator BI = (*I)->begin(), BE = (*I)->end(); 
+        BI != BE; ++BI) {
+      if (BinaryOperator *BO = dyn_cast<BinaryOperator>(BI)) 
+        if (BO != IVIncrement 
+            && (BO->getOpcode() == Instruction::Add
+                || BO->getOpcode() == Instruction::Sub))
+          if (IVBasedValues.count(BO->getOperand(0))
+              && L->isLoopInvariant(BO->getOperand(1)))
+            IVBasedValues.insert(BO);
+    }
+
+  // Reject loop if loop exit condition is not suitable.
+  BasicBlock *ExitingBlock = L->getExitingBlock();
+  if (!ExitingBlock)
+    return false;
+  BranchInst *EBR = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
+  if (!EBR) return false;
+  ExitCondition = dyn_cast<ICmpInst>(EBR->getCondition());
+  if (!ExitCondition) return false;
+  if (ExitingBlock != L->getLoopLatch()) return false;
+  IVExitValue = ExitCondition->getOperand(1);
+  if (!L->isLoopInvariant(IVExitValue))
+    IVExitValue = ExitCondition->getOperand(0);
+  if (!L->isLoopInvariant(IVExitValue))
+    return false;
+
+  // If start value is more then exit value where induction variable
+  // increments by 1 then we are potentially dealing with an infinite loop.
+  // Do not index split this loop.
+  if (ConstantInt *SV = dyn_cast<ConstantInt>(IVStartValue))
+    if (ConstantInt *EV = dyn_cast<ConstantInt>(IVExitValue))
+      if (SV->getSExtValue() > EV->getSExtValue())
+        return false;
+
+  if (processOneIterationLoop())
+    return true;
+
+  if (updateLoopIterationSpace())
+    return true;
+
+  if (splitLoop())
+    return true;
+
+  return false;
+}
+
+// --- Helper routines --- 
+// isUsedOutsideLoop - Returns true iff V is used outside the loop L.
+static bool isUsedOutsideLoop(Value *V, Loop *L) {
+  for(Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; ++UI)
+    if (!L->contains(cast<Instruction>(*UI)->getParent()))
+      return true;
+  return false;
+}
+
+// Return V+1
+static Value *getPlusOne(Value *V, bool Sign, Instruction *InsertPt) {
+  ConstantInt *One = ConstantInt::get(V->getType(), 1, Sign);
+  return BinaryOperator::CreateAdd(V, One, "lsp", InsertPt);
+}
+
+// Return V-1
+static Value *getMinusOne(Value *V, bool Sign, Instruction *InsertPt) {
+  ConstantInt *One = ConstantInt::get(V->getType(), 1, Sign);
+  return BinaryOperator::CreateSub(V, One, "lsp", InsertPt);
+}
+
+// Return min(V1, V1)
+static Value *getMin(Value *V1, Value *V2, bool Sign, Instruction *InsertPt) {
+ 
+  Value *C = new ICmpInst(Sign ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
+                          V1, V2, "lsp", InsertPt);
+  return SelectInst::Create(C, V1, V2, "lsp", InsertPt);
+}
+
+// Return max(V1, V2)
+static Value *getMax(Value *V1, Value *V2, bool Sign, Instruction *InsertPt) {
+ 
+  Value *C = new ICmpInst(Sign ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
+                          V1, V2, "lsp", InsertPt);
+  return SelectInst::Create(C, V2, V1, "lsp", InsertPt);
+}
+
+/// processOneIterationLoop -- Eliminate loop if loop body is executed 
+/// only once. For example,
+/// for (i = 0; i < N; ++i) {
+///   if ( i == X) {
+///     ...
+///   }
+/// }
+///
+bool LoopIndexSplit::processOneIterationLoop() {
+  SplitCondition = NULL;
+  BasicBlock *Latch = L->getLoopLatch();
+  BasicBlock *Header = L->getHeader();
+  BranchInst *BR = dyn_cast<BranchInst>(Header->getTerminator());
+  if (!BR) return false;
+  if (!isa<BranchInst>(Latch->getTerminator())) return false;
+  if (BR->isUnconditional()) return false;
+  SplitCondition = dyn_cast<ICmpInst>(BR->getCondition());
+  if (!SplitCondition) return false;
+  if (SplitCondition == ExitCondition) return false;
+  if (SplitCondition->getPredicate() != ICmpInst::ICMP_EQ) return false;
+  if (BR->getOperand(1) != Latch) return false;
+  if (!IVBasedValues.count(SplitCondition->getOperand(0))
+      && !IVBasedValues.count(SplitCondition->getOperand(1)))
+    return false;
+
+  // If IV is used outside the loop then this loop traversal is required.
+  // FIXME: Calculate and use last IV value. 
+  if (isUsedOutsideLoop(IVIncrement, L))
+    return false;
+
+  // If BR operands are not IV or not loop invariants then skip this loop.
+  Value *OPV = SplitCondition->getOperand(0);
+  Value *SplitValue = SplitCondition->getOperand(1);
+  if (!L->isLoopInvariant(SplitValue))
+    std::swap(OPV, SplitValue);
+  if (!L->isLoopInvariant(SplitValue))
+    return false;
+  Instruction *OPI = dyn_cast<Instruction>(OPV);
+  if (!OPI) 
+    return false;
+  if (OPI->getParent() != Header || isUsedOutsideLoop(OPI, L))
+    return false;
+  Value *StartValue = IVStartValue;
+  Value *ExitValue = IVExitValue;;
+
+  if (OPV != IndVar) {
+    // If BR operand is IV based then use this operand to calculate
+    // effective conditions for loop body.
+    BinaryOperator *BOPV = dyn_cast<BinaryOperator>(OPV);
+    if (!BOPV) 
+      return false;
+    if (BOPV->getOpcode() != Instruction::Add) 
+      return false;
+    StartValue = BinaryOperator::CreateAdd(OPV, StartValue, "" , BR);
+    ExitValue = BinaryOperator::CreateAdd(OPV, ExitValue, "" , BR);
+  }
+
+  if (!cleanBlock(Header))
+    return false;
+
+  if (!cleanBlock(Latch))
+    return false;
+    
+  // If the merge point for BR is not loop latch then skip this loop.
+  if (BR->getSuccessor(0) != Latch) {
+    DominanceFrontier::iterator DF0 = DF->find(BR->getSuccessor(0));
+    assert (DF0 != DF->end() && "Unable to find dominance frontier");
+    if (!DF0->second.count(Latch))
+      return false;
+  }
+  
+  if (BR->getSuccessor(1) != Latch) {
+    DominanceFrontier::iterator DF1 = DF->find(BR->getSuccessor(1));
+    assert (DF1 != DF->end() && "Unable to find dominance frontier");
+    if (!DF1->second.count(Latch))
+      return false;
+  }
+    
+  // Now, Current loop L contains compare instruction
+  // that compares induction variable, IndVar, against loop invariant. And
+  // entire (i.e. meaningful) loop body is dominated by this compare
+  // instruction. In such case eliminate 
+  // loop structure surrounding this loop body. For example,
+  //     for (int i = start; i < end; ++i) {
+  //         if ( i == somevalue) {
+  //           loop_body
+  //         }
+  //     }
+  // can be transformed into
+  //     if (somevalue >= start && somevalue < end) {
+  //        i = somevalue;
+  //        loop_body
+  //     }
+
+  // Replace index variable with split value in loop body. Loop body is executed
+  // only when index variable is equal to split value.
+  IndVar->replaceAllUsesWith(SplitValue);
+
+  // Replace split condition in header.
+  // Transform 
+  //      SplitCondition : icmp eq i32 IndVar, SplitValue
+  // into
+  //      c1 = icmp uge i32 SplitValue, StartValue
+  //      c2 = icmp ult i32 SplitValue, ExitValue
+  //      and i32 c1, c2 
+  Instruction *C1 = new ICmpInst(ExitCondition->isSignedPredicate() ? 
+                                 ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE,
+                                 SplitValue, StartValue, "lisplit", BR);
+
+  CmpInst::Predicate C2P  = ExitCondition->getPredicate();
+  BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
+  if (LatchBR->getOperand(0) != Header)
+    C2P = CmpInst::getInversePredicate(C2P);
+  Instruction *C2 = new ICmpInst(C2P, SplitValue, ExitValue, "lisplit", BR);
+  Instruction *NSplitCond = BinaryOperator::CreateAnd(C1, C2, "lisplit", BR);
+
+  SplitCondition->replaceAllUsesWith(NSplitCond);
+  SplitCondition->eraseFromParent();
+
+  // Remove Latch to Header edge.
+  BasicBlock *LatchSucc = NULL;
+  Header->removePredecessor(Latch);
+  for (succ_iterator SI = succ_begin(Latch), E = succ_end(Latch);
+       SI != E; ++SI) {
+    if (Header != *SI)
+      LatchSucc = *SI;
+  }
+
+  // Clean up latch block.
+  Value *LatchBRCond = LatchBR->getCondition();
+  LatchBR->setUnconditionalDest(LatchSucc);
+  RecursivelyDeleteTriviallyDeadInstructions(LatchBRCond);
+  
+  LPM->deleteLoopFromQueue(L);
+
+  // Update Dominator Info.
+  // Only CFG change done is to remove Latch to Header edge. This
+  // does not change dominator tree because Latch did not dominate
+  // Header.
+  if (DF) {
+    DominanceFrontier::iterator HeaderDF = DF->find(Header);
+    if (HeaderDF != DF->end()) 
+      DF->removeFromFrontier(HeaderDF, Header);
+
+    DominanceFrontier::iterator LatchDF = DF->find(Latch);
+    if (LatchDF != DF->end()) 
+      DF->removeFromFrontier(LatchDF, Header);
+  }
+
+  ++NumIndexSplitRemoved;
+  return true;
+}
+
+/// restrictLoopBound - Op dominates loop body. Op compares an IV based value 
+/// with a loop invariant value. Update loop's lower and upper bound based on 
+/// the loop invariant value.
+bool LoopIndexSplit::restrictLoopBound(ICmpInst &Op) {
+  bool Sign = Op.isSignedPredicate();
+  Instruction *PHTerm = L->getLoopPreheader()->getTerminator();
+
+  if (IVisGT(*ExitCondition) || IVisGE(*ExitCondition)) {
+    BranchInst *EBR = 
+      cast<BranchInst>(ExitCondition->getParent()->getTerminator());
+    ExitCondition->setPredicate(ExitCondition->getInversePredicate());
+    BasicBlock *T = EBR->getSuccessor(0);
+    EBR->setSuccessor(0, EBR->getSuccessor(1));
+    EBR->setSuccessor(1, T);
+  }
+
+  // New upper and lower bounds.
+  Value *NLB = NULL;
+  Value *NUB = NULL;
+  if (Value *V = IVisLT(Op)) {
+    // Restrict upper bound.
+    if (IVisLE(*ExitCondition)) 
+      V = getMinusOne(V, Sign, PHTerm);
+    NUB = getMin(V, IVExitValue, Sign, PHTerm);
+  } else if (Value *V = IVisLE(Op)) {
+    // Restrict upper bound.
+    if (IVisLT(*ExitCondition)) 
+      V = getPlusOne(V, Sign, PHTerm);
+    NUB = getMin(V, IVExitValue, Sign, PHTerm);
+  } else if (Value *V = IVisGT(Op)) {
+    // Restrict lower bound.
+    V = getPlusOne(V, Sign, PHTerm);
+    NLB = getMax(V, IVStartValue, Sign, PHTerm);
+  } else if (Value *V = IVisGE(Op))
+    // Restrict lower bound.
+    NLB = getMax(V, IVStartValue, Sign, PHTerm);
+
+  if (!NLB && !NUB) 
+    return false;
+
+  if (NLB) {
+    unsigned i = IndVar->getBasicBlockIndex(L->getLoopPreheader());
+    IndVar->setIncomingValue(i, NLB);
+  }
+
+  if (NUB) {
+    unsigned i = (ExitCondition->getOperand(0) != IVExitValue);
+    ExitCondition->setOperand(i, NUB);
+  }
+  return true;
+}
+
+/// updateLoopIterationSpace -- Update loop's iteration space if loop 
+/// body is executed for certain IV range only. For example,
+/// 
+/// for (i = 0; i < N; ++i) {
+///   if ( i > A && i < B) {
+///     ...
+///   }
+/// }
+/// is transformed to iterators from A to B, if A > 0 and B < N.
+///
+bool LoopIndexSplit::updateLoopIterationSpace() {
+  SplitCondition = NULL;
+  if (ExitCondition->getPredicate() == ICmpInst::ICMP_NE
+      || ExitCondition->getPredicate() == ICmpInst::ICMP_EQ)
+    return false;
+  BasicBlock *Latch = L->getLoopLatch();
+  BasicBlock *Header = L->getHeader();
+  BranchInst *BR = dyn_cast<BranchInst>(Header->getTerminator());
+  if (!BR) return false;
+  if (!isa<BranchInst>(Latch->getTerminator())) return false;
+  if (BR->isUnconditional()) return false;
+  BinaryOperator *AND = dyn_cast<BinaryOperator>(BR->getCondition());
+  if (!AND) return false;
+  if (AND->getOpcode() != Instruction::And) return false;
+  ICmpInst *Op0 = dyn_cast<ICmpInst>(AND->getOperand(0));
+  ICmpInst *Op1 = dyn_cast<ICmpInst>(AND->getOperand(1));
+  if (!Op0 || !Op1)
+    return false;
+  IVBasedValues.insert(AND);
+  IVBasedValues.insert(Op0);
+  IVBasedValues.insert(Op1);
+  if (!cleanBlock(Header)) return false;
+  BasicBlock *ExitingBlock = ExitCondition->getParent();
+  if (!cleanBlock(ExitingBlock)) return false;
+
+  // If the merge point for BR is not loop latch then skip this loop.
+  if (BR->getSuccessor(0) != Latch) {
+    DominanceFrontier::iterator DF0 = DF->find(BR->getSuccessor(0));
+    assert (DF0 != DF->end() && "Unable to find dominance frontier");
+    if (!DF0->second.count(Latch))
+      return false;
+  }
+  
+  if (BR->getSuccessor(1) != Latch) {
+    DominanceFrontier::iterator DF1 = DF->find(BR->getSuccessor(1));
+    assert (DF1 != DF->end() && "Unable to find dominance frontier");
+    if (!DF1->second.count(Latch))
+      return false;
+  }
+    
+  // Verify that loop exiting block has only two predecessor, where one pred
+  // is split condition block. The other predecessor will become exiting block's
+  // dominator after CFG is updated. TODO : Handle CFG's where exiting block has
+  // more then two predecessors. This requires extra work in updating dominator
+  // information.
+  BasicBlock *ExitingBBPred = NULL;
+  for (pred_iterator PI = pred_begin(ExitingBlock), PE = pred_end(ExitingBlock);
+       PI != PE; ++PI) {
+    BasicBlock *BB = *PI;
+    if (Header == BB)
+      continue;
+    if (ExitingBBPred)
+      return false;
+    else
+      ExitingBBPred = BB;
+  }
+
+  if (!restrictLoopBound(*Op0))
+    return false;
+
+  if (!restrictLoopBound(*Op1))
+    return false;
+
+  // Update CFG.
+  if (BR->getSuccessor(0) == ExitingBlock)
+    BR->setUnconditionalDest(BR->getSuccessor(1));
+  else
+    BR->setUnconditionalDest(BR->getSuccessor(0));
+
+  AND->eraseFromParent();
+  if (Op0->use_empty())
+    Op0->eraseFromParent();
+  if (Op1->use_empty())
+    Op1->eraseFromParent();
+
+  // Update domiantor info. Now, ExitingBlock has only one predecessor, 
+  // ExitingBBPred, and it is ExitingBlock's immediate domiantor.
+  DT->changeImmediateDominator(ExitingBlock, ExitingBBPred);
+
+  BasicBlock *ExitBlock = ExitingBlock->getTerminator()->getSuccessor(1);
+  if (L->contains(ExitBlock))
+    ExitBlock = ExitingBlock->getTerminator()->getSuccessor(0);
+
+  // If ExitingBlock is a member of the loop basic blocks' DF list then
+  // replace ExitingBlock with header and exit block in the DF list
+  DominanceFrontier::iterator ExitingBlockDF = DF->find(ExitingBlock);
+  for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+       I != E; ++I) {
+    BasicBlock *BB = *I;
+    if (BB == Header || BB == ExitingBlock)
+      continue;
+    DominanceFrontier::iterator BBDF = DF->find(BB);
+    DominanceFrontier::DomSetType::iterator DomSetI = BBDF->second.begin();
+    DominanceFrontier::DomSetType::iterator DomSetE = BBDF->second.end();
+    while (DomSetI != DomSetE) {
+      DominanceFrontier::DomSetType::iterator CurrentItr = DomSetI;
+      ++DomSetI;
+      BasicBlock *DFBB = *CurrentItr;
+      if (DFBB == ExitingBlock) {
+        BBDF->second.erase(DFBB);
+        for (DominanceFrontier::DomSetType::iterator 
+               EBI = ExitingBlockDF->second.begin(),
+               EBE = ExitingBlockDF->second.end(); EBI != EBE; ++EBI) 
+          BBDF->second.insert(*EBI);
+      }
+    }
+  }
+  NumRestrictBounds++;
+  return true;
+}
+
+/// removeBlocks - Remove basic block DeadBB and all blocks dominated by DeadBB.
+/// This routine is used to remove split condition's dead branch, dominated by
+/// DeadBB. LiveBB dominates split conidition's other branch.
+void LoopIndexSplit::removeBlocks(BasicBlock *DeadBB, Loop *LP, 
+                                  BasicBlock *LiveBB) {
+
+  // First update DeadBB's dominance frontier. 
+  SmallVector<BasicBlock *, 8> FrontierBBs;
+  DominanceFrontier::iterator DeadBBDF = DF->find(DeadBB);
+  if (DeadBBDF != DF->end()) {
+    SmallVector<BasicBlock *, 8> PredBlocks;
+    
+    DominanceFrontier::DomSetType DeadBBSet = DeadBBDF->second;
+    for (DominanceFrontier::DomSetType::iterator DeadBBSetI = DeadBBSet.begin(),
+           DeadBBSetE = DeadBBSet.end(); DeadBBSetI != DeadBBSetE; ++DeadBBSetI) 
+      {
+      BasicBlock *FrontierBB = *DeadBBSetI;
+      FrontierBBs.push_back(FrontierBB);
+
+      // Rremove any PHI incoming edge from blocks dominated by DeadBB.
+      PredBlocks.clear();
+      for(pred_iterator PI = pred_begin(FrontierBB), PE = pred_end(FrontierBB);
+          PI != PE; ++PI) {
+        BasicBlock *P = *PI;
+        if (P == DeadBB || DT->dominates(DeadBB, P))
+          PredBlocks.push_back(P);
+      }
+
+      for(BasicBlock::iterator FBI = FrontierBB->begin(), FBE = FrontierBB->end();
+          FBI != FBE; ++FBI) {
+        if (PHINode *PN = dyn_cast<PHINode>(FBI)) {
+          for(SmallVector<BasicBlock *, 8>::iterator PI = PredBlocks.begin(),
+                PE = PredBlocks.end(); PI != PE; ++PI) {
+            BasicBlock *P = *PI;
+            PN->removeIncomingValue(P);
+          }
+        }
+        else
+          break;
+      }      
+    }
+  }
+  
+  // Now remove DeadBB and all nodes dominated by DeadBB in df order.
+  SmallVector<BasicBlock *, 32> WorkList;
+  DomTreeNode *DN = DT->getNode(DeadBB);
+  for (df_iterator<DomTreeNode*> DI = df_begin(DN),
+         E = df_end(DN); DI != E; ++DI) {
+    BasicBlock *BB = DI->getBlock();
+    WorkList.push_back(BB);
+    BB->replaceAllUsesWith(UndefValue::get(Type::LabelTy));
+  }
+
+  while (!WorkList.empty()) {
+    BasicBlock *BB = WorkList.back(); WorkList.pop_back();
+    LPM->deleteSimpleAnalysisValue(BB, LP);
+    for(BasicBlock::iterator BBI = BB->begin(), BBE = BB->end(); 
+        BBI != BBE; ) {
+      Instruction *I = BBI;
+      ++BBI;
+      I->replaceAllUsesWith(UndefValue::get(I->getType()));
+      LPM->deleteSimpleAnalysisValue(I, LP);
+      I->eraseFromParent();
+    }
+    DT->eraseNode(BB);
+    DF->removeBlock(BB);
+    LI->removeBlock(BB);
+    BB->eraseFromParent();
+  }
+
+  // Update Frontier BBs' dominator info.
+  while (!FrontierBBs.empty()) {
+    BasicBlock *FBB = FrontierBBs.back(); FrontierBBs.pop_back();
+    BasicBlock *NewDominator = FBB->getSinglePredecessor();
+    if (!NewDominator) {
+      pred_iterator PI = pred_begin(FBB), PE = pred_end(FBB);
+      NewDominator = *PI;
+      ++PI;
+      if (NewDominator != LiveBB) {
+        for(; PI != PE; ++PI) {
+          BasicBlock *P = *PI;
+          if (P == LiveBB) {
+            NewDominator = LiveBB;
+            break;
+          }
+          NewDominator = DT->findNearestCommonDominator(NewDominator, P);
+        }
+      }
+    }
+    assert (NewDominator && "Unable to fix dominator info.");
+    DT->changeImmediateDominator(FBB, NewDominator);
+    DF->changeImmediateDominator(FBB, NewDominator, DT);
+  }
+
+}
+
+// moveExitCondition - Move exit condition EC into split condition block CondBB.
+void LoopIndexSplit::moveExitCondition(BasicBlock *CondBB, BasicBlock *ActiveBB,
+                                       BasicBlock *ExitBB, ICmpInst *EC, 
+                                       ICmpInst *SC, PHINode *IV, 
+                                       Instruction *IVAdd, Loop *LP,
+                                       unsigned ExitValueNum) {
+
+  BasicBlock *ExitingBB = EC->getParent();
+  Instruction *CurrentBR = CondBB->getTerminator();
+
+  // Move exit condition into split condition block.
+  EC->moveBefore(CurrentBR);
+  EC->setOperand(ExitValueNum == 0 ? 1 : 0, IV);
+
+  // Move exiting block's branch into split condition block. Update its branch
+  // destination.
+  BranchInst *ExitingBR = cast<BranchInst>(ExitingBB->getTerminator());
+  ExitingBR->moveBefore(CurrentBR);
+  BasicBlock *OrigDestBB = NULL;
+  if (ExitingBR->getSuccessor(0) == ExitBB) {
+    OrigDestBB = ExitingBR->getSuccessor(1);
+    ExitingBR->setSuccessor(1, ActiveBB);
+  }
+  else {
+    OrigDestBB = ExitingBR->getSuccessor(0);
+    ExitingBR->setSuccessor(0, ActiveBB);
+  }
+    
+  // Remove split condition and current split condition branch.
+  SC->eraseFromParent();
+  CurrentBR->eraseFromParent();
+
+  // Connect exiting block to original destination.
+  BranchInst::Create(OrigDestBB, ExitingBB);
+
+  // Update PHINodes
+  updatePHINodes(ExitBB, ExitingBB, CondBB, IV, IVAdd, LP);
+
+  // Fix dominator info.
+  // ExitBB is now dominated by CondBB
+  DT->changeImmediateDominator(ExitBB, CondBB);
+  DF->changeImmediateDominator(ExitBB, CondBB, DT);
+
+  // Blocks outside the loop may have been in the dominance frontier of blocks
+  // inside the condition; this is now impossible because the blocks inside the
+  // condition no loger dominate the exit.  Remove the relevant blocks from
+  // the dominance frontiers.
+  for (Loop::block_iterator I = LP->block_begin(), E = LP->block_end();
+       I != E; ++I) {
+    if (*I == CondBB || !DT->dominates(CondBB, *I)) continue;
+    DominanceFrontier::iterator BBDF = DF->find(*I);
+    DominanceFrontier::DomSetType::iterator DomSetI = BBDF->second.begin();
+    DominanceFrontier::DomSetType::iterator DomSetE = BBDF->second.end();
+    while (DomSetI != DomSetE) {
+      DominanceFrontier::DomSetType::iterator CurrentItr = DomSetI;
+      ++DomSetI;
+      BasicBlock *DFBB = *CurrentItr;
+      if (!LP->contains(DFBB))
+        BBDF->second.erase(DFBB);
+    }
+  }
+}
+
+/// updatePHINodes - CFG has been changed. 
+/// Before 
+///   - ExitBB's single predecessor was Latch
+///   - Latch's second successor was Header
+/// Now
+///   - ExitBB's single predecessor is Header
+///   - Latch's one and only successor is Header
+///
+/// Update ExitBB PHINodes' to reflect this change.
+void LoopIndexSplit::updatePHINodes(BasicBlock *ExitBB, BasicBlock *Latch, 
+                                    BasicBlock *Header,
+                                    PHINode *IV, Instruction *IVIncrement,
+                                    Loop *LP) {
+
+  for (BasicBlock::iterator BI = ExitBB->begin(), BE = ExitBB->end(); 
+       BI != BE; ) {
+    PHINode *PN = dyn_cast<PHINode>(BI);
+    ++BI;
+    if (!PN)
+      break;
+
+    Value *V = PN->getIncomingValueForBlock(Latch);
+    if (PHINode *PHV = dyn_cast<PHINode>(V)) {
+      // PHV is in Latch. PHV has one use is in ExitBB PHINode. And one use
+      // in Header which is new incoming value for PN.
+      Value *NewV = NULL;
+      for (Value::use_iterator UI = PHV->use_begin(), E = PHV->use_end(); 
+           UI != E; ++UI) 
+        if (PHINode *U = dyn_cast<PHINode>(*UI)) 
+          if (LP->contains(U->getParent())) {
+            NewV = U;
+            break;
+          }
+
+      // Add incoming value from header only if PN has any use inside the loop.
+      if (NewV)
+        PN->addIncoming(NewV, Header);
+
+    } else if (Instruction *PHI = dyn_cast<Instruction>(V)) {
+      // If this instruction is IVIncrement then IV is new incoming value 
+      // from header otherwise this instruction must be incoming value from 
+      // header because loop is in LCSSA form.
+      if (PHI == IVIncrement)
+        PN->addIncoming(IV, Header);
+      else
+        PN->addIncoming(V, Header);
+    } else
+      // Otherwise this is an incoming value from header because loop is in 
+      // LCSSA form.
+      PN->addIncoming(V, Header);
+    
+    // Remove incoming value from Latch.
+    PN->removeIncomingValue(Latch);
+  }
+}
+
+bool LoopIndexSplit::splitLoop() {
+  SplitCondition = NULL;
+  if (ExitCondition->getPredicate() == ICmpInst::ICMP_NE
+      || ExitCondition->getPredicate() == ICmpInst::ICMP_EQ)
+    return false;
+  BasicBlock *Header = L->getHeader();
+  BasicBlock *Latch = L->getLoopLatch();
+  BranchInst *SBR = NULL; // Split Condition Branch
+  BranchInst *EBR = cast<BranchInst>(ExitCondition->getParent()->getTerminator());
+  // If Exiting block includes loop variant instructions then this
+  // loop may not be split safely.
+  BasicBlock *ExitingBlock = ExitCondition->getParent();
+  if (!cleanBlock(ExitingBlock)) return false;
+
+  for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+       I != E; ++I) {
+    BranchInst *BR = dyn_cast<BranchInst>((*I)->getTerminator());
+    if (!BR || BR->isUnconditional()) continue;
+    ICmpInst *CI = dyn_cast<ICmpInst>(BR->getCondition());
+    if (!CI || CI == ExitCondition 
+        || CI->getPredicate() == ICmpInst::ICMP_NE
+        || CI->getPredicate() == ICmpInst::ICMP_EQ)
+      continue;
+
+    // Unable to handle triangle loops at the moment.
+    // In triangle loop, split condition is in header and one of the
+    // the split destination is loop latch. If split condition is EQ
+    // then such loops are already handle in processOneIterationLoop().
+    if (Header == (*I)
+        && (Latch == BR->getSuccessor(0) || Latch == BR->getSuccessor(1)))
+      continue;
+
+    // If the block does not dominate the latch then this is not a diamond.
+    // Such loop may not benefit from index split.
+    if (!DT->dominates((*I), Latch))
+      continue;
+
+    // If split condition branches heads do not have single predecessor, 
+    // SplitCondBlock, then is not possible to remove inactive branch.
+    if (!BR->getSuccessor(0)->getSinglePredecessor() 
+        || !BR->getSuccessor(1)->getSinglePredecessor())
+      return false;
+
+    // If the merge point for BR is not loop latch then skip this condition.
+    if (BR->getSuccessor(0) != Latch) {
+      DominanceFrontier::iterator DF0 = DF->find(BR->getSuccessor(0));
+      assert (DF0 != DF->end() && "Unable to find dominance frontier");
+      if (!DF0->second.count(Latch))
+        continue;
+    }
+    
+    if (BR->getSuccessor(1) != Latch) {
+      DominanceFrontier::iterator DF1 = DF->find(BR->getSuccessor(1));
+      assert (DF1 != DF->end() && "Unable to find dominance frontier");
+      if (!DF1->second.count(Latch))
+        continue;
+    }
+    SplitCondition = CI;
+    SBR = BR;
+    break;
+  }
+   
+  if (!SplitCondition)
+    return false;
+
+  // If the predicate sign does not match then skip.
+  if (ExitCondition->isSignedPredicate() != SplitCondition->isSignedPredicate())
+    return false;
+
+  unsigned EVOpNum = (ExitCondition->getOperand(1) == IVExitValue);
+  unsigned SVOpNum = IVBasedValues.count(SplitCondition->getOperand(0));
+  Value *SplitValue = SplitCondition->getOperand(SVOpNum);
+  if (!L->isLoopInvariant(SplitValue))
+    return false;
+  if (!IVBasedValues.count(SplitCondition->getOperand(!SVOpNum)))
+    return false;
+
+  // Normalize loop conditions so that it is easier to calculate new loop
+  // bounds.
+  if (IVisGT(*ExitCondition) || IVisGE(*ExitCondition)) {
+    ExitCondition->setPredicate(ExitCondition->getInversePredicate());
+    BasicBlock *T = EBR->getSuccessor(0);
+    EBR->setSuccessor(0, EBR->getSuccessor(1));
+    EBR->setSuccessor(1, T);
+  }
+
+  if (IVisGT(*SplitCondition) || IVisGE(*SplitCondition)) {
+    SplitCondition->setPredicate(SplitCondition->getInversePredicate());
+    BasicBlock *T = SBR->getSuccessor(0);
+    SBR->setSuccessor(0, SBR->getSuccessor(1));
+    SBR->setSuccessor(1, T);
+  }
+
+  //[*] Calculate new loop bounds.
+  Value *AEV = SplitValue;
+  Value *BSV = SplitValue;
+  bool Sign = SplitCondition->isSignedPredicate();
+  Instruction *PHTerm = L->getLoopPreheader()->getTerminator();
+
+  if (IVisLT(*ExitCondition)) {
+    if (IVisLT(*SplitCondition)) {
+      /* Do nothing */
+    }
+    else if (IVisLE(*SplitCondition)) {
+      AEV = getPlusOne(SplitValue, Sign, PHTerm);
+      BSV = getPlusOne(SplitValue, Sign, PHTerm);
+    } else {
+      assert (0 && "Unexpected split condition!");
+    }
+  }
+  else if (IVisLE(*ExitCondition)) {
+    if (IVisLT(*SplitCondition)) {
+      AEV = getMinusOne(SplitValue, Sign, PHTerm);
+    }
+    else if (IVisLE(*SplitCondition)) {
+      BSV = getPlusOne(SplitValue, Sign, PHTerm);
+    } else {
+      assert (0 && "Unexpected split condition!");
+    }
+  } else {
+    assert (0 && "Unexpected exit condition!");
+  }
+  AEV = getMin(AEV, IVExitValue, Sign, PHTerm);
+  BSV = getMax(BSV, IVStartValue, Sign, PHTerm);
+
+  // [*] Clone Loop
+  DenseMap<const Value *, Value *> ValueMap;
+  Loop *BLoop = CloneLoop(L, LPM, LI, ValueMap, this);
+  Loop *ALoop = L;
+
+  // [*] ALoop's exiting edge enters BLoop's header.
+  //    ALoop's original exit block becomes BLoop's exit block.
+  PHINode *B_IndVar = cast<PHINode>(ValueMap[IndVar]);
+  BasicBlock *A_ExitingBlock = ExitCondition->getParent();
+  BranchInst *A_ExitInsn =
+    dyn_cast<BranchInst>(A_ExitingBlock->getTerminator());
+  assert (A_ExitInsn && "Unable to find suitable loop exit branch");
+  BasicBlock *B_ExitBlock = A_ExitInsn->getSuccessor(1);
+  BasicBlock *B_Header = BLoop->getHeader();
+  if (ALoop->contains(B_ExitBlock)) {
+    B_ExitBlock = A_ExitInsn->getSuccessor(0);
+    A_ExitInsn->setSuccessor(0, B_Header);
+  } else
+    A_ExitInsn->setSuccessor(1, B_Header);
+
+  // [*] Update ALoop's exit value using new exit value.
+  ExitCondition->setOperand(EVOpNum, AEV);
+
+  // [*] Update BLoop's header phi nodes. Remove incoming PHINode's from
+  //     original loop's preheader. Add incoming PHINode values from
+  //     ALoop's exiting block. Update BLoop header's domiantor info.
+
+  // Collect inverse map of Header PHINodes.
+  DenseMap<Value *, Value *> InverseMap;
+  for (BasicBlock::iterator BI = ALoop->getHeader()->begin(), 
+         BE = ALoop->getHeader()->end(); BI != BE; ++BI) {
+    if (PHINode *PN = dyn_cast<PHINode>(BI)) {
+      PHINode *PNClone = cast<PHINode>(ValueMap[PN]);
+      InverseMap[PNClone] = PN;
+    } else
+      break;
+  }
+
+  BasicBlock *A_Preheader = ALoop->getLoopPreheader();
+  for (BasicBlock::iterator BI = B_Header->begin(), BE = B_Header->end();
+       BI != BE; ++BI) {
+    if (PHINode *PN = dyn_cast<PHINode>(BI)) {
+      // Remove incoming value from original preheader.
+      PN->removeIncomingValue(A_Preheader);
+
+      // Add incoming value from A_ExitingBlock.
+      if (PN == B_IndVar)
+        PN->addIncoming(BSV, A_ExitingBlock);
+      else { 
+        PHINode *OrigPN = cast<PHINode>(InverseMap[PN]);
+        Value *V2 = NULL;
+        // If loop header is also loop exiting block then
+        // OrigPN is incoming value for B loop header.
+        if (A_ExitingBlock == ALoop->getHeader())
+          V2 = OrigPN;
+        else
+          V2 = OrigPN->getIncomingValueForBlock(A_ExitingBlock);
+        PN->addIncoming(V2, A_ExitingBlock);
+      }
+    } else
+      break;
+  }
+
+  DT->changeImmediateDominator(B_Header, A_ExitingBlock);
+  DF->changeImmediateDominator(B_Header, A_ExitingBlock, DT);
+  
+  // [*] Update BLoop's exit block. Its new predecessor is BLoop's exit
+  //     block. Remove incoming PHINode values from ALoop's exiting block.
+  //     Add new incoming values from BLoop's incoming exiting value.
+  //     Update BLoop exit block's dominator info..
+  BasicBlock *B_ExitingBlock = cast<BasicBlock>(ValueMap[A_ExitingBlock]);
+  for (BasicBlock::iterator BI = B_ExitBlock->begin(), BE = B_ExitBlock->end();
+       BI != BE; ++BI) {
+    if (PHINode *PN = dyn_cast<PHINode>(BI)) {
+      PN->addIncoming(ValueMap[PN->getIncomingValueForBlock(A_ExitingBlock)], 
+                                                            B_ExitingBlock);
+      PN->removeIncomingValue(A_ExitingBlock);
+    } else
+      break;
+  }
+
+  DT->changeImmediateDominator(B_ExitBlock, B_ExitingBlock);
+  DF->changeImmediateDominator(B_ExitBlock, B_ExitingBlock, DT);
+
+  //[*] Split ALoop's exit edge. This creates a new block which
+  //    serves two purposes. First one is to hold PHINode defnitions
+  //    to ensure that ALoop's LCSSA form. Second use it to act
+  //    as a preheader for BLoop.
+  BasicBlock *A_ExitBlock = SplitEdge(A_ExitingBlock, B_Header, this);
+
+  //[*] Preserve ALoop's LCSSA form. Create new forwarding PHINodes
+  //    in A_ExitBlock to redefine outgoing PHI definitions from ALoop.
+  for(BasicBlock::iterator BI = B_Header->begin(), BE = B_Header->end();
+      BI != BE; ++BI) {
+    if (PHINode *PN = dyn_cast<PHINode>(BI)) {
+      Value *V1 = PN->getIncomingValueForBlock(A_ExitBlock);
+      PHINode *newPHI = PHINode::Create(PN->getType(), PN->getName());
+      newPHI->addIncoming(V1, A_ExitingBlock);
+      A_ExitBlock->getInstList().push_front(newPHI);
+      PN->removeIncomingValue(A_ExitBlock);
+      PN->addIncoming(newPHI, A_ExitBlock);
+    } else
+      break;
+  }
+
+  //[*] Eliminate split condition's inactive branch from ALoop.
+  BasicBlock *A_SplitCondBlock = SplitCondition->getParent();
+  BranchInst *A_BR = cast<BranchInst>(A_SplitCondBlock->getTerminator());
+  BasicBlock *A_InactiveBranch = NULL;
+  BasicBlock *A_ActiveBranch = NULL;
+  A_ActiveBranch = A_BR->getSuccessor(0);
+  A_InactiveBranch = A_BR->getSuccessor(1);
+  A_BR->setUnconditionalDest(A_ActiveBranch);
+  removeBlocks(A_InactiveBranch, L, A_ActiveBranch);
+
+  //[*] Eliminate split condition's inactive branch in from BLoop.
+  BasicBlock *B_SplitCondBlock = cast<BasicBlock>(ValueMap[A_SplitCondBlock]);
+  BranchInst *B_BR = cast<BranchInst>(B_SplitCondBlock->getTerminator());
+  BasicBlock *B_InactiveBranch = NULL;
+  BasicBlock *B_ActiveBranch = NULL;
+  B_ActiveBranch = B_BR->getSuccessor(1);
+  B_InactiveBranch = B_BR->getSuccessor(0);
+  B_BR->setUnconditionalDest(B_ActiveBranch);
+  removeBlocks(B_InactiveBranch, BLoop, B_ActiveBranch);
+
+  BasicBlock *A_Header = ALoop->getHeader();
+  if (A_ExitingBlock == A_Header)
+    return true;
+
+  //[*] Move exit condition into split condition block to avoid
+  //    executing dead loop iteration.
+  ICmpInst *B_ExitCondition = cast<ICmpInst>(ValueMap[ExitCondition]);
+  Instruction *B_IndVarIncrement = cast<Instruction>(ValueMap[IVIncrement]);
+  ICmpInst *B_SplitCondition = cast<ICmpInst>(ValueMap[SplitCondition]);
+
+  moveExitCondition(A_SplitCondBlock, A_ActiveBranch, A_ExitBlock, ExitCondition,
+                    cast<ICmpInst>(SplitCondition), IndVar, IVIncrement, 
+                    ALoop, EVOpNum);
+
+  moveExitCondition(B_SplitCondBlock, B_ActiveBranch, 
+                    B_ExitBlock, B_ExitCondition,
+                    B_SplitCondition, B_IndVar, B_IndVarIncrement, 
+                    BLoop, EVOpNum);
+
+  NumIndexSplit++;
+  return true;
+}
+
+/// cleanBlock - A block is considered clean if all non terminal instructions 
+/// are either, PHINodes, IV based.
+bool LoopIndexSplit::cleanBlock(BasicBlock *BB) {
+  Instruction *Terminator = BB->getTerminator();
+  for(BasicBlock::iterator BI = BB->begin(), BE = BB->end(); 
+      BI != BE; ++BI) {
+    Instruction *I = BI;
+
+    if (isa<PHINode>(I) || I == Terminator || I == ExitCondition
+        || I == SplitCondition || IVBasedValues.count(I) 
+        || isa<DbgInfoIntrinsic>(I))
+      continue;
+
+    if (I->mayHaveSideEffects())
+      return false;
+
+    // I is used only inside this block then it is OK.
+    bool usedOutsideBB = false;
+    for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); 
+         UI != UE; ++UI) {
+      Instruction *U = cast<Instruction>(UI);
+      if (U->getParent() != BB)
+        usedOutsideBB = true;
+    }
+    if (!usedOutsideBB)
+      continue;
+
+    // Otherwise we have a instruction that may not allow loop spliting.
+    return false;
+  }
+  return true;
+}
+
+/// IVisLT - If Op is comparing IV based value with an loop invariant and 
+/// IV based value is less than  the loop invariant then return the loop 
+/// invariant. Otherwise return NULL.
+Value * LoopIndexSplit::IVisLT(ICmpInst &Op) {
+  ICmpInst::Predicate P = Op.getPredicate();
+  if ((P == ICmpInst::ICMP_SLT || P == ICmpInst::ICMP_ULT) 
+      && IVBasedValues.count(Op.getOperand(0)) 
+      && L->isLoopInvariant(Op.getOperand(1)))
+    return Op.getOperand(1);
+
+  if ((P == ICmpInst::ICMP_SGT || P == ICmpInst::ICMP_UGT) 
+      && IVBasedValues.count(Op.getOperand(1)) 
+      && L->isLoopInvariant(Op.getOperand(0)))
+    return Op.getOperand(0);
+
+  return NULL;
+}
+
+/// IVisLE - If Op is comparing IV based value with an loop invariant and 
+/// IV based value is less than or equal to the loop invariant then 
+/// return the loop invariant. Otherwise return NULL.
+Value * LoopIndexSplit::IVisLE(ICmpInst &Op) {
+  ICmpInst::Predicate P = Op.getPredicate();
+  if ((P == ICmpInst::ICMP_SLE || P == ICmpInst::ICMP_ULE)
+      && IVBasedValues.count(Op.getOperand(0)) 
+      && L->isLoopInvariant(Op.getOperand(1)))
+    return Op.getOperand(1);
+
+  if ((P == ICmpInst::ICMP_SGE || P == ICmpInst::ICMP_UGE) 
+      && IVBasedValues.count(Op.getOperand(1)) 
+      && L->isLoopInvariant(Op.getOperand(0)))
+    return Op.getOperand(0);
+
+  return NULL;
+}
+
+/// IVisGT - If Op is comparing IV based value with an loop invariant and 
+/// IV based value is greater than  the loop invariant then return the loop 
+/// invariant. Otherwise return NULL.
+Value * LoopIndexSplit::IVisGT(ICmpInst &Op) {
+  ICmpInst::Predicate P = Op.getPredicate();
+  if ((P == ICmpInst::ICMP_SGT || P == ICmpInst::ICMP_UGT) 
+      && IVBasedValues.count(Op.getOperand(0)) 
+      && L->isLoopInvariant(Op.getOperand(1)))
+    return Op.getOperand(1);
+
+  if ((P == ICmpInst::ICMP_SLT || P == ICmpInst::ICMP_ULT) 
+      && IVBasedValues.count(Op.getOperand(1)) 
+      && L->isLoopInvariant(Op.getOperand(0)))
+    return Op.getOperand(0);
+
+  return NULL;
+}
+
+/// IVisGE - If Op is comparing IV based value with an loop invariant and 
+/// IV based value is greater than or equal to the loop invariant then 
+/// return the loop invariant. Otherwise return NULL.
+Value * LoopIndexSplit::IVisGE(ICmpInst &Op) {
+  ICmpInst::Predicate P = Op.getPredicate();
+  if ((P == ICmpInst::ICMP_SGE || P == ICmpInst::ICMP_UGE)
+      && IVBasedValues.count(Op.getOperand(0)) 
+      && L->isLoopInvariant(Op.getOperand(1)))
+    return Op.getOperand(1);
+
+  if ((P == ICmpInst::ICMP_SLE || P == ICmpInst::ICMP_ULE) 
+      && IVBasedValues.count(Op.getOperand(1)) 
+      && L->isLoopInvariant(Op.getOperand(0)))
+    return Op.getOperand(0);
+
+  return NULL;
+}
+
diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp
new file mode 100644
index 0000000..a088230
--- /dev/null
+++ b/lib/Transforms/Scalar/LoopRotation.cpp
@@ -0,0 +1,572 @@
+//===- LoopRotation.cpp - Loop Rotation Pass ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements Loop Rotation Pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-rotate"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Function.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/SmallVector.h"
+using namespace llvm;
+
+#define MAX_HEADER_SIZE 16
+
+STATISTIC(NumRotated, "Number of loops rotated");
+namespace {
+
+  class VISIBILITY_HIDDEN RenameData {
+  public:
+    RenameData(Instruction *O, Value *P, Instruction *H) 
+      : Original(O), PreHeader(P), Header(H) { }
+  public:
+    Instruction *Original; // Original instruction
+    Value *PreHeader; // Original pre-header replacement
+    Instruction *Header; // New header replacement
+  };
+  
+  class VISIBILITY_HIDDEN LoopRotate : public LoopPass {
+
+  public:
+    static char ID; // Pass ID, replacement for typeid
+    LoopRotate() : LoopPass(&ID) {}
+
+    // Rotate Loop L as many times as possible. Return true if
+    // loop is rotated at least once.
+    bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+    // LCSSA form makes instruction renaming easier.
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequiredID(LoopSimplifyID);
+      AU.addPreservedID(LoopSimplifyID);
+      AU.addRequiredID(LCSSAID);
+      AU.addPreservedID(LCSSAID);
+      AU.addPreserved<ScalarEvolution>();
+      AU.addPreserved<LoopInfo>();
+      AU.addPreserved<DominatorTree>();
+      AU.addPreserved<DominanceFrontier>();
+    }
+
+    // Helper functions
+
+    /// Do actual work
+    bool rotateLoop(Loop *L, LPPassManager &LPM);
+    
+    /// Initialize local data
+    void initialize();
+
+    /// Make sure all Exit block PHINodes have required incoming values.
+    /// If incoming value is constant or defined outside the loop then
+    /// PHINode may not have an entry for original pre-header. 
+    void  updateExitBlock();
+
+    /// Return true if this instruction is used outside original header.
+    bool usedOutsideOriginalHeader(Instruction *In);
+
+    /// Find Replacement information for instruction. Return NULL if it is
+    /// not available.
+    const RenameData *findReplacementData(Instruction *I);
+
+    /// After loop rotation, loop pre-header has multiple sucessors.
+    /// Insert one forwarding basic block to ensure that loop pre-header
+    /// has only one successor.
+    void preserveCanonicalLoopForm(LPPassManager &LPM);
+
+  private:
+
+    Loop *L;
+    BasicBlock *OrigHeader;
+    BasicBlock *OrigPreHeader;
+    BasicBlock *OrigLatch;
+    BasicBlock *NewHeader;
+    BasicBlock *Exit;
+    LPPassManager *LPM_Ptr;
+    SmallVector<RenameData, MAX_HEADER_SIZE> LoopHeaderInfo;
+  };
+}
+  
+char LoopRotate::ID = 0;
+static RegisterPass<LoopRotate> X("loop-rotate", "Rotate Loops");
+
+Pass *llvm::createLoopRotatePass() { return new LoopRotate(); }
+
+/// Rotate Loop L as many times as possible. Return true if
+/// loop is rotated at least once.
+bool LoopRotate::runOnLoop(Loop *Lp, LPPassManager &LPM) {
+
+  bool RotatedOneLoop = false;
+  initialize();
+  LPM_Ptr = &LPM;
+
+  // One loop can be rotated multiple times.
+  while (rotateLoop(Lp,LPM)) {
+    RotatedOneLoop = true;
+    initialize();
+  }
+
+  return RotatedOneLoop;
+}
+
+/// Rotate loop LP. Return true if the loop is rotated.
+bool LoopRotate::rotateLoop(Loop *Lp, LPPassManager &LPM) {
+  L = Lp;
+
+  OrigHeader =  L->getHeader();
+  OrigPreHeader = L->getLoopPreheader();
+  OrigLatch = L->getLoopLatch();
+
+  // If loop has only one block then there is not much to rotate.
+  if (L->getBlocks().size() == 1)
+    return false;
+
+  assert(OrigHeader && OrigLatch && OrigPreHeader &&
+         "Loop is not in canonical form");
+
+  // If loop header is not one of the loop exit block then
+  // either this loop is already rotated or it is not 
+  // suitable for loop rotation transformations.
+  if (!L->isLoopExit(OrigHeader))
+    return false;
+
+  BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
+  if (!BI)
+    return false;
+  assert(BI->isConditional() && "Branch Instruction is not conditional");
+
+  // Updating PHInodes in loops with multiple exits adds complexity. 
+  // Keep it simple, and restrict loop rotation to loops with one exit only.
+  // In future, lift this restriction and support for multiple exits if
+  // required.
+  SmallVector<BasicBlock*, 8> ExitBlocks;
+  L->getExitBlocks(ExitBlocks);
+  if (ExitBlocks.size() > 1)
+    return false;
+
+  // Check size of original header and reject
+  // loop if it is very big.
+  unsigned Size = 0;
+  
+  // FIXME: Use common api to estimate size.
+  for (BasicBlock::const_iterator OI = OrigHeader->begin(), 
+         OE = OrigHeader->end(); OI != OE; ++OI) {
+      if (isa<PHINode>(OI)) 
+        continue;           // PHI nodes don't count.
+      if (isa<DbgInfoIntrinsic>(OI))
+        continue;  // Debug intrinsics don't count as size.
+      Size++;
+  }
+
+  if (Size > MAX_HEADER_SIZE)
+    return false;
+
+  // Now, this loop is suitable for rotation.
+
+  // Find new Loop header. NewHeader is a Header's one and only successor
+  // that is inside loop.  Header's other successor is outside the
+  // loop.  Otherwise loop is not suitable for rotation.
+  Exit = BI->getSuccessor(0);
+  NewHeader = BI->getSuccessor(1);
+  if (L->contains(Exit))
+    std::swap(Exit, NewHeader);
+  assert(NewHeader && "Unable to determine new loop header");
+  assert(L->contains(NewHeader) && !L->contains(Exit) && 
+         "Unable to determine loop header and exit blocks");
+  
+  // This code assumes that new header has exactly one predecessor.  Remove any
+  // single entry PHI nodes in it.
+  assert(NewHeader->getSinglePredecessor() &&
+         "New header doesn't have one pred!");
+  FoldSingleEntryPHINodes(NewHeader);
+
+  // Copy PHI nodes and other instructions from original header
+  // into original pre-header. Unlike original header, original pre-header is
+  // not a member of loop. 
+  //
+  // New loop header is one and only successor of original header that 
+  // is inside the loop. All other original header successors are outside 
+  // the loop. Copy PHI Nodes from original header into new loop header. 
+  // Add second incoming value, from original loop pre-header into these phi 
+  // nodes. If a value defined in original header is used outside original 
+  // header then new loop header will need new phi nodes with two incoming 
+  // values, one definition from original header and second definition is 
+  // from original loop pre-header.
+
+  // Remove terminator from Original pre-header. Original pre-header will
+  // receive a clone of original header terminator as a new terminator.
+  OrigPreHeader->getInstList().pop_back();
+  BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
+  PHINode *PN = 0;
+  for (; (PN = dyn_cast<PHINode>(I)); ++I) {
+    // PHI nodes are not copied into original pre-header. Instead their values
+    // are directly propagated.
+    Value *NPV = PN->getIncomingValueForBlock(OrigPreHeader);
+
+    // Create new PHI node with two incoming values for NewHeader.
+    // One incoming value is from OrigLatch (through OrigHeader) and 
+    // second incoming value is from original pre-header.
+    PHINode *NH = PHINode::Create(PN->getType(), PN->getName(),
+                                  NewHeader->begin());
+    NH->addIncoming(PN->getIncomingValueForBlock(OrigLatch), OrigHeader);
+    NH->addIncoming(NPV, OrigPreHeader);
+    
+    // "In" can be replaced by NH at various places.
+    LoopHeaderInfo.push_back(RenameData(PN, NPV, NH));
+  }
+
+  // Now, handle non-phi instructions.
+  for (; I != E; ++I) {
+    Instruction *In = I;
+    assert(!isa<PHINode>(In) && "PHINode is not expected here");
+    
+    // This is not a PHI instruction. Insert its clone into original pre-header.
+    // If this instruction is using a value from same basic block then
+    // update it to use value from cloned instruction.
+    Instruction *C = In->clone();
+    C->setName(In->getName());
+    OrigPreHeader->getInstList().push_back(C);
+
+    for (unsigned opi = 0, e = In->getNumOperands(); opi != e; ++opi) {
+      Instruction *OpInsn = dyn_cast<Instruction>(In->getOperand(opi));
+      if (!OpInsn) continue;  // Ignore non-instruction values.
+      if (const RenameData *D = findReplacementData(OpInsn))
+        C->setOperand(opi, D->PreHeader);
+    }
+
+    // If this instruction is used outside this basic block then
+    // create new PHINode for this instruction.
+    Instruction *NewHeaderReplacement = NULL;
+    if (usedOutsideOriginalHeader(In)) {
+      PHINode *PN = PHINode::Create(In->getType(), In->getName(),
+                                    NewHeader->begin());
+      PN->addIncoming(In, OrigHeader);
+      PN->addIncoming(C, OrigPreHeader);
+      NewHeaderReplacement = PN;
+    }
+    LoopHeaderInfo.push_back(RenameData(In, C, NewHeaderReplacement));
+  }
+
+  // Rename uses of original header instructions to reflect their new
+  // definitions (either from original pre-header node or from newly created
+  // new header PHINodes.
+  //
+  // Original header instructions are used in
+  // 1) Original header:
+  //
+  //    If instruction is used in non-phi instructions then it is using
+  //    defintion from original heder iteself. Do not replace this use
+  //    with definition from new header or original pre-header.
+  //
+  //    If instruction is used in phi node then it is an incoming 
+  //    value. Rename its use to reflect new definition from new-preheader
+  //    or new header.
+  //
+  // 2) Inside loop but not in original header
+  //
+  //    Replace this use to reflect definition from new header.
+  for (unsigned LHI = 0, LHI_E = LoopHeaderInfo.size(); LHI != LHI_E; ++LHI) {
+    const RenameData &ILoopHeaderInfo = LoopHeaderInfo[LHI];
+
+    if (!ILoopHeaderInfo.Header)
+      continue;
+
+    Instruction *OldPhi = ILoopHeaderInfo.Original;
+    Instruction *NewPhi = ILoopHeaderInfo.Header;
+
+    // Before replacing uses, collect them first, so that iterator is
+    // not invalidated.
+    SmallVector<Instruction *, 16> AllUses;
+    for (Value::use_iterator UI = OldPhi->use_begin(), UE = OldPhi->use_end();
+         UI != UE; ++UI)
+      AllUses.push_back(cast<Instruction>(UI));
+
+    for (SmallVector<Instruction *, 16>::iterator UI = AllUses.begin(), 
+           UE = AllUses.end(); UI != UE; ++UI) {
+      Instruction *U = *UI;
+      BasicBlock *Parent = U->getParent();
+
+      // Used inside original header
+      if (Parent == OrigHeader) {
+        // Do not rename uses inside original header non-phi instructions.
+        PHINode *PU = dyn_cast<PHINode>(U);
+        if (!PU)
+          continue;
+
+        // Do not rename uses inside original header phi nodes, if the
+        // incoming value is for new header.
+        if (PU->getBasicBlockIndex(NewHeader) != -1
+            && PU->getIncomingValueForBlock(NewHeader) == U)
+          continue;
+        
+       U->replaceUsesOfWith(OldPhi, NewPhi);
+       continue;
+      }
+
+      // Used inside loop, but not in original header.
+      if (L->contains(U->getParent())) {
+        if (U != NewPhi)
+          U->replaceUsesOfWith(OldPhi, NewPhi);
+        continue;
+      }
+      
+      // Used inside Exit Block. Since we are in LCSSA form, U must be PHINode.
+      if (U->getParent() == Exit) {
+        assert(isa<PHINode>(U) && "Use in Exit Block that is not PHINode");
+        
+        PHINode *UPhi = cast<PHINode>(U);
+        // UPhi already has one incoming argument from original header. 
+        // Add second incoming argument from new Pre header.
+        UPhi->addIncoming(ILoopHeaderInfo.PreHeader, OrigPreHeader);
+      } else {
+        // Used outside Exit block. Create a new PHI node from exit block
+        // to receive value from ne new header ane pre header.
+        PHINode *PN = PHINode::Create(U->getType(), U->getName(),
+                                      Exit->begin());
+        PN->addIncoming(ILoopHeaderInfo.PreHeader, OrigPreHeader);
+        PN->addIncoming(OldPhi, OrigHeader);
+        U->replaceUsesOfWith(OldPhi, PN);
+      }
+    }
+  }
+  
+  /// Make sure all Exit block PHINodes have required incoming values.
+  updateExitBlock();
+
+  // Update CFG
+
+  // Removing incoming branch from loop preheader to original header.
+  // Now original header is inside the loop.
+  for (BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
+       I != E; ++I)
+    if (PHINode *PN = dyn_cast<PHINode>(I))
+      PN->removeIncomingValue(OrigPreHeader);
+
+  // Make NewHeader as the new header for the loop.
+  L->moveToHeader(NewHeader);
+
+  preserveCanonicalLoopForm(LPM);
+
+  NumRotated++;
+  return true;
+}
+
+/// Make sure all Exit block PHINodes have required incoming values.
+/// If incoming value is constant or defined outside the loop then
+/// PHINode may not have an entry for original pre-header. 
+void LoopRotate::updateExitBlock() {
+
+  for (BasicBlock::iterator I = Exit->begin(), E = Exit->end();
+       I != E; ++I) {
+
+    PHINode *PN = dyn_cast<PHINode>(I);
+    if (!PN)
+      break;
+
+    // There is already one incoming value from original pre-header block.
+    if (PN->getBasicBlockIndex(OrigPreHeader) != -1)
+      continue;
+
+    const RenameData *ILoopHeaderInfo;
+    Value *V = PN->getIncomingValueForBlock(OrigHeader);
+    if (isa<Instruction>(V) && 
+        (ILoopHeaderInfo = findReplacementData(cast<Instruction>(V)))) {
+      assert(ILoopHeaderInfo->PreHeader && "Missing New Preheader Instruction");
+      PN->addIncoming(ILoopHeaderInfo->PreHeader, OrigPreHeader);
+    } else {
+      PN->addIncoming(V, OrigPreHeader);
+    }
+  }
+}
+
+/// Initialize local data
+void LoopRotate::initialize() {
+  L = NULL;
+  OrigHeader = NULL;
+  OrigPreHeader = NULL;
+  NewHeader = NULL;
+  Exit = NULL;
+
+  LoopHeaderInfo.clear();
+}
+
+/// Return true if this instruction is used by any instructions in the loop that
+/// aren't in original header.
+bool LoopRotate::usedOutsideOriginalHeader(Instruction *In) {
+  for (Value::use_iterator UI = In->use_begin(), UE = In->use_end();
+       UI != UE; ++UI) {
+    BasicBlock *UserBB = cast<Instruction>(UI)->getParent();
+    if (UserBB != OrigHeader && L->contains(UserBB))
+      return true;
+  }
+
+  return false;
+}
+
+/// Find Replacement information for instruction. Return NULL if it is
+/// not available.
+const RenameData *LoopRotate::findReplacementData(Instruction *In) {
+
+  // Since LoopHeaderInfo is small, linear walk is OK.
+  for (unsigned LHI = 0, LHI_E = LoopHeaderInfo.size(); LHI != LHI_E; ++LHI) {
+    const RenameData &ILoopHeaderInfo = LoopHeaderInfo[LHI];
+    if (ILoopHeaderInfo.Original == In)
+      return &ILoopHeaderInfo;
+  }
+  return NULL;
+}
+
+/// After loop rotation, loop pre-header has multiple sucessors.
+/// Insert one forwarding basic block to ensure that loop pre-header
+/// has only one successor.
+void LoopRotate::preserveCanonicalLoopForm(LPPassManager &LPM) {
+
+  // Right now original pre-header has two successors, new header and
+  // exit block. Insert new block between original pre-header and
+  // new header such that loop's new pre-header has only one successor.
+  BasicBlock *NewPreHeader = BasicBlock::Create("bb.nph",
+                                                OrigHeader->getParent(), 
+                                                NewHeader);
+  LoopInfo &LI = LPM.getAnalysis<LoopInfo>();
+  if (Loop *PL = LI.getLoopFor(OrigPreHeader))
+    PL->addBasicBlockToLoop(NewPreHeader, LI.getBase());
+  BranchInst::Create(NewHeader, NewPreHeader);
+  
+  BranchInst *OrigPH_BI = cast<BranchInst>(OrigPreHeader->getTerminator());
+  if (OrigPH_BI->getSuccessor(0) == NewHeader)
+    OrigPH_BI->setSuccessor(0, NewPreHeader);
+  else {
+    assert(OrigPH_BI->getSuccessor(1) == NewHeader &&
+           "Unexpected original pre-header terminator");
+    OrigPH_BI->setSuccessor(1, NewPreHeader);
+  }
+  
+  for (BasicBlock::iterator I = NewHeader->begin(), E = NewHeader->end();
+       I != E; ++I) {
+    PHINode *PN = dyn_cast<PHINode>(I);
+    if (!PN)
+      break;
+
+    int index = PN->getBasicBlockIndex(OrigPreHeader);
+    assert(index != -1 && "Expected incoming value from Original PreHeader");
+    PN->setIncomingBlock(index, NewPreHeader);
+    assert(PN->getBasicBlockIndex(OrigPreHeader) == -1 && 
+           "Expected only one incoming value from Original PreHeader");
+  }
+
+  if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) {
+    DT->addNewBlock(NewPreHeader, OrigPreHeader);
+    DT->changeImmediateDominator(L->getHeader(), NewPreHeader);
+    DT->changeImmediateDominator(Exit, OrigPreHeader);
+    for (Loop::block_iterator BI = L->block_begin(), BE = L->block_end();
+         BI != BE; ++BI) {
+      BasicBlock *B = *BI;
+      if (L->getHeader() != B) {
+        DomTreeNode *Node = DT->getNode(B);
+        if (Node && Node->getBlock() == OrigHeader)
+          DT->changeImmediateDominator(*BI, L->getHeader());
+      }
+    }
+    DT->changeImmediateDominator(OrigHeader, OrigLatch);
+  }
+
+  if (DominanceFrontier *DF = getAnalysisIfAvailable<DominanceFrontier>()) {
+    // New Preheader's dominance frontier is Exit block.
+    DominanceFrontier::DomSetType NewPHSet;
+    NewPHSet.insert(Exit);
+    DF->addBasicBlock(NewPreHeader, NewPHSet);
+
+    // New Header's dominance frontier now includes itself and Exit block
+    DominanceFrontier::iterator HeadI = DF->find(L->getHeader());
+    if (HeadI != DF->end()) {
+      DominanceFrontier::DomSetType & HeaderSet = HeadI->second;
+      HeaderSet.clear();
+      HeaderSet.insert(L->getHeader());
+      HeaderSet.insert(Exit);
+    } else {
+      DominanceFrontier::DomSetType HeaderSet;
+      HeaderSet.insert(L->getHeader());
+      HeaderSet.insert(Exit);
+      DF->addBasicBlock(L->getHeader(), HeaderSet);
+    }
+
+    // Original header (new Loop Latch)'s dominance frontier is Exit.
+    DominanceFrontier::iterator LatchI = DF->find(L->getLoopLatch());
+    if (LatchI != DF->end()) {
+      DominanceFrontier::DomSetType &LatchSet = LatchI->second;
+      LatchSet = LatchI->second;
+      LatchSet.clear();
+      LatchSet.insert(Exit);
+    } else {
+      DominanceFrontier::DomSetType LatchSet;
+      LatchSet.insert(Exit);
+      DF->addBasicBlock(L->getHeader(), LatchSet);
+    }
+
+    // If a loop block dominates new loop latch then its frontier is
+    // new header and Exit.
+    BasicBlock *NewLatch = L->getLoopLatch();
+    DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>();
+    for (Loop::block_iterator BI = L->block_begin(), BE = L->block_end();
+         BI != BE; ++BI) {
+      BasicBlock *B = *BI;
+      if (DT->dominates(B, NewLatch)) {
+        DominanceFrontier::iterator BDFI = DF->find(B);
+        if (BDFI != DF->end()) {
+          DominanceFrontier::DomSetType &BSet = BDFI->second;
+          BSet = BDFI->second;
+          BSet.clear();
+          BSet.insert(L->getHeader());
+          BSet.insert(Exit);
+        } else {
+          DominanceFrontier::DomSetType BSet;
+          BSet.insert(L->getHeader());
+          BSet.insert(Exit);
+          DF->addBasicBlock(B, BSet);
+        }
+      }
+    }
+  }
+
+  // Preserve canonical loop form, which means Exit block should
+  // have only one predecessor.
+  BasicBlock *NExit = SplitEdge(L->getLoopLatch(), Exit, this);
+
+  // Preserve LCSSA.
+  BasicBlock::iterator I = Exit->begin(), E = Exit->end();
+  PHINode *PN = NULL;
+  for (; (PN = dyn_cast<PHINode>(I)); ++I) {
+    unsigned N = PN->getNumIncomingValues();
+    for (unsigned index = 0; index < N; ++index)
+      if (PN->getIncomingBlock(index) == NExit) {
+        PHINode *NewPN = PHINode::Create(PN->getType(), PN->getName(),
+                                         NExit->begin());
+        NewPN->addIncoming(PN->getIncomingValue(index), L->getLoopLatch());
+        PN->setIncomingValue(index, NewPN);
+        PN->setIncomingBlock(index, NExit);
+        break;
+      }
+  }
+
+  assert(NewHeader && L->getHeader() == NewHeader &&
+         "Invalid loop header after loop rotation");
+  assert(NewPreHeader && L->getLoopPreheader() == NewPreHeader &&
+         "Invalid loop preheader after loop rotation");
+  assert(L->getLoopLatch() &&
+         "Invalid loop latch after loop rotation");
+}
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
new file mode 100644
index 0000000..92270b5
--- /dev/null
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -0,0 +1,2605 @@
+//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation analyzes and transforms the induction variables (and
+// computations derived from them) into forms suitable for efficient execution
+// on the target.
+//
+// This pass performs a strength reduction on array references inside loops that
+// have as one or more of their components the loop induction variable, it
+// rewrites expressions to take advantage of scaled-index addressing modes
+// available on the target, and it performs a variety of other optimizations
+// related to loop induction variables.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-reduce"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Type.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Transforms/Utils/AddrModeMatcher.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/Target/TargetLowering.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumReduced ,    "Number of IV uses strength reduced");
+STATISTIC(NumInserted,    "Number of PHIs inserted");
+STATISTIC(NumVariable,    "Number of PHIs with variable strides");
+STATISTIC(NumEliminated,  "Number of strides eliminated");
+STATISTIC(NumShadow,      "Number of Shadow IVs optimized");
+STATISTIC(NumImmSunk,     "Number of common expr immediates sunk into uses");
+STATISTIC(NumLoopCond,    "Number of loop terminating conds optimized");
+
+static cl::opt<bool> EnableFullLSRMode("enable-full-lsr",
+                                       cl::init(false),
+                                       cl::Hidden);
+
+namespace {
+
+  struct BasedUser;
+
+  /// IVInfo - This structure keeps track of one IV expression inserted during
+  /// StrengthReduceStridedIVUsers. It contains the stride, the common base, as
+  /// well as the PHI node and increment value created for rewrite.
+  struct VISIBILITY_HIDDEN IVExpr {
+    SCEVHandle  Stride;
+    SCEVHandle  Base;
+    PHINode    *PHI;
+
+    IVExpr(const SCEVHandle &stride, const SCEVHandle &base, PHINode *phi)
+      : Stride(stride), Base(base), PHI(phi) {}
+  };
+
+  /// IVsOfOneStride - This structure keeps track of all IV expression inserted
+  /// during StrengthReduceStridedIVUsers for a particular stride of the IV.
+  struct VISIBILITY_HIDDEN IVsOfOneStride {
+    std::vector<IVExpr> IVs;
+
+    void addIV(const SCEVHandle &Stride, const SCEVHandle &Base, PHINode *PHI) {
+      IVs.push_back(IVExpr(Stride, Base, PHI));
+    }
+  };
+
+  class VISIBILITY_HIDDEN LoopStrengthReduce : public LoopPass {
+    IVUsers *IU;
+    LoopInfo *LI;
+    DominatorTree *DT;
+    ScalarEvolution *SE;
+    bool Changed;
+
+    /// IVsByStride - Keep track of all IVs that have been inserted for a
+    /// particular stride.
+    std::map<SCEVHandle, IVsOfOneStride> IVsByStride;
+
+    /// StrideNoReuse - Keep track of all the strides whose ivs cannot be
+    /// reused (nor should they be rewritten to reuse other strides).
+    SmallSet<SCEVHandle, 4> StrideNoReuse;
+
+    /// DeadInsts - Keep track of instructions we may have made dead, so that
+    /// we can remove them after we are done working.
+    SmallVector<WeakVH, 16> DeadInsts;
+
+    /// TLI - Keep a pointer of a TargetLowering to consult for determining
+    /// transformation profitability.
+    const TargetLowering *TLI;
+
+  public:
+    static char ID; // Pass ID, replacement for typeid
+    explicit LoopStrengthReduce(const TargetLowering *tli = NULL) : 
+      LoopPass(&ID), TLI(tli) {
+    }
+
+    bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      // We split critical edges, so we change the CFG.  However, we do update
+      // many analyses if they are around.
+      AU.addPreservedID(LoopSimplifyID);
+      AU.addPreserved<LoopInfo>();
+      AU.addPreserved<DominanceFrontier>();
+      AU.addPreserved<DominatorTree>();
+
+      AU.addRequiredID(LoopSimplifyID);
+      AU.addRequired<LoopInfo>();
+      AU.addRequired<DominatorTree>();
+      AU.addRequired<ScalarEvolution>();
+      AU.addPreserved<ScalarEvolution>();
+      AU.addRequired<IVUsers>();
+      AU.addPreserved<IVUsers>();
+    }
+
+  private:
+    ICmpInst *ChangeCompareStride(Loop *L, ICmpInst *Cond,
+                                  IVStrideUse* &CondUse,
+                                  const SCEVHandle* &CondStride);
+
+    void OptimizeIndvars(Loop *L);
+    void OptimizeLoopCountIV(Loop *L);
+    void OptimizeLoopTermCond(Loop *L);
+
+    /// OptimizeShadowIV - If IV is used in a int-to-float cast
+    /// inside the loop then try to eliminate the cast opeation.
+    void OptimizeShadowIV(Loop *L);
+
+    /// OptimizeSMax - Rewrite the loop's terminating condition
+    /// if it uses an smax computation.
+    ICmpInst *OptimizeSMax(Loop *L, ICmpInst *Cond,
+                           IVStrideUse* &CondUse);
+
+    bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse,
+                           const SCEVHandle *&CondStride);
+    bool RequiresTypeConversion(const Type *Ty, const Type *NewTy);
+    SCEVHandle CheckForIVReuse(bool, bool, bool, const SCEVHandle&,
+                             IVExpr&, const Type*,
+                             const std::vector<BasedUser>& UsersToProcess);
+    bool ValidScale(bool, int64_t,
+                    const std::vector<BasedUser>& UsersToProcess);
+    bool ValidOffset(bool, int64_t, int64_t,
+                     const std::vector<BasedUser>& UsersToProcess);
+    SCEVHandle CollectIVUsers(const SCEVHandle &Stride,
+                              IVUsersOfOneStride &Uses,
+                              Loop *L,
+                              bool &AllUsesAreAddresses,
+                              bool &AllUsesAreOutsideLoop,
+                              std::vector<BasedUser> &UsersToProcess);
+    bool ShouldUseFullStrengthReductionMode(
+                                const std::vector<BasedUser> &UsersToProcess,
+                                const Loop *L,
+                                bool AllUsesAreAddresses,
+                                SCEVHandle Stride);
+    void PrepareToStrengthReduceFully(
+                             std::vector<BasedUser> &UsersToProcess,
+                             SCEVHandle Stride,
+                             SCEVHandle CommonExprs,
+                             const Loop *L,
+                             SCEVExpander &PreheaderRewriter);
+    void PrepareToStrengthReduceFromSmallerStride(
+                                         std::vector<BasedUser> &UsersToProcess,
+                                         Value *CommonBaseV,
+                                         const IVExpr &ReuseIV,
+                                         Instruction *PreInsertPt);
+    void PrepareToStrengthReduceWithNewPhi(
+                                  std::vector<BasedUser> &UsersToProcess,
+                                  SCEVHandle Stride,
+                                  SCEVHandle CommonExprs,
+                                  Value *CommonBaseV,
+                                  Instruction *IVIncInsertPt,
+                                  const Loop *L,
+                                  SCEVExpander &PreheaderRewriter);
+    void StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
+                                      IVUsersOfOneStride &Uses,
+                                      Loop *L);
+    void DeleteTriviallyDeadInstructions();
+  };
+}
+
+char LoopStrengthReduce::ID = 0;
+static RegisterPass<LoopStrengthReduce>
+X("loop-reduce", "Loop Strength Reduction");
+
+Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) {
+  return new LoopStrengthReduce(TLI);
+}
+
+/// DeleteTriviallyDeadInstructions - If any of the instructions is the
+/// specified set are trivially dead, delete them and see if this makes any of
+/// their operands subsequently dead.
+void LoopStrengthReduce::DeleteTriviallyDeadInstructions() {
+  if (DeadInsts.empty()) return;
+  
+  while (!DeadInsts.empty()) {
+    Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.back());
+    DeadInsts.pop_back();
+    
+    if (I == 0 || !isInstructionTriviallyDead(I))
+      continue;
+
+    for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) {
+      if (Instruction *U = dyn_cast<Instruction>(*OI)) {
+        *OI = 0;
+        if (U->use_empty())
+          DeadInsts.push_back(U);
+      }
+    }
+    
+    I->eraseFromParent();
+    Changed = true;
+  }
+}
+
+/// containsAddRecFromDifferentLoop - Determine whether expression S involves a 
+/// subexpression that is an AddRec from a loop other than L.  An outer loop 
+/// of L is OK, but not an inner loop nor a disjoint loop.
+static bool containsAddRecFromDifferentLoop(SCEVHandle S, Loop *L) {
+  // This is very common, put it first.
+  if (isa<SCEVConstant>(S))
+    return false;
+  if (const SCEVCommutativeExpr *AE = dyn_cast<SCEVCommutativeExpr>(S)) {
+    for (unsigned int i=0; i< AE->getNumOperands(); i++)
+      if (containsAddRecFromDifferentLoop(AE->getOperand(i), L))
+        return true;
+    return false;
+  }
+  if (const SCEVAddRecExpr *AE = dyn_cast<SCEVAddRecExpr>(S)) {
+    if (const Loop *newLoop = AE->getLoop()) {
+      if (newLoop == L)
+        return false;
+      // if newLoop is an outer loop of L, this is OK.
+      if (!LoopInfoBase<BasicBlock>::isNotAlreadyContainedIn(L, newLoop))
+        return false;
+    }
+    return true;
+  }
+  if (const SCEVUDivExpr *DE = dyn_cast<SCEVUDivExpr>(S))
+    return containsAddRecFromDifferentLoop(DE->getLHS(), L) ||
+           containsAddRecFromDifferentLoop(DE->getRHS(), L);
+#if 0
+  // SCEVSDivExpr has been backed out temporarily, but will be back; we'll 
+  // need this when it is.
+  if (const SCEVSDivExpr *DE = dyn_cast<SCEVSDivExpr>(S))
+    return containsAddRecFromDifferentLoop(DE->getLHS(), L) ||
+           containsAddRecFromDifferentLoop(DE->getRHS(), L);
+#endif
+  if (const SCEVCastExpr *CE = dyn_cast<SCEVCastExpr>(S))
+    return containsAddRecFromDifferentLoop(CE->getOperand(), L);
+  return false;
+}
+
+/// isAddressUse - Returns true if the specified instruction is using the
+/// specified value as an address.
+static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
+  bool isAddress = isa<LoadInst>(Inst);
+  if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+    if (SI->getOperand(1) == OperandVal)
+      isAddress = true;
+  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+    // Addressing modes can also be folded into prefetches and a variety
+    // of intrinsics.
+    switch (II->getIntrinsicID()) {
+      default: break;
+      case Intrinsic::prefetch:
+      case Intrinsic::x86_sse2_loadu_dq:
+      case Intrinsic::x86_sse2_loadu_pd:
+      case Intrinsic::x86_sse_loadu_ps:
+      case Intrinsic::x86_sse_storeu_ps:
+      case Intrinsic::x86_sse2_storeu_pd:
+      case Intrinsic::x86_sse2_storeu_dq:
+      case Intrinsic::x86_sse2_storel_dq:
+        if (II->getOperand(1) == OperandVal)
+          isAddress = true;
+        break;
+    }
+  }
+  return isAddress;
+}
+
+/// getAccessType - Return the type of the memory being accessed.
+static const Type *getAccessType(const Instruction *Inst) {
+  const Type *AccessTy = Inst->getType();
+  if (const StoreInst *SI = dyn_cast<StoreInst>(Inst))
+    AccessTy = SI->getOperand(0)->getType();
+  else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+    // Addressing modes can also be folded into prefetches and a variety
+    // of intrinsics.
+    switch (II->getIntrinsicID()) {
+    default: break;
+    case Intrinsic::x86_sse_storeu_ps:
+    case Intrinsic::x86_sse2_storeu_pd:
+    case Intrinsic::x86_sse2_storeu_dq:
+    case Intrinsic::x86_sse2_storel_dq:
+      AccessTy = II->getOperand(1)->getType();
+      break;
+    }
+  }
+  return AccessTy;
+}
+
+namespace {
+  /// BasedUser - For a particular base value, keep information about how we've
+  /// partitioned the expression so far.
+  struct BasedUser {
+    /// SE - The current ScalarEvolution object.
+    ScalarEvolution *SE;
+
+    /// Base - The Base value for the PHI node that needs to be inserted for
+    /// this use.  As the use is processed, information gets moved from this
+    /// field to the Imm field (below).  BasedUser values are sorted by this
+    /// field.
+    SCEVHandle Base;
+    
+    /// Inst - The instruction using the induction variable.
+    Instruction *Inst;
+
+    /// OperandValToReplace - The operand value of Inst to replace with the
+    /// EmittedBase.
+    Value *OperandValToReplace;
+
+    /// isSigned - The stride (and thus also the Base) of this use may be in
+    /// a narrower type than the use itself (OperandValToReplace->getType()).
+    /// When this is the case, the isSigned field indicates whether the
+    /// IV expression should be signed-extended instead of zero-extended to
+    /// fit the type of the use.
+    bool isSigned;
+
+    /// Imm - The immediate value that should be added to the base immediately
+    /// before Inst, because it will be folded into the imm field of the
+    /// instruction.  This is also sometimes used for loop-variant values that
+    /// must be added inside the loop.
+    SCEVHandle Imm;
+
+    /// Phi - The induction variable that performs the striding that
+    /// should be used for this user.
+    PHINode *Phi;
+
+    // isUseOfPostIncrementedValue - True if this should use the
+    // post-incremented version of this IV, not the preincremented version.
+    // This can only be set in special cases, such as the terminating setcc
+    // instruction for a loop and uses outside the loop that are dominated by
+    // the loop.
+    bool isUseOfPostIncrementedValue;
+    
+    BasedUser(IVStrideUse &IVSU, ScalarEvolution *se)
+      : SE(se), Base(IVSU.getOffset()), Inst(IVSU.getUser()),
+        OperandValToReplace(IVSU.getOperandValToReplace()),
+        isSigned(IVSU.isSigned()),
+        Imm(SE->getIntegerSCEV(0, Base->getType())), 
+        isUseOfPostIncrementedValue(IVSU.isUseOfPostIncrementedValue()) {}
+
+    // Once we rewrite the code to insert the new IVs we want, update the
+    // operands of Inst to use the new expression 'NewBase', with 'Imm' added
+    // to it.
+    void RewriteInstructionToUseNewBase(const SCEVHandle &NewBase,
+                                        Instruction *InsertPt,
+                                       SCEVExpander &Rewriter, Loop *L, Pass *P,
+                                        LoopInfo &LI,
+                                        SmallVectorImpl<WeakVH> &DeadInsts);
+    
+    Value *InsertCodeForBaseAtPosition(const SCEVHandle &NewBase, 
+                                       const Type *Ty,
+                                       SCEVExpander &Rewriter,
+                                       Instruction *IP, Loop *L,
+                                       LoopInfo &LI);
+    void dump() const;
+  };
+}
+
+void BasedUser::dump() const {
+  cerr << " Base=" << *Base;
+  cerr << " Imm=" << *Imm;
+  cerr << "   Inst: " << *Inst;
+}
+
+Value *BasedUser::InsertCodeForBaseAtPosition(const SCEVHandle &NewBase, 
+                                              const Type *Ty,
+                                              SCEVExpander &Rewriter,
+                                              Instruction *IP, Loop *L,
+                                              LoopInfo &LI) {
+  // Figure out where we *really* want to insert this code.  In particular, if
+  // the user is inside of a loop that is nested inside of L, we really don't
+  // want to insert this expression before the user, we'd rather pull it out as
+  // many loops as possible.
+  Instruction *BaseInsertPt = IP;
+  
+  // Figure out the most-nested loop that IP is in.
+  Loop *InsertLoop = LI.getLoopFor(IP->getParent());
+  
+  // If InsertLoop is not L, and InsertLoop is nested inside of L, figure out
+  // the preheader of the outer-most loop where NewBase is not loop invariant.
+  if (L->contains(IP->getParent()))
+    while (InsertLoop && NewBase->isLoopInvariant(InsertLoop)) {
+      BaseInsertPt = InsertLoop->getLoopPreheader()->getTerminator();
+      InsertLoop = InsertLoop->getParentLoop();
+    }
+  
+  Value *Base = Rewriter.expandCodeFor(NewBase, 0, BaseInsertPt);
+
+  SCEVHandle NewValSCEV = SE->getUnknown(Base);
+
+  // If there is no immediate value, skip the next part.
+  if (!Imm->isZero()) {
+    // If we are inserting the base and imm values in the same block, make sure
+    // to adjust the IP position if insertion reused a result.
+    if (IP == BaseInsertPt)
+      IP = Rewriter.getInsertionPoint();
+
+    // Always emit the immediate (if non-zero) into the same block as the user.
+    NewValSCEV = SE->getAddExpr(NewValSCEV, Imm);
+  }
+
+  if (isSigned)
+    NewValSCEV = SE->getTruncateOrSignExtend(NewValSCEV, Ty);
+  else
+    NewValSCEV = SE->getTruncateOrZeroExtend(NewValSCEV, Ty);
+
+  return Rewriter.expandCodeFor(NewValSCEV, Ty, IP);
+}
+
+
+// Once we rewrite the code to insert the new IVs we want, update the
+// operands of Inst to use the new expression 'NewBase', with 'Imm' added
+// to it. NewBasePt is the last instruction which contributes to the
+// value of NewBase in the case that it's a diffferent instruction from
+// the PHI that NewBase is computed from, or null otherwise.
+//
+void BasedUser::RewriteInstructionToUseNewBase(const SCEVHandle &NewBase,
+                                               Instruction *NewBasePt,
+                                      SCEVExpander &Rewriter, Loop *L, Pass *P,
+                                      LoopInfo &LI,
+                                      SmallVectorImpl<WeakVH> &DeadInsts) {
+  if (!isa<PHINode>(Inst)) {
+    // By default, insert code at the user instruction.
+    BasicBlock::iterator InsertPt = Inst;
+    
+    // However, if the Operand is itself an instruction, the (potentially
+    // complex) inserted code may be shared by many users.  Because of this, we
+    // want to emit code for the computation of the operand right before its old
+    // computation.  This is usually safe, because we obviously used to use the
+    // computation when it was computed in its current block.  However, in some
+    // cases (e.g. use of a post-incremented induction variable) the NewBase
+    // value will be pinned to live somewhere after the original computation.
+    // In this case, we have to back off.
+    //
+    // If this is a use outside the loop (which means after, since it is based
+    // on a loop indvar) we use the post-incremented value, so that we don't
+    // artificially make the preinc value live out the bottom of the loop. 
+    if (!isUseOfPostIncrementedValue && L->contains(Inst->getParent())) {
+      if (NewBasePt && isa<PHINode>(OperandValToReplace)) {
+        InsertPt = NewBasePt;
+        ++InsertPt;
+      } else if (Instruction *OpInst
+                 = dyn_cast<Instruction>(OperandValToReplace)) {
+        InsertPt = OpInst;
+        while (isa<PHINode>(InsertPt)) ++InsertPt;
+      }
+    }
+    Value *NewVal = InsertCodeForBaseAtPosition(NewBase,
+                                                OperandValToReplace->getType(),
+                                                Rewriter, InsertPt, L, LI);
+    // Replace the use of the operand Value with the new Phi we just created.
+    Inst->replaceUsesOfWith(OperandValToReplace, NewVal);
+
+    DOUT << "      Replacing with ";
+    DEBUG(WriteAsOperand(*DOUT, NewVal, /*PrintType=*/false));
+    DOUT << ", which has value " << *NewBase << " plus IMM " << *Imm << "\n";
+    return;
+  }
+
+  // PHI nodes are more complex.  We have to insert one copy of the NewBase+Imm
+  // expression into each operand block that uses it.  Note that PHI nodes can
+  // have multiple entries for the same predecessor.  We use a map to make sure
+  // that a PHI node only has a single Value* for each predecessor (which also
+  // prevents us from inserting duplicate code in some blocks).
+  DenseMap<BasicBlock*, Value*> InsertedCode;
+  PHINode *PN = cast<PHINode>(Inst);
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+    if (PN->getIncomingValue(i) == OperandValToReplace) {
+      // If the original expression is outside the loop, put the replacement
+      // code in the same place as the original expression,
+      // which need not be an immediate predecessor of this PHI.  This way we 
+      // need only one copy of it even if it is referenced multiple times in
+      // the PHI.  We don't do this when the original expression is inside the
+      // loop because multiple copies sometimes do useful sinking of code in
+      // that case(?).
+      Instruction *OldLoc = dyn_cast<Instruction>(OperandValToReplace);
+      if (L->contains(OldLoc->getParent())) {
+        // If this is a critical edge, split the edge so that we do not insert
+        // the code on all predecessor/successor paths.  We do this unless this
+        // is the canonical backedge for this loop, as this can make some
+        // inserted code be in an illegal position.
+        BasicBlock *PHIPred = PN->getIncomingBlock(i);
+        if (e != 1 && PHIPred->getTerminator()->getNumSuccessors() > 1 &&
+            (PN->getParent() != L->getHeader() || !L->contains(PHIPred))) {
+
+          // First step, split the critical edge.
+          SplitCriticalEdge(PHIPred, PN->getParent(), P, false);
+
+          // Next step: move the basic block.  In particular, if the PHI node
+          // is outside of the loop, and PredTI is in the loop, we want to
+          // move the block to be immediately before the PHI block, not
+          // immediately after PredTI.
+          if (L->contains(PHIPred) && !L->contains(PN->getParent())) {
+            BasicBlock *NewBB = PN->getIncomingBlock(i);
+            NewBB->moveBefore(PN->getParent());
+          }
+
+          // Splitting the edge can reduce the number of PHI entries we have.
+          e = PN->getNumIncomingValues();
+        }
+      }
+      Value *&Code = InsertedCode[PN->getIncomingBlock(i)];
+      if (!Code) {
+        // Insert the code into the end of the predecessor block.
+        Instruction *InsertPt = (L->contains(OldLoc->getParent())) ?
+                                PN->getIncomingBlock(i)->getTerminator() :
+                                OldLoc->getParent()->getTerminator();
+        Code = InsertCodeForBaseAtPosition(NewBase, PN->getType(),
+                                           Rewriter, InsertPt, L, LI);
+
+        DOUT << "      Changing PHI use to ";
+        DEBUG(WriteAsOperand(*DOUT, Code, /*PrintType=*/false));
+        DOUT << ", which has value " << *NewBase << " plus IMM " << *Imm << "\n";
+      }
+
+      // Replace the use of the operand Value with the new Phi we just created.
+      PN->setIncomingValue(i, Code);
+      Rewriter.clear();
+    }
+  }
+
+  // PHI node might have become a constant value after SplitCriticalEdge.
+  DeadInsts.push_back(Inst);
+}
+
+
+/// fitsInAddressMode - Return true if V can be subsumed within an addressing
+/// mode, and does not need to be put in a register first.
+static bool fitsInAddressMode(const SCEVHandle &V, const Type *AccessTy,
+                             const TargetLowering *TLI, bool HasBaseReg) {
+  if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(V)) {
+    int64_t VC = SC->getValue()->getSExtValue();
+    if (TLI) {
+      TargetLowering::AddrMode AM;
+      AM.BaseOffs = VC;
+      AM.HasBaseReg = HasBaseReg;
+      return TLI->isLegalAddressingMode(AM, AccessTy);
+    } else {
+      // Defaults to PPC. PPC allows a sign-extended 16-bit immediate field.
+      return (VC > -(1 << 16) && VC < (1 << 16)-1);
+    }
+  }
+
+  if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(V))
+    if (GlobalValue *GV = dyn_cast<GlobalValue>(SU->getValue())) {
+      if (TLI) {
+        TargetLowering::AddrMode AM;
+        AM.BaseGV = GV;
+        AM.HasBaseReg = HasBaseReg;
+        return TLI->isLegalAddressingMode(AM, AccessTy);
+      } else {
+        // Default: assume global addresses are not legal.
+      }
+    }
+
+  return false;
+}
+
+/// MoveLoopVariantsToImmediateField - Move any subexpressions from Val that are
+/// loop varying to the Imm operand.
+static void MoveLoopVariantsToImmediateField(SCEVHandle &Val, SCEVHandle &Imm,
+                                             Loop *L, ScalarEvolution *SE) {
+  if (Val->isLoopInvariant(L)) return;  // Nothing to do.
+  
+  if (const SCEVAddExpr *SAE = dyn_cast<SCEVAddExpr>(Val)) {
+    std::vector<SCEVHandle> NewOps;
+    NewOps.reserve(SAE->getNumOperands());
+    
+    for (unsigned i = 0; i != SAE->getNumOperands(); ++i)
+      if (!SAE->getOperand(i)->isLoopInvariant(L)) {
+        // If this is a loop-variant expression, it must stay in the immediate
+        // field of the expression.
+        Imm = SE->getAddExpr(Imm, SAE->getOperand(i));
+      } else {
+        NewOps.push_back(SAE->getOperand(i));
+      }
+
+    if (NewOps.empty())
+      Val = SE->getIntegerSCEV(0, Val->getType());
+    else
+      Val = SE->getAddExpr(NewOps);
+  } else if (const SCEVAddRecExpr *SARE = dyn_cast<SCEVAddRecExpr>(Val)) {
+    // Try to pull immediates out of the start value of nested addrec's.
+    SCEVHandle Start = SARE->getStart();
+    MoveLoopVariantsToImmediateField(Start, Imm, L, SE);
+    
+    std::vector<SCEVHandle> Ops(SARE->op_begin(), SARE->op_end());
+    Ops[0] = Start;
+    Val = SE->getAddRecExpr(Ops, SARE->getLoop());
+  } else {
+    // Otherwise, all of Val is variant, move the whole thing over.
+    Imm = SE->getAddExpr(Imm, Val);
+    Val = SE->getIntegerSCEV(0, Val->getType());
+  }
+}
+
+
+/// MoveImmediateValues - Look at Val, and pull out any additions of constants
+/// that can fit into the immediate field of instructions in the target.
+/// Accumulate these immediate values into the Imm value.
+static void MoveImmediateValues(const TargetLowering *TLI,
+                                const Type *AccessTy,
+                                SCEVHandle &Val, SCEVHandle &Imm,
+                                bool isAddress, Loop *L,
+                                ScalarEvolution *SE) {
+  if (const SCEVAddExpr *SAE = dyn_cast<SCEVAddExpr>(Val)) {
+    std::vector<SCEVHandle> NewOps;
+    NewOps.reserve(SAE->getNumOperands());
+    
+    for (unsigned i = 0; i != SAE->getNumOperands(); ++i) {
+      SCEVHandle NewOp = SAE->getOperand(i);
+      MoveImmediateValues(TLI, AccessTy, NewOp, Imm, isAddress, L, SE);
+      
+      if (!NewOp->isLoopInvariant(L)) {
+        // If this is a loop-variant expression, it must stay in the immediate
+        // field of the expression.
+        Imm = SE->getAddExpr(Imm, NewOp);
+      } else {
+        NewOps.push_back(NewOp);
+      }
+    }
+
+    if (NewOps.empty())
+      Val = SE->getIntegerSCEV(0, Val->getType());
+    else
+      Val = SE->getAddExpr(NewOps);
+    return;
+  } else if (const SCEVAddRecExpr *SARE = dyn_cast<SCEVAddRecExpr>(Val)) {
+    // Try to pull immediates out of the start value of nested addrec's.
+    SCEVHandle Start = SARE->getStart();
+    MoveImmediateValues(TLI, AccessTy, Start, Imm, isAddress, L, SE);
+    
+    if (Start != SARE->getStart()) {
+      std::vector<SCEVHandle> Ops(SARE->op_begin(), SARE->op_end());
+      Ops[0] = Start;
+      Val = SE->getAddRecExpr(Ops, SARE->getLoop());
+    }
+    return;
+  } else if (const SCEVMulExpr *SME = dyn_cast<SCEVMulExpr>(Val)) {
+    // Transform "8 * (4 + v)" -> "32 + 8*V" if "32" fits in the immed field.
+    if (isAddress &&
+        fitsInAddressMode(SME->getOperand(0), AccessTy, TLI, false) &&
+        SME->getNumOperands() == 2 && SME->isLoopInvariant(L)) {
+
+      SCEVHandle SubImm = SE->getIntegerSCEV(0, Val->getType());
+      SCEVHandle NewOp = SME->getOperand(1);
+      MoveImmediateValues(TLI, AccessTy, NewOp, SubImm, isAddress, L, SE);
+      
+      // If we extracted something out of the subexpressions, see if we can 
+      // simplify this!
+      if (NewOp != SME->getOperand(1)) {
+        // Scale SubImm up by "8".  If the result is a target constant, we are
+        // good.
+        SubImm = SE->getMulExpr(SubImm, SME->getOperand(0));
+        if (fitsInAddressMode(SubImm, AccessTy, TLI, false)) {
+          // Accumulate the immediate.
+          Imm = SE->getAddExpr(Imm, SubImm);
+          
+          // Update what is left of 'Val'.
+          Val = SE->getMulExpr(SME->getOperand(0), NewOp);
+          return;
+        }
+      }
+    }
+  }
+
+  // Loop-variant expressions must stay in the immediate field of the
+  // expression.
+  if ((isAddress && fitsInAddressMode(Val, AccessTy, TLI, false)) ||
+      !Val->isLoopInvariant(L)) {
+    Imm = SE->getAddExpr(Imm, Val);
+    Val = SE->getIntegerSCEV(0, Val->getType());
+    return;
+  }
+
+  // Otherwise, no immediates to move.
+}
+
+static void MoveImmediateValues(const TargetLowering *TLI,
+                                Instruction *User,
+                                SCEVHandle &Val, SCEVHandle &Imm,
+                                bool isAddress, Loop *L,
+                                ScalarEvolution *SE) {
+  const Type *AccessTy = getAccessType(User);
+  MoveImmediateValues(TLI, AccessTy, Val, Imm, isAddress, L, SE);
+}
+
+/// SeparateSubExprs - Decompose Expr into all of the subexpressions that are
+/// added together.  This is used to reassociate common addition subexprs
+/// together for maximal sharing when rewriting bases.
+static void SeparateSubExprs(std::vector<SCEVHandle> &SubExprs,
+                             SCEVHandle Expr,
+                             ScalarEvolution *SE) {
+  if (const SCEVAddExpr *AE = dyn_cast<SCEVAddExpr>(Expr)) {
+    for (unsigned j = 0, e = AE->getNumOperands(); j != e; ++j)
+      SeparateSubExprs(SubExprs, AE->getOperand(j), SE);
+  } else if (const SCEVAddRecExpr *SARE = dyn_cast<SCEVAddRecExpr>(Expr)) {
+    SCEVHandle Zero = SE->getIntegerSCEV(0, Expr->getType());
+    if (SARE->getOperand(0) == Zero) {
+      SubExprs.push_back(Expr);
+    } else {
+      // Compute the addrec with zero as its base.
+      std::vector<SCEVHandle> Ops(SARE->op_begin(), SARE->op_end());
+      Ops[0] = Zero;   // Start with zero base.
+      SubExprs.push_back(SE->getAddRecExpr(Ops, SARE->getLoop()));
+      
+
+      SeparateSubExprs(SubExprs, SARE->getOperand(0), SE);
+    }
+  } else if (!Expr->isZero()) {
+    // Do not add zero.
+    SubExprs.push_back(Expr);
+  }
+}
+
+// This is logically local to the following function, but C++ says we have 
+// to make it file scope.
+struct SubExprUseData { unsigned Count; bool notAllUsesAreFree; };
+
+/// RemoveCommonExpressionsFromUseBases - Look through all of the Bases of all
+/// the Uses, removing any common subexpressions, except that if all such
+/// subexpressions can be folded into an addressing mode for all uses inside
+/// the loop (this case is referred to as "free" in comments herein) we do
+/// not remove anything.  This looks for things like (a+b+c) and
+/// (a+c+d) and computes the common (a+c) subexpression.  The common expression
+/// is *removed* from the Bases and returned.
+static SCEVHandle 
+RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
+                                    ScalarEvolution *SE, Loop *L,
+                                    const TargetLowering *TLI) {
+  unsigned NumUses = Uses.size();
+
+  // Only one use?  This is a very common case, so we handle it specially and
+  // cheaply.
+  SCEVHandle Zero = SE->getIntegerSCEV(0, Uses[0].Base->getType());
+  SCEVHandle Result = Zero;
+  SCEVHandle FreeResult = Zero;
+  if (NumUses == 1) {
+    // If the use is inside the loop, use its base, regardless of what it is:
+    // it is clearly shared across all the IV's.  If the use is outside the loop
+    // (which means after it) we don't want to factor anything *into* the loop,
+    // so just use 0 as the base.
+    if (L->contains(Uses[0].Inst->getParent()))
+      std::swap(Result, Uses[0].Base);
+    return Result;
+  }
+
+  // To find common subexpressions, count how many of Uses use each expression.
+  // If any subexpressions are used Uses.size() times, they are common.
+  // Also track whether all uses of each expression can be moved into an
+  // an addressing mode "for free"; such expressions are left within the loop.
+  // struct SubExprUseData { unsigned Count; bool notAllUsesAreFree; };
+  std::map<SCEVHandle, SubExprUseData> SubExpressionUseData;
+  
+  // UniqueSubExprs - Keep track of all of the subexpressions we see in the
+  // order we see them.
+  std::vector<SCEVHandle> UniqueSubExprs;
+
+  std::vector<SCEVHandle> SubExprs;
+  unsigned NumUsesInsideLoop = 0;
+  for (unsigned i = 0; i != NumUses; ++i) {
+    // If the user is outside the loop, just ignore it for base computation.
+    // Since the user is outside the loop, it must be *after* the loop (if it
+    // were before, it could not be based on the loop IV).  We don't want users
+    // after the loop to affect base computation of values *inside* the loop,
+    // because we can always add their offsets to the result IV after the loop
+    // is done, ensuring we get good code inside the loop.
+    if (!L->contains(Uses[i].Inst->getParent()))
+      continue;
+    NumUsesInsideLoop++;
+    
+    // If the base is zero (which is common), return zero now, there are no
+    // CSEs we can find.
+    if (Uses[i].Base == Zero) return Zero;
+
+    // If this use is as an address we may be able to put CSEs in the addressing
+    // mode rather than hoisting them.
+    bool isAddrUse = isAddressUse(Uses[i].Inst, Uses[i].OperandValToReplace);
+    // We may need the AccessTy below, but only when isAddrUse, so compute it
+    // only in that case.
+    const Type *AccessTy = 0;
+    if (isAddrUse)
+      AccessTy = getAccessType(Uses[i].Inst);
+
+    // Split the expression into subexprs.
+    SeparateSubExprs(SubExprs, Uses[i].Base, SE);
+    // Add one to SubExpressionUseData.Count for each subexpr present, and
+    // if the subexpr is not a valid immediate within an addressing mode use,
+    // set SubExpressionUseData.notAllUsesAreFree.  We definitely want to
+    // hoist these out of the loop (if they are common to all uses).
+    for (unsigned j = 0, e = SubExprs.size(); j != e; ++j) {
+      if (++SubExpressionUseData[SubExprs[j]].Count == 1)
+        UniqueSubExprs.push_back(SubExprs[j]);
+      if (!isAddrUse || !fitsInAddressMode(SubExprs[j], AccessTy, TLI, false))
+        SubExpressionUseData[SubExprs[j]].notAllUsesAreFree = true;
+    }
+    SubExprs.clear();
+  }
+
+  // Now that we know how many times each is used, build Result.  Iterate over
+  // UniqueSubexprs so that we have a stable ordering.
+  for (unsigned i = 0, e = UniqueSubExprs.size(); i != e; ++i) {
+    std::map<SCEVHandle, SubExprUseData>::iterator I = 
+       SubExpressionUseData.find(UniqueSubExprs[i]);
+    assert(I != SubExpressionUseData.end() && "Entry not found?");
+    if (I->second.Count == NumUsesInsideLoop) { // Found CSE! 
+      if (I->second.notAllUsesAreFree)
+        Result = SE->getAddExpr(Result, I->first);
+      else 
+        FreeResult = SE->getAddExpr(FreeResult, I->first);
+    } else
+      // Remove non-cse's from SubExpressionUseData.
+      SubExpressionUseData.erase(I);
+  }
+
+  if (FreeResult != Zero) {
+    // We have some subexpressions that can be subsumed into addressing
+    // modes in every use inside the loop.  However, it's possible that
+    // there are so many of them that the combined FreeResult cannot
+    // be subsumed, or that the target cannot handle both a FreeResult
+    // and a Result in the same instruction (for example because it would
+    // require too many registers).  Check this.
+    for (unsigned i=0; i<NumUses; ++i) {
+      if (!L->contains(Uses[i].Inst->getParent()))
+        continue;
+      // We know this is an addressing mode use; if there are any uses that
+      // are not, FreeResult would be Zero.
+      const Type *AccessTy = getAccessType(Uses[i].Inst);
+      if (!fitsInAddressMode(FreeResult, AccessTy, TLI, Result!=Zero)) {
+        // FIXME:  could split up FreeResult into pieces here, some hoisted
+        // and some not.  There is no obvious advantage to this.
+        Result = SE->getAddExpr(Result, FreeResult);
+        FreeResult = Zero;
+        break;
+      }
+    }
+  }
+
+  // If we found no CSE's, return now.
+  if (Result == Zero) return Result;
+  
+  // If we still have a FreeResult, remove its subexpressions from
+  // SubExpressionUseData.  This means they will remain in the use Bases.
+  if (FreeResult != Zero) {
+    SeparateSubExprs(SubExprs, FreeResult, SE);
+    for (unsigned j = 0, e = SubExprs.size(); j != e; ++j) {
+      std::map<SCEVHandle, SubExprUseData>::iterator I = 
+         SubExpressionUseData.find(SubExprs[j]);
+      SubExpressionUseData.erase(I);
+    }
+    SubExprs.clear();
+  }
+
+  // Otherwise, remove all of the CSE's we found from each of the base values.
+  for (unsigned i = 0; i != NumUses; ++i) {
+    // Uses outside the loop don't necessarily include the common base, but
+    // the final IV value coming into those uses does.  Instead of trying to
+    // remove the pieces of the common base, which might not be there,
+    // subtract off the base to compensate for this.
+    if (!L->contains(Uses[i].Inst->getParent())) {
+      Uses[i].Base = SE->getMinusSCEV(Uses[i].Base, Result);
+      continue;
+    }
+
+    // Split the expression into subexprs.
+    SeparateSubExprs(SubExprs, Uses[i].Base, SE);
+
+    // Remove any common subexpressions.
+    for (unsigned j = 0, e = SubExprs.size(); j != e; ++j)
+      if (SubExpressionUseData.count(SubExprs[j])) {
+        SubExprs.erase(SubExprs.begin()+j);
+        --j; --e;
+      }
+    
+    // Finally, add the non-shared expressions together.
+    if (SubExprs.empty())
+      Uses[i].Base = Zero;
+    else
+      Uses[i].Base = SE->getAddExpr(SubExprs);
+    SubExprs.clear();
+  }
+ 
+  return Result;
+}
+
+/// ValidScale - Check whether the given Scale is valid for all loads and 
+/// stores in UsersToProcess.
+///
+bool LoopStrengthReduce::ValidScale(bool HasBaseReg, int64_t Scale,
+                               const std::vector<BasedUser>& UsersToProcess) {
+  if (!TLI)
+    return true;
+
+  for (unsigned i = 0, e = UsersToProcess.size(); i!=e; ++i) {
+    // If this is a load or other access, pass the type of the access in.
+    const Type *AccessTy = Type::VoidTy;
+    if (isAddressUse(UsersToProcess[i].Inst,
+                     UsersToProcess[i].OperandValToReplace))
+      AccessTy = getAccessType(UsersToProcess[i].Inst);
+    else if (isa<PHINode>(UsersToProcess[i].Inst))
+      continue;
+    
+    TargetLowering::AddrMode AM;
+    if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(UsersToProcess[i].Imm))
+      AM.BaseOffs = SC->getValue()->getSExtValue();
+    AM.HasBaseReg = HasBaseReg || !UsersToProcess[i].Base->isZero();
+    AM.Scale = Scale;
+
+    // If load[imm+r*scale] is illegal, bail out.
+    if (!TLI->isLegalAddressingMode(AM, AccessTy))
+      return false;
+  }
+  return true;
+}
+
+/// ValidOffset - Check whether the given Offset is valid for all loads and
+/// stores in UsersToProcess.
+///
+bool LoopStrengthReduce::ValidOffset(bool HasBaseReg,
+                               int64_t Offset,
+                               int64_t Scale,
+                               const std::vector<BasedUser>& UsersToProcess) {
+  if (!TLI)
+    return true;
+
+  for (unsigned i=0, e = UsersToProcess.size(); i!=e; ++i) {
+    // If this is a load or other access, pass the type of the access in.
+    const Type *AccessTy = Type::VoidTy;
+    if (isAddressUse(UsersToProcess[i].Inst,
+                     UsersToProcess[i].OperandValToReplace))
+      AccessTy = getAccessType(UsersToProcess[i].Inst);
+    else if (isa<PHINode>(UsersToProcess[i].Inst))
+      continue;
+
+    TargetLowering::AddrMode AM;
+    if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(UsersToProcess[i].Imm))
+      AM.BaseOffs = SC->getValue()->getSExtValue();
+    AM.BaseOffs = (uint64_t)AM.BaseOffs + (uint64_t)Offset;
+    AM.HasBaseReg = HasBaseReg || !UsersToProcess[i].Base->isZero();
+    AM.Scale = Scale;
+
+    // If load[imm+r*scale] is illegal, bail out.
+    if (!TLI->isLegalAddressingMode(AM, AccessTy))
+      return false;
+  }
+  return true;
+}
+
+/// RequiresTypeConversion - Returns true if converting Ty1 to Ty2 is not
+/// a nop.
+bool LoopStrengthReduce::RequiresTypeConversion(const Type *Ty1,
+                                                const Type *Ty2) {
+  if (Ty1 == Ty2)
+    return false;
+  Ty1 = SE->getEffectiveSCEVType(Ty1);
+  Ty2 = SE->getEffectiveSCEVType(Ty2);
+  if (Ty1 == Ty2)
+    return false;
+  if (Ty1->canLosslesslyBitCastTo(Ty2))
+    return false;
+  if (TLI && TLI->isTruncateFree(Ty1, Ty2))
+    return false;
+  return true;
+}
+
+/// CheckForIVReuse - Returns the multiple if the stride is the multiple
+/// of a previous stride and it is a legal value for the target addressing
+/// mode scale component and optional base reg. This allows the users of
+/// this stride to be rewritten as prev iv * factor. It returns 0 if no
+/// reuse is possible.  Factors can be negative on same targets, e.g. ARM.
+///
+/// If all uses are outside the loop, we don't require that all multiplies
+/// be folded into the addressing mode, nor even that the factor be constant; 
+/// a multiply (executed once) outside the loop is better than another IV 
+/// within.  Well, usually.
+SCEVHandle LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg,
+                                bool AllUsesAreAddresses,
+                                bool AllUsesAreOutsideLoop,
+                                const SCEVHandle &Stride, 
+                                IVExpr &IV, const Type *Ty,
+                                const std::vector<BasedUser>& UsersToProcess) {
+  if (StrideNoReuse.count(Stride))
+    return SE->getIntegerSCEV(0, Stride->getType());
+
+  if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Stride)) {
+    int64_t SInt = SC->getValue()->getSExtValue();
+    for (unsigned NewStride = 0, e = IU->StrideOrder.size();
+         NewStride != e; ++NewStride) {
+      std::map<SCEVHandle, IVsOfOneStride>::iterator SI = 
+                IVsByStride.find(IU->StrideOrder[NewStride]);
+      if (SI == IVsByStride.end() || !isa<SCEVConstant>(SI->first) ||
+          StrideNoReuse.count(SI->first))
+        continue;
+      int64_t SSInt = cast<SCEVConstant>(SI->first)->getValue()->getSExtValue();
+      if (SI->first != Stride &&
+          (unsigned(abs64(SInt)) < SSInt || (SInt % SSInt) != 0))
+        continue;
+      int64_t Scale = SInt / SSInt;
+      // Check that this stride is valid for all the types used for loads and
+      // stores; if it can be used for some and not others, we might as well use
+      // the original stride everywhere, since we have to create the IV for it
+      // anyway. If the scale is 1, then we don't need to worry about folding
+      // multiplications.
+      if (Scale == 1 ||
+          (AllUsesAreAddresses &&
+           ValidScale(HasBaseReg, Scale, UsersToProcess))) {
+        // Prefer to reuse an IV with a base of zero.
+        for (std::vector<IVExpr>::iterator II = SI->second.IVs.begin(),
+               IE = SI->second.IVs.end(); II != IE; ++II)
+          // Only reuse previous IV if it would not require a type conversion
+          // and if the base difference can be folded.
+          if (II->Base->isZero() &&
+              !RequiresTypeConversion(II->Base->getType(), Ty)) {
+            IV = *II;
+            return SE->getIntegerSCEV(Scale, Stride->getType());
+          }
+        // Otherwise, settle for an IV with a foldable base.
+        if (AllUsesAreAddresses)
+          for (std::vector<IVExpr>::iterator II = SI->second.IVs.begin(),
+                 IE = SI->second.IVs.end(); II != IE; ++II)
+            // Only reuse previous IV if it would not require a type conversion
+            // and if the base difference can be folded.
+            if (SE->getEffectiveSCEVType(II->Base->getType()) ==
+                SE->getEffectiveSCEVType(Ty) &&
+                isa<SCEVConstant>(II->Base)) {
+              int64_t Base =
+                cast<SCEVConstant>(II->Base)->getValue()->getSExtValue();
+              if (Base > INT32_MIN && Base <= INT32_MAX &&
+                  ValidOffset(HasBaseReg, -Base * Scale,
+                              Scale, UsersToProcess)) {
+                IV = *II;
+                return SE->getIntegerSCEV(Scale, Stride->getType());
+              }
+            }
+      }
+    }
+  } else if (AllUsesAreOutsideLoop) {
+    // Accept nonconstant strides here; it is really really right to substitute
+    // an existing IV if we can.
+    for (unsigned NewStride = 0, e = IU->StrideOrder.size();
+         NewStride != e; ++NewStride) {
+      std::map<SCEVHandle, IVsOfOneStride>::iterator SI = 
+                IVsByStride.find(IU->StrideOrder[NewStride]);
+      if (SI == IVsByStride.end() || !isa<SCEVConstant>(SI->first))
+        continue;
+      int64_t SSInt = cast<SCEVConstant>(SI->first)->getValue()->getSExtValue();
+      if (SI->first != Stride && SSInt != 1)
+        continue;
+      for (std::vector<IVExpr>::iterator II = SI->second.IVs.begin(),
+             IE = SI->second.IVs.end(); II != IE; ++II)
+        // Accept nonzero base here.
+        // Only reuse previous IV if it would not require a type conversion.
+        if (!RequiresTypeConversion(II->Base->getType(), Ty)) {
+          IV = *II;
+          return Stride;
+        }
+    }
+    // Special case, old IV is -1*x and this one is x.  Can treat this one as
+    // -1*old.
+    for (unsigned NewStride = 0, e = IU->StrideOrder.size();
+         NewStride != e; ++NewStride) {
+      std::map<SCEVHandle, IVsOfOneStride>::iterator SI = 
+                IVsByStride.find(IU->StrideOrder[NewStride]);
+      if (SI == IVsByStride.end()) 
+        continue;
+      if (const SCEVMulExpr *ME = dyn_cast<SCEVMulExpr>(SI->first))
+        if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(ME->getOperand(0)))
+          if (Stride == ME->getOperand(1) &&
+              SC->getValue()->getSExtValue() == -1LL)
+            for (std::vector<IVExpr>::iterator II = SI->second.IVs.begin(),
+                   IE = SI->second.IVs.end(); II != IE; ++II)
+              // Accept nonzero base here.
+              // Only reuse previous IV if it would not require type conversion.
+              if (!RequiresTypeConversion(II->Base->getType(), Ty)) {
+                IV = *II;
+                return SE->getIntegerSCEV(-1LL, Stride->getType());
+              }
+    }
+  }
+  return SE->getIntegerSCEV(0, Stride->getType());
+}
+
+/// PartitionByIsUseOfPostIncrementedValue - Simple boolean predicate that
+/// returns true if Val's isUseOfPostIncrementedValue is true.
+static bool PartitionByIsUseOfPostIncrementedValue(const BasedUser &Val) {
+  return Val.isUseOfPostIncrementedValue;
+}
+
+/// isNonConstantNegative - Return true if the specified scev is negated, but
+/// not a constant.
+static bool isNonConstantNegative(const SCEVHandle &Expr) {
+  const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(Expr);
+  if (!Mul) return false;
+  
+  // If there is a constant factor, it will be first.
+  const SCEVConstant *SC = dyn_cast<SCEVConstant>(Mul->getOperand(0));
+  if (!SC) return false;
+  
+  // Return true if the value is negative, this matches things like (-42 * V).
+  return SC->getValue()->getValue().isNegative();
+}
+
+// CollectIVUsers - Transform our list of users and offsets to a bit more
+// complex table. In this new vector, each 'BasedUser' contains 'Base', the base
+// of the strided accesses, as well as the old information from Uses. We
+// progressively move information from the Base field to the Imm field, until
+// we eventually have the full access expression to rewrite the use.
+SCEVHandle LoopStrengthReduce::CollectIVUsers(const SCEVHandle &Stride,
+                                              IVUsersOfOneStride &Uses,
+                                              Loop *L,
+                                              bool &AllUsesAreAddresses,
+                                              bool &AllUsesAreOutsideLoop,
+                                       std::vector<BasedUser> &UsersToProcess) {
+  // FIXME: Generalize to non-affine IV's.
+  if (!Stride->isLoopInvariant(L))
+    return SE->getIntegerSCEV(0, Stride->getType());
+
+  UsersToProcess.reserve(Uses.Users.size());
+  for (ilist<IVStrideUse>::iterator I = Uses.Users.begin(),
+       E = Uses.Users.end(); I != E; ++I) {
+    UsersToProcess.push_back(BasedUser(*I, SE));
+
+    // Move any loop variant operands from the offset field to the immediate
+    // field of the use, so that we don't try to use something before it is
+    // computed.
+    MoveLoopVariantsToImmediateField(UsersToProcess.back().Base,
+                                     UsersToProcess.back().Imm, L, SE);
+    assert(UsersToProcess.back().Base->isLoopInvariant(L) &&
+           "Base value is not loop invariant!");
+  }
+
+  // We now have a whole bunch of uses of like-strided induction variables, but
+  // they might all have different bases.  We want to emit one PHI node for this
+  // stride which we fold as many common expressions (between the IVs) into as
+  // possible.  Start by identifying the common expressions in the base values 
+  // for the strides (e.g. if we have "A+C+B" and "A+B+D" as our bases, find
+  // "A+B"), emit it to the preheader, then remove the expression from the
+  // UsersToProcess base values.
+  SCEVHandle CommonExprs =
+    RemoveCommonExpressionsFromUseBases(UsersToProcess, SE, L, TLI);
+
+  // Next, figure out what we can represent in the immediate fields of
+  // instructions.  If we can represent anything there, move it to the imm
+  // fields of the BasedUsers.  We do this so that it increases the commonality
+  // of the remaining uses.
+  unsigned NumPHI = 0;
+  bool HasAddress = false;
+  for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) {
+    // If the user is not in the current loop, this means it is using the exit
+    // value of the IV.  Do not put anything in the base, make sure it's all in
+    // the immediate field to allow as much factoring as possible.
+    if (!L->contains(UsersToProcess[i].Inst->getParent())) {
+      UsersToProcess[i].Imm = SE->getAddExpr(UsersToProcess[i].Imm,
+                                             UsersToProcess[i].Base);
+      UsersToProcess[i].Base = 
+        SE->getIntegerSCEV(0, UsersToProcess[i].Base->getType());
+    } else {
+      // Not all uses are outside the loop.
+      AllUsesAreOutsideLoop = false; 
+
+      // Addressing modes can be folded into loads and stores.  Be careful that
+      // the store is through the expression, not of the expression though.
+      bool isPHI = false;
+      bool isAddress = isAddressUse(UsersToProcess[i].Inst,
+                                    UsersToProcess[i].OperandValToReplace);
+      if (isa<PHINode>(UsersToProcess[i].Inst)) {
+        isPHI = true;
+        ++NumPHI;
+      }
+
+      if (isAddress)
+        HasAddress = true;
+     
+      // If this use isn't an address, then not all uses are addresses.
+      if (!isAddress && !isPHI)
+        AllUsesAreAddresses = false;
+      
+      MoveImmediateValues(TLI, UsersToProcess[i].Inst, UsersToProcess[i].Base,
+                          UsersToProcess[i].Imm, isAddress, L, SE);
+    }
+  }
+
+  // If one of the use is a PHI node and all other uses are addresses, still
+  // allow iv reuse. Essentially we are trading one constant multiplication
+  // for one fewer iv.
+  if (NumPHI > 1)
+    AllUsesAreAddresses = false;
+    
+  // There are no in-loop address uses.
+  if (AllUsesAreAddresses && (!HasAddress && !AllUsesAreOutsideLoop))
+    AllUsesAreAddresses = false;
+
+  return CommonExprs;
+}
+
+/// ShouldUseFullStrengthReductionMode - Test whether full strength-reduction
+/// is valid and profitable for the given set of users of a stride. In
+/// full strength-reduction mode, all addresses at the current stride are
+/// strength-reduced all the way down to pointer arithmetic.
+///
+bool LoopStrengthReduce::ShouldUseFullStrengthReductionMode(
+                                   const std::vector<BasedUser> &UsersToProcess,
+                                   const Loop *L,
+                                   bool AllUsesAreAddresses,
+                                   SCEVHandle Stride) {
+  if (!EnableFullLSRMode)
+    return false;
+
+  // The heuristics below aim to avoid increasing register pressure, but
+  // fully strength-reducing all the addresses increases the number of
+  // add instructions, so don't do this when optimizing for size.
+  // TODO: If the loop is large, the savings due to simpler addresses
+  // may oughtweight the costs of the extra increment instructions.
+  if (L->getHeader()->getParent()->hasFnAttr(Attribute::OptimizeForSize))
+    return false;
+
+  // TODO: For now, don't do full strength reduction if there could
+  // potentially be greater-stride multiples of the current stride
+  // which could reuse the current stride IV.
+  if (IU->StrideOrder.back() != Stride)
+    return false;
+
+  // Iterate through the uses to find conditions that automatically rule out
+  // full-lsr mode.
+  for (unsigned i = 0, e = UsersToProcess.size(); i != e; ) {
+    const SCEV *Base = UsersToProcess[i].Base;
+    const SCEV *Imm = UsersToProcess[i].Imm;
+    // If any users have a loop-variant component, they can't be fully
+    // strength-reduced.
+    if (Imm && !Imm->isLoopInvariant(L))
+      return false;
+    // If there are to users with the same base and the difference between
+    // the two Imm values can't be folded into the address, full
+    // strength reduction would increase register pressure.
+    do {
+      const SCEV *CurImm = UsersToProcess[i].Imm;
+      if ((CurImm || Imm) && CurImm != Imm) {
+        if (!CurImm) CurImm = SE->getIntegerSCEV(0, Stride->getType());
+        if (!Imm)       Imm = SE->getIntegerSCEV(0, Stride->getType());
+        const Instruction *Inst = UsersToProcess[i].Inst;
+        const Type *AccessTy = getAccessType(Inst);
+        SCEVHandle Diff = SE->getMinusSCEV(UsersToProcess[i].Imm, Imm);
+        if (!Diff->isZero() &&
+            (!AllUsesAreAddresses ||
+             !fitsInAddressMode(Diff, AccessTy, TLI, /*HasBaseReg=*/true)))
+          return false;
+      }
+    } while (++i != e && Base == UsersToProcess[i].Base);
+  }
+
+  // If there's exactly one user in this stride, fully strength-reducing it
+  // won't increase register pressure. If it's starting from a non-zero base,
+  // it'll be simpler this way.
+  if (UsersToProcess.size() == 1 && !UsersToProcess[0].Base->isZero())
+    return true;
+
+  // Otherwise, if there are any users in this stride that don't require
+  // a register for their base, full strength-reduction will increase
+  // register pressure.
+  for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i)
+    if (UsersToProcess[i].Base->isZero())
+      return false;
+
+  // Otherwise, go for it.
+  return true;
+}
+
+/// InsertAffinePhi Create and insert a PHI node for an induction variable
+/// with the specified start and step values in the specified loop.
+///
+/// If NegateStride is true, the stride should be negated by using a
+/// subtract instead of an add.
+///
+/// Return the created phi node.
+///
+static PHINode *InsertAffinePhi(SCEVHandle Start, SCEVHandle Step,
+                                Instruction *IVIncInsertPt,
+                                const Loop *L,
+                                SCEVExpander &Rewriter) {
+  assert(Start->isLoopInvariant(L) && "New PHI start is not loop invariant!");
+  assert(Step->isLoopInvariant(L) && "New PHI stride is not loop invariant!");
+
+  BasicBlock *Header = L->getHeader();
+  BasicBlock *Preheader = L->getLoopPreheader();
+  BasicBlock *LatchBlock = L->getLoopLatch();
+  const Type *Ty = Start->getType();
+  Ty = Rewriter.SE.getEffectiveSCEVType(Ty);
+
+  PHINode *PN = PHINode::Create(Ty, "lsr.iv", Header->begin());
+  PN->addIncoming(Rewriter.expandCodeFor(Start, Ty, Preheader->getTerminator()),
+                  Preheader);
+
+  // If the stride is negative, insert a sub instead of an add for the
+  // increment.
+  bool isNegative = isNonConstantNegative(Step);
+  SCEVHandle IncAmount = Step;
+  if (isNegative)
+    IncAmount = Rewriter.SE.getNegativeSCEV(Step);
+
+  // Insert an add instruction right before the terminator corresponding
+  // to the back-edge or just before the only use. The location is determined
+  // by the caller and passed in as IVIncInsertPt.
+  Value *StepV = Rewriter.expandCodeFor(IncAmount, Ty,
+                                        Preheader->getTerminator());
+  Instruction *IncV;
+  if (isNegative) {
+    IncV = BinaryOperator::CreateSub(PN, StepV, "lsr.iv.next",
+                                     IVIncInsertPt);
+  } else {
+    IncV = BinaryOperator::CreateAdd(PN, StepV, "lsr.iv.next",
+                                     IVIncInsertPt);
+  }
+  if (!isa<ConstantInt>(StepV)) ++NumVariable;
+
+  PN->addIncoming(IncV, LatchBlock);
+
+  ++NumInserted;
+  return PN;
+}
+
+static void SortUsersToProcess(std::vector<BasedUser> &UsersToProcess) {
+  // We want to emit code for users inside the loop first.  To do this, we
+  // rearrange BasedUser so that the entries at the end have
+  // isUseOfPostIncrementedValue = false, because we pop off the end of the
+  // vector (so we handle them first).
+  std::partition(UsersToProcess.begin(), UsersToProcess.end(),
+                 PartitionByIsUseOfPostIncrementedValue);
+
+  // Sort this by base, so that things with the same base are handled
+  // together.  By partitioning first and stable-sorting later, we are
+  // guaranteed that within each base we will pop off users from within the
+  // loop before users outside of the loop with a particular base.
+  //
+  // We would like to use stable_sort here, but we can't.  The problem is that
+  // SCEVHandle's don't have a deterministic ordering w.r.t to each other, so
+  // we don't have anything to do a '<' comparison on.  Because we think the
+  // number of uses is small, do a horrible bubble sort which just relies on
+  // ==.
+  for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) {
+    // Get a base value.
+    SCEVHandle Base = UsersToProcess[i].Base;
+
+    // Compact everything with this base to be consecutive with this one.
+    for (unsigned j = i+1; j != e; ++j) {
+      if (UsersToProcess[j].Base == Base) {
+        std::swap(UsersToProcess[i+1], UsersToProcess[j]);
+        ++i;
+      }
+    }
+  }
+}
+
+/// PrepareToStrengthReduceFully - Prepare to fully strength-reduce
+/// UsersToProcess, meaning lowering addresses all the way down to direct
+/// pointer arithmetic.
+///
+void
+LoopStrengthReduce::PrepareToStrengthReduceFully(
+                                        std::vector<BasedUser> &UsersToProcess,
+                                        SCEVHandle Stride,
+                                        SCEVHandle CommonExprs,
+                                        const Loop *L,
+                                        SCEVExpander &PreheaderRewriter) {
+  DOUT << "  Fully reducing all users\n";
+
+  // Rewrite the UsersToProcess records, creating a separate PHI for each
+  // unique Base value.
+  Instruction *IVIncInsertPt = L->getLoopLatch()->getTerminator();
+  for (unsigned i = 0, e = UsersToProcess.size(); i != e; ) {
+    // TODO: The uses are grouped by base, but not sorted. We arbitrarily
+    // pick the first Imm value here to start with, and adjust it for the
+    // other uses.
+    SCEVHandle Imm = UsersToProcess[i].Imm;
+    SCEVHandle Base = UsersToProcess[i].Base;
+    SCEVHandle Start = SE->getAddExpr(CommonExprs, Base, Imm);
+    PHINode *Phi = InsertAffinePhi(Start, Stride, IVIncInsertPt, L,
+                                   PreheaderRewriter);
+    // Loop over all the users with the same base.
+    do {
+      UsersToProcess[i].Base = SE->getIntegerSCEV(0, Stride->getType());
+      UsersToProcess[i].Imm = SE->getMinusSCEV(UsersToProcess[i].Imm, Imm);
+      UsersToProcess[i].Phi = Phi;
+      assert(UsersToProcess[i].Imm->isLoopInvariant(L) &&
+             "ShouldUseFullStrengthReductionMode should reject this!");
+    } while (++i != e && Base == UsersToProcess[i].Base);
+  }
+}
+
+/// FindIVIncInsertPt - Return the location to insert the increment instruction.
+/// If the only use if a use of postinc value, (must be the loop termination
+/// condition), then insert it just before the use.
+static Instruction *FindIVIncInsertPt(std::vector<BasedUser> &UsersToProcess,
+                                      const Loop *L) {
+  if (UsersToProcess.size() == 1 &&
+      UsersToProcess[0].isUseOfPostIncrementedValue &&
+      L->contains(UsersToProcess[0].Inst->getParent()))
+    return UsersToProcess[0].Inst;
+  return L->getLoopLatch()->getTerminator();
+}
+
+/// PrepareToStrengthReduceWithNewPhi - Insert a new induction variable for the
+/// given users to share.
+///
+void
+LoopStrengthReduce::PrepareToStrengthReduceWithNewPhi(
+                                         std::vector<BasedUser> &UsersToProcess,
+                                         SCEVHandle Stride,
+                                         SCEVHandle CommonExprs,
+                                         Value *CommonBaseV,
+                                         Instruction *IVIncInsertPt,
+                                         const Loop *L,
+                                         SCEVExpander &PreheaderRewriter) {
+  DOUT << "  Inserting new PHI:\n";
+
+  PHINode *Phi = InsertAffinePhi(SE->getUnknown(CommonBaseV),
+                                 Stride, IVIncInsertPt, L,
+                                 PreheaderRewriter);
+
+  // Remember this in case a later stride is multiple of this.
+  IVsByStride[Stride].addIV(Stride, CommonExprs, Phi);
+
+  // All the users will share this new IV.
+  for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i)
+    UsersToProcess[i].Phi = Phi;
+
+  DOUT << "    IV=";
+  DEBUG(WriteAsOperand(*DOUT, Phi, /*PrintType=*/false));
+  DOUT << "\n";
+}
+
+/// PrepareToStrengthReduceFromSmallerStride - Prepare for the given users to
+/// reuse an induction variable with a stride that is a factor of the current
+/// induction variable.
+///
+void
+LoopStrengthReduce::PrepareToStrengthReduceFromSmallerStride(
+                                         std::vector<BasedUser> &UsersToProcess,
+                                         Value *CommonBaseV,
+                                         const IVExpr &ReuseIV,
+                                         Instruction *PreInsertPt) {
+  DOUT << "  Rewriting in terms of existing IV of STRIDE " << *ReuseIV.Stride
+       << " and BASE " << *ReuseIV.Base << "\n";
+
+  // All the users will share the reused IV.
+  for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i)
+    UsersToProcess[i].Phi = ReuseIV.PHI;
+
+  Constant *C = dyn_cast<Constant>(CommonBaseV);
+  if (C &&
+      (!C->isNullValue() &&
+       !fitsInAddressMode(SE->getUnknown(CommonBaseV), CommonBaseV->getType(),
+                         TLI, false)))
+    // We want the common base emitted into the preheader! This is just
+    // using cast as a copy so BitCast (no-op cast) is appropriate
+    CommonBaseV = new BitCastInst(CommonBaseV, CommonBaseV->getType(),
+                                  "commonbase", PreInsertPt);
+}
+
+static bool IsImmFoldedIntoAddrMode(GlobalValue *GV, int64_t Offset,
+                                    const Type *AccessTy,
+                                   std::vector<BasedUser> &UsersToProcess,
+                                   const TargetLowering *TLI) {
+  SmallVector<Instruction*, 16> AddrModeInsts;
+  for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) {
+    if (UsersToProcess[i].isUseOfPostIncrementedValue)
+      continue;
+    ExtAddrMode AddrMode =
+      AddressingModeMatcher::Match(UsersToProcess[i].OperandValToReplace,
+                                   AccessTy, UsersToProcess[i].Inst,
+                                   AddrModeInsts, *TLI);
+    if (GV && GV != AddrMode.BaseGV)
+      return false;
+    if (Offset && !AddrMode.BaseOffs)
+      // FIXME: How to accurate check it's immediate offset is folded.
+      return false;
+    AddrModeInsts.clear();
+  }
+  return true;
+}
+
+/// StrengthReduceStridedIVUsers - Strength reduce all of the users of a single
+/// stride of IV.  All of the users may have different starting values, and this
+/// may not be the only stride.
+void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
+                                                      IVUsersOfOneStride &Uses,
+                                                      Loop *L) {
+  // If all the users are moved to another stride, then there is nothing to do.
+  if (Uses.Users.empty())
+    return;
+
+  // Keep track if every use in UsersToProcess is an address. If they all are,
+  // we may be able to rewrite the entire collection of them in terms of a
+  // smaller-stride IV.
+  bool AllUsesAreAddresses = true;
+
+  // Keep track if every use of a single stride is outside the loop.  If so,
+  // we want to be more aggressive about reusing a smaller-stride IV; a
+  // multiply outside the loop is better than another IV inside.  Well, usually.
+  bool AllUsesAreOutsideLoop = true;
+
+  // Transform our list of users and offsets to a bit more complex table.  In
+  // this new vector, each 'BasedUser' contains 'Base' the base of the
+  // strided accessas well as the old information from Uses.  We progressively
+  // move information from the Base field to the Imm field, until we eventually
+  // have the full access expression to rewrite the use.
+  std::vector<BasedUser> UsersToProcess;
+  SCEVHandle CommonExprs = CollectIVUsers(Stride, Uses, L, AllUsesAreAddresses,
+                                          AllUsesAreOutsideLoop,
+                                          UsersToProcess);
+
+  // Sort the UsersToProcess array so that users with common bases are
+  // next to each other.
+  SortUsersToProcess(UsersToProcess);
+
+  // If we managed to find some expressions in common, we'll need to carry
+  // their value in a register and add it in for each use. This will take up
+  // a register operand, which potentially restricts what stride values are
+  // valid.
+  bool HaveCommonExprs = !CommonExprs->isZero();
+  const Type *ReplacedTy = CommonExprs->getType();
+
+  // If all uses are addresses, consider sinking the immediate part of the
+  // common expression back into uses if they can fit in the immediate fields.
+  if (TLI && HaveCommonExprs && AllUsesAreAddresses) {
+    SCEVHandle NewCommon = CommonExprs;
+    SCEVHandle Imm = SE->getIntegerSCEV(0, ReplacedTy);
+    MoveImmediateValues(TLI, Type::VoidTy, NewCommon, Imm, true, L, SE);
+    if (!Imm->isZero()) {
+      bool DoSink = true;
+
+      // If the immediate part of the common expression is a GV, check if it's
+      // possible to fold it into the target addressing mode.
+      GlobalValue *GV = 0;
+      if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(Imm))
+        GV = dyn_cast<GlobalValue>(SU->getValue());
+      int64_t Offset = 0;
+      if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Imm))
+        Offset = SC->getValue()->getSExtValue();
+      if (GV || Offset)
+        // Pass VoidTy as the AccessTy to be conservative, because
+        // there could be multiple access types among all the uses.
+        DoSink = IsImmFoldedIntoAddrMode(GV, Offset, Type::VoidTy,
+                                         UsersToProcess, TLI);
+
+      if (DoSink) {
+        DOUT << "  Sinking " << *Imm << " back down into uses\n";
+        for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i)
+          UsersToProcess[i].Imm = SE->getAddExpr(UsersToProcess[i].Imm, Imm);
+        CommonExprs = NewCommon;
+        HaveCommonExprs = !CommonExprs->isZero();
+        ++NumImmSunk;
+      }
+    }
+  }
+
+  // Now that we know what we need to do, insert the PHI node itself.
+  //
+  DOUT << "LSR: Examining IVs of TYPE " << *ReplacedTy << " of STRIDE "
+       << *Stride << ":\n"
+       << "  Common base: " << *CommonExprs << "\n";
+
+  SCEVExpander Rewriter(*SE);
+  SCEVExpander PreheaderRewriter(*SE);
+
+  BasicBlock  *Preheader = L->getLoopPreheader();
+  Instruction *PreInsertPt = Preheader->getTerminator();
+  BasicBlock *LatchBlock = L->getLoopLatch();
+  Instruction *IVIncInsertPt = LatchBlock->getTerminator();
+
+  Value *CommonBaseV = Constant::getNullValue(ReplacedTy);
+
+  SCEVHandle RewriteFactor = SE->getIntegerSCEV(0, ReplacedTy);
+  IVExpr   ReuseIV(SE->getIntegerSCEV(0, Type::Int32Ty),
+                   SE->getIntegerSCEV(0, Type::Int32Ty),
+                   0);
+
+  /// Choose a strength-reduction strategy and prepare for it by creating
+  /// the necessary PHIs and adjusting the bookkeeping.
+  if (ShouldUseFullStrengthReductionMode(UsersToProcess, L,
+                                         AllUsesAreAddresses, Stride)) {
+    PrepareToStrengthReduceFully(UsersToProcess, Stride, CommonExprs, L,
+                                 PreheaderRewriter);
+  } else {
+    // Emit the initial base value into the loop preheader.
+    CommonBaseV = PreheaderRewriter.expandCodeFor(CommonExprs, ReplacedTy,
+                                                  PreInsertPt);
+
+    // If all uses are addresses, check if it is possible to reuse an IV.  The
+    // new IV must have a stride that is a multiple of the old stride; the
+    // multiple must be a number that can be encoded in the scale field of the
+    // target addressing mode; and we must have a valid instruction after this 
+    // substitution, including the immediate field, if any.
+    RewriteFactor = CheckForIVReuse(HaveCommonExprs, AllUsesAreAddresses,
+                                    AllUsesAreOutsideLoop,
+                                    Stride, ReuseIV, ReplacedTy,
+                                    UsersToProcess);
+    if (!RewriteFactor->isZero())
+      PrepareToStrengthReduceFromSmallerStride(UsersToProcess, CommonBaseV,
+                                               ReuseIV, PreInsertPt);
+    else {
+      IVIncInsertPt = FindIVIncInsertPt(UsersToProcess, L);
+      PrepareToStrengthReduceWithNewPhi(UsersToProcess, Stride, CommonExprs,
+                                        CommonBaseV, IVIncInsertPt,
+                                        L, PreheaderRewriter);
+    }
+  }
+
+  // Process all the users now, replacing their strided uses with
+  // strength-reduced forms.  This outer loop handles all bases, the inner
+  // loop handles all users of a particular base.
+  while (!UsersToProcess.empty()) {
+    SCEVHandle Base = UsersToProcess.back().Base;
+    Instruction *Inst = UsersToProcess.back().Inst;
+
+    // Emit the code for Base into the preheader.
+    Value *BaseV = 0;
+    if (!Base->isZero()) {
+      BaseV = PreheaderRewriter.expandCodeFor(Base, 0, PreInsertPt);
+
+      DOUT << "  INSERTING code for BASE = " << *Base << ":";
+      if (BaseV->hasName())
+        DOUT << " Result value name = %" << BaseV->getNameStr();
+      DOUT << "\n";
+
+      // If BaseV is a non-zero constant, make sure that it gets inserted into
+      // the preheader, instead of being forward substituted into the uses.  We
+      // do this by forcing a BitCast (noop cast) to be inserted into the
+      // preheader in this case.
+      if (!fitsInAddressMode(Base, getAccessType(Inst), TLI, false)) {
+        // We want this constant emitted into the preheader! This is just
+        // using cast as a copy so BitCast (no-op cast) is appropriate
+        BaseV = new BitCastInst(BaseV, BaseV->getType(), "preheaderinsert",
+                                PreInsertPt);       
+      }
+    }
+
+    // Emit the code to add the immediate offset to the Phi value, just before
+    // the instructions that we identified as using this stride and base.
+    do {
+      // FIXME: Use emitted users to emit other users.
+      BasedUser &User = UsersToProcess.back();
+
+      DOUT << "    Examining ";
+      if (User.isUseOfPostIncrementedValue)
+        DOUT << "postinc";
+      else
+        DOUT << "preinc";
+      DOUT << " use ";
+      DEBUG(WriteAsOperand(*DOUT, UsersToProcess.back().OperandValToReplace,
+                           /*PrintType=*/false));
+      DOUT << " in Inst: " << *(User.Inst);
+
+      // If this instruction wants to use the post-incremented value, move it
+      // after the post-inc and use its value instead of the PHI.
+      Value *RewriteOp = User.Phi;
+      if (User.isUseOfPostIncrementedValue) {
+        RewriteOp = User.Phi->getIncomingValueForBlock(LatchBlock);
+        // If this user is in the loop, make sure it is the last thing in the
+        // loop to ensure it is dominated by the increment. In case it's the
+        // only use of the iv, the increment instruction is already before the
+        // use.
+        if (L->contains(User.Inst->getParent()) && User.Inst != IVIncInsertPt)
+          User.Inst->moveBefore(IVIncInsertPt);
+      }
+
+      SCEVHandle RewriteExpr = SE->getUnknown(RewriteOp);
+
+      if (SE->getEffectiveSCEVType(RewriteOp->getType()) !=
+          SE->getEffectiveSCEVType(ReplacedTy)) {
+        assert(SE->getTypeSizeInBits(RewriteOp->getType()) >
+               SE->getTypeSizeInBits(ReplacedTy) &&
+               "Unexpected widening cast!");
+        RewriteExpr = SE->getTruncateExpr(RewriteExpr, ReplacedTy);
+      }
+
+      // If we had to insert new instructions for RewriteOp, we have to
+      // consider that they may not have been able to end up immediately
+      // next to RewriteOp, because non-PHI instructions may never precede
+      // PHI instructions in a block. In this case, remember where the last
+      // instruction was inserted so that if we're replacing a different
+      // PHI node, we can use the later point to expand the final
+      // RewriteExpr.
+      Instruction *NewBasePt = dyn_cast<Instruction>(RewriteOp);
+      if (RewriteOp == User.Phi) NewBasePt = 0;
+
+      // Clear the SCEVExpander's expression map so that we are guaranteed
+      // to have the code emitted where we expect it.
+      Rewriter.clear();
+
+      // If we are reusing the iv, then it must be multiplied by a constant
+      // factor to take advantage of the addressing mode scale component.
+      if (!RewriteFactor->isZero()) {
+        // If we're reusing an IV with a nonzero base (currently this happens
+        // only when all reuses are outside the loop) subtract that base here.
+        // The base has been used to initialize the PHI node but we don't want
+        // it here.
+        if (!ReuseIV.Base->isZero()) {
+          SCEVHandle typedBase = ReuseIV.Base;
+          if (SE->getEffectiveSCEVType(RewriteExpr->getType()) !=
+              SE->getEffectiveSCEVType(ReuseIV.Base->getType())) {
+            // It's possible the original IV is a larger type than the new IV,
+            // in which case we have to truncate the Base.  We checked in
+            // RequiresTypeConversion that this is valid.
+            assert(SE->getTypeSizeInBits(RewriteExpr->getType()) <
+                   SE->getTypeSizeInBits(ReuseIV.Base->getType()) &&
+                   "Unexpected lengthening conversion!");
+            typedBase = SE->getTruncateExpr(ReuseIV.Base, 
+                                            RewriteExpr->getType());
+          }
+          RewriteExpr = SE->getMinusSCEV(RewriteExpr, typedBase);
+        }
+
+        // Multiply old variable, with base removed, by new scale factor.
+        RewriteExpr = SE->getMulExpr(RewriteFactor,
+                                     RewriteExpr);
+
+        // The common base is emitted in the loop preheader. But since we
+        // are reusing an IV, it has not been used to initialize the PHI node.
+        // Add it to the expression used to rewrite the uses.
+        // When this use is outside the loop, we earlier subtracted the
+        // common base, and are adding it back here.  Use the same expression
+        // as before, rather than CommonBaseV, so DAGCombiner will zap it.
+        if (!CommonExprs->isZero()) {
+          if (L->contains(User.Inst->getParent()))
+            RewriteExpr = SE->getAddExpr(RewriteExpr,
+                                       SE->getUnknown(CommonBaseV));
+          else
+            RewriteExpr = SE->getAddExpr(RewriteExpr, CommonExprs);
+        }
+      }
+
+      // Now that we know what we need to do, insert code before User for the
+      // immediate and any loop-variant expressions.
+      if (BaseV)
+        // Add BaseV to the PHI value if needed.
+        RewriteExpr = SE->getAddExpr(RewriteExpr, SE->getUnknown(BaseV));
+
+      User.RewriteInstructionToUseNewBase(RewriteExpr, NewBasePt,
+                                          Rewriter, L, this, *LI,
+                                          DeadInsts);
+
+      // Mark old value we replaced as possibly dead, so that it is eliminated
+      // if we just replaced the last use of that value.
+      DeadInsts.push_back(User.OperandValToReplace);
+
+      UsersToProcess.pop_back();
+      ++NumReduced;
+
+      // If there are any more users to process with the same base, process them
+      // now.  We sorted by base above, so we just have to check the last elt.
+    } while (!UsersToProcess.empty() && UsersToProcess.back().Base == Base);
+    // TODO: Next, find out which base index is the most common, pull it out.
+  }
+
+  // IMPORTANT TODO: Figure out how to partition the IV's with this stride, but
+  // different starting values, into different PHIs.
+}
+
+/// FindIVUserForCond - If Cond has an operand that is an expression of an IV,
+/// set the IV user and stride information and return true, otherwise return
+/// false.
+bool LoopStrengthReduce::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse,
+                                       const SCEVHandle *&CondStride) {
+  for (unsigned Stride = 0, e = IU->StrideOrder.size();
+       Stride != e && !CondUse; ++Stride) {
+    std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
+      IU->IVUsesByStride.find(IU->StrideOrder[Stride]);
+    assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
+
+    for (ilist<IVStrideUse>::iterator UI = SI->second->Users.begin(),
+         E = SI->second->Users.end(); UI != E; ++UI)
+      if (UI->getUser() == Cond) {
+        // NOTE: we could handle setcc instructions with multiple uses here, but
+        // InstCombine does it as well for simple uses, it's not clear that it
+        // occurs enough in real life to handle.
+        CondUse = UI;
+        CondStride = &SI->first;
+        return true;
+      }
+  }
+  return false;
+}    
+
+namespace {
+  // Constant strides come first which in turns are sorted by their absolute
+  // values. If absolute values are the same, then positive strides comes first.
+  // e.g.
+  // 4, -1, X, 1, 2 ==> 1, -1, 2, 4, X
+  struct StrideCompare {
+    const ScalarEvolution *SE;
+    explicit StrideCompare(const ScalarEvolution *se) : SE(se) {}
+
+    bool operator()(const SCEVHandle &LHS, const SCEVHandle &RHS) {
+      const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(LHS);
+      const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS);
+      if (LHSC && RHSC) {
+        int64_t  LV = LHSC->getValue()->getSExtValue();
+        int64_t  RV = RHSC->getValue()->getSExtValue();
+        uint64_t ALV = (LV < 0) ? -LV : LV;
+        uint64_t ARV = (RV < 0) ? -RV : RV;
+        if (ALV == ARV) {
+          if (LV != RV)
+            return LV > RV;
+        } else {
+          return ALV < ARV;
+        }
+
+        // If it's the same value but different type, sort by bit width so
+        // that we emit larger induction variables before smaller
+        // ones, letting the smaller be re-written in terms of larger ones.
+        return SE->getTypeSizeInBits(RHS->getType()) <
+               SE->getTypeSizeInBits(LHS->getType());
+      }
+      return LHSC && !RHSC;
+    }
+  };
+}
+
+/// ChangeCompareStride - If a loop termination compare instruction is the
+/// only use of its stride, and the compaison is against a constant value,
+/// try eliminate the stride by moving the compare instruction to another
+/// stride and change its constant operand accordingly. e.g.
+///
+/// loop:
+/// ...
+/// v1 = v1 + 3
+/// v2 = v2 + 1
+/// if (v2 < 10) goto loop
+/// =>
+/// loop:
+/// ...
+/// v1 = v1 + 3
+/// if (v1 < 30) goto loop
+ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond,
+                                                IVStrideUse* &CondUse,
+                                                const SCEVHandle* &CondStride) {
+  // If there's only one stride in the loop, there's nothing to do here.
+  if (IU->StrideOrder.size() < 2)
+    return Cond;
+  // If there are other users of the condition's stride, don't bother
+  // trying to change the condition because the stride will still
+  // remain.
+  std::map<SCEVHandle, IVUsersOfOneStride *>::iterator I =
+    IU->IVUsesByStride.find(*CondStride);
+  if (I == IU->IVUsesByStride.end() ||
+      I->second->Users.size() != 1)
+    return Cond;
+  // Only handle constant strides for now.
+  const SCEVConstant *SC = dyn_cast<SCEVConstant>(*CondStride);
+  if (!SC) return Cond;
+
+  ICmpInst::Predicate Predicate = Cond->getPredicate();
+  int64_t CmpSSInt = SC->getValue()->getSExtValue();
+  unsigned BitWidth = SE->getTypeSizeInBits((*CondStride)->getType());
+  uint64_t SignBit = 1ULL << (BitWidth-1);
+  const Type *CmpTy = Cond->getOperand(0)->getType();
+  const Type *NewCmpTy = NULL;
+  unsigned TyBits = SE->getTypeSizeInBits(CmpTy);
+  unsigned NewTyBits = 0;
+  SCEVHandle *NewStride = NULL;
+  Value *NewCmpLHS = NULL;
+  Value *NewCmpRHS = NULL;
+  int64_t Scale = 1;
+  SCEVHandle NewOffset = SE->getIntegerSCEV(0, CmpTy);
+
+  if (ConstantInt *C = dyn_cast<ConstantInt>(Cond->getOperand(1))) {
+    int64_t CmpVal = C->getValue().getSExtValue();
+
+    // Check stride constant and the comparision constant signs to detect
+    // overflow.
+    if ((CmpVal & SignBit) != (CmpSSInt & SignBit))
+      return Cond;
+
+    // Look for a suitable stride / iv as replacement.
+    for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) {
+      std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
+        IU->IVUsesByStride.find(IU->StrideOrder[i]);
+      if (!isa<SCEVConstant>(SI->first))
+        continue;
+      int64_t SSInt = cast<SCEVConstant>(SI->first)->getValue()->getSExtValue();
+      if (SSInt == CmpSSInt ||
+          abs64(SSInt) < abs64(CmpSSInt) ||
+          (SSInt % CmpSSInt) != 0)
+        continue;
+
+      Scale = SSInt / CmpSSInt;
+      int64_t NewCmpVal = CmpVal * Scale;
+      APInt Mul = APInt(BitWidth*2, CmpVal, true);
+      Mul = Mul * APInt(BitWidth*2, Scale, true);
+      // Check for overflow.
+      if (!Mul.isSignedIntN(BitWidth))
+        continue;
+      // Check for overflow in the stride's type too.
+      if (!Mul.isSignedIntN(SE->getTypeSizeInBits(SI->first->getType())))
+        continue;
+
+      // Watch out for overflow.
+      if (ICmpInst::isSignedPredicate(Predicate) &&
+          (CmpVal & SignBit) != (NewCmpVal & SignBit))
+        continue;
+
+      if (NewCmpVal == CmpVal)
+        continue;
+      // Pick the best iv to use trying to avoid a cast.
+      NewCmpLHS = NULL;
+      for (ilist<IVStrideUse>::iterator UI = SI->second->Users.begin(),
+             E = SI->second->Users.end(); UI != E; ++UI) {
+        Value *Op = UI->getOperandValToReplace();
+
+        // If the IVStrideUse implies a cast, check for an actual cast which
+        // can be used to find the original IV expression.
+        if (SE->getEffectiveSCEVType(Op->getType()) !=
+            SE->getEffectiveSCEVType(SI->first->getType())) {
+          CastInst *CI = dyn_cast<CastInst>(Op);
+          // If it's not a simple cast, it's complicated.
+          if (!CI)
+            continue;
+          // If it's a cast from a type other than the stride type,
+          // it's complicated.
+          if (CI->getOperand(0)->getType() != SI->first->getType())
+            continue;
+          // Ok, we found the IV expression in the stride's type.
+          Op = CI->getOperand(0);
+        }
+
+        NewCmpLHS = Op;
+        if (NewCmpLHS->getType() == CmpTy)
+          break;
+      }
+      if (!NewCmpLHS)
+        continue;
+
+      NewCmpTy = NewCmpLHS->getType();
+      NewTyBits = SE->getTypeSizeInBits(NewCmpTy);
+      const Type *NewCmpIntTy = IntegerType::get(NewTyBits);
+      if (RequiresTypeConversion(NewCmpTy, CmpTy)) {
+        // Check if it is possible to rewrite it using
+        // an iv / stride of a smaller integer type.
+        unsigned Bits = NewTyBits;
+        if (ICmpInst::isSignedPredicate(Predicate))
+          --Bits;
+        uint64_t Mask = (1ULL << Bits) - 1;
+        if (((uint64_t)NewCmpVal & Mask) != (uint64_t)NewCmpVal)
+          continue;
+      }
+
+      // Don't rewrite if use offset is non-constant and the new type is
+      // of a different type.
+      // FIXME: too conservative?
+      if (NewTyBits != TyBits && !isa<SCEVConstant>(CondUse->getOffset()))
+        continue;
+
+      bool AllUsesAreAddresses = true;
+      bool AllUsesAreOutsideLoop = true;
+      std::vector<BasedUser> UsersToProcess;
+      SCEVHandle CommonExprs = CollectIVUsers(SI->first, *SI->second, L,
+                                              AllUsesAreAddresses,
+                                              AllUsesAreOutsideLoop,
+                                              UsersToProcess);
+      // Avoid rewriting the compare instruction with an iv of new stride
+      // if it's likely the new stride uses will be rewritten using the
+      // stride of the compare instruction.
+      if (AllUsesAreAddresses &&
+          ValidScale(!CommonExprs->isZero(), Scale, UsersToProcess))
+        continue;
+
+      // Avoid rewriting the compare instruction with an iv which has
+      // implicit extension or truncation built into it.
+      // TODO: This is over-conservative.
+      if (SE->getTypeSizeInBits(CondUse->getOffset()->getType()) != TyBits)
+        continue;
+
+      // If scale is negative, use swapped predicate unless it's testing
+      // for equality.
+      if (Scale < 0 && !Cond->isEquality())
+        Predicate = ICmpInst::getSwappedPredicate(Predicate);
+
+      NewStride = &IU->StrideOrder[i];
+      if (!isa<PointerType>(NewCmpTy))
+        NewCmpRHS = ConstantInt::get(NewCmpTy, NewCmpVal);
+      else {
+        ConstantInt *CI = ConstantInt::get(NewCmpIntTy, NewCmpVal);
+        NewCmpRHS = ConstantExpr::getIntToPtr(CI, NewCmpTy);
+      }
+      NewOffset = TyBits == NewTyBits
+        ? SE->getMulExpr(CondUse->getOffset(),
+                         SE->getConstant(ConstantInt::get(CmpTy, Scale)))
+        : SE->getConstant(ConstantInt::get(NewCmpIntTy,
+          cast<SCEVConstant>(CondUse->getOffset())->getValue()
+            ->getSExtValue()*Scale));
+      break;
+    }
+  }
+
+  // Forgo this transformation if it the increment happens to be
+  // unfortunately positioned after the condition, and the condition
+  // has multiple uses which prevent it from being moved immediately
+  // before the branch. See
+  // test/Transforms/LoopStrengthReduce/change-compare-stride-trickiness-*.ll
+  // for an example of this situation.
+  if (!Cond->hasOneUse()) {
+    for (BasicBlock::iterator I = Cond, E = Cond->getParent()->end();
+         I != E; ++I)
+      if (I == NewCmpLHS)
+        return Cond;
+  }
+
+  if (NewCmpRHS) {
+    // Create a new compare instruction using new stride / iv.
+    ICmpInst *OldCond = Cond;
+    // Insert new compare instruction.
+    Cond = new ICmpInst(Predicate, NewCmpLHS, NewCmpRHS,
+                        L->getHeader()->getName() + ".termcond",
+                        OldCond);
+
+    // Remove the old compare instruction. The old indvar is probably dead too.
+    DeadInsts.push_back(CondUse->getOperandValToReplace());
+    OldCond->replaceAllUsesWith(Cond);
+    OldCond->eraseFromParent();
+
+    IU->IVUsesByStride[*NewStride]->addUser(NewOffset, Cond, NewCmpLHS, false);
+    CondUse = &IU->IVUsesByStride[*NewStride]->Users.back();
+    CondStride = NewStride;
+    ++NumEliminated;
+    Changed = true;
+  }
+
+  return Cond;
+}
+
+/// OptimizeSMax - Rewrite the loop's terminating condition if it uses
+/// an smax computation.
+///
+/// This is a narrow solution to a specific, but acute, problem. For loops
+/// like this:
+///
+///   i = 0;
+///   do {
+///     p[i] = 0.0;
+///   } while (++i < n);
+///
+/// where the comparison is signed, the trip count isn't just 'n', because
+/// 'n' could be negative. And unfortunately this can come up even for loops
+/// where the user didn't use a C do-while loop. For example, seemingly
+/// well-behaved top-test loops will commonly be lowered like this:
+//
+///   if (n > 0) {
+///     i = 0;
+///     do {
+///       p[i] = 0.0;
+///     } while (++i < n);
+///   }
+///
+/// and then it's possible for subsequent optimization to obscure the if
+/// test in such a way that indvars can't find it.
+///
+/// When indvars can't find the if test in loops like this, it creates a
+/// signed-max expression, which allows it to give the loop a canonical
+/// induction variable:
+///
+///   i = 0;
+///   smax = n < 1 ? 1 : n;
+///   do {
+///     p[i] = 0.0;
+///   } while (++i != smax);
+///
+/// Canonical induction variables are necessary because the loop passes
+/// are designed around them. The most obvious example of this is the
+/// LoopInfo analysis, which doesn't remember trip count values. It
+/// expects to be able to rediscover the trip count each time it is
+/// needed, and it does this using a simple analyis that only succeeds if
+/// the loop has a canonical induction variable.
+///
+/// However, when it comes time to generate code, the maximum operation
+/// can be quite costly, especially if it's inside of an outer loop.
+///
+/// This function solves this problem by detecting this type of loop and
+/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
+/// the instructions for the maximum computation.
+///
+ICmpInst *LoopStrengthReduce::OptimizeSMax(Loop *L, ICmpInst *Cond,
+                                           IVStrideUse* &CondUse) {
+  // Check that the loop matches the pattern we're looking for.
+  if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
+      Cond->getPredicate() != CmpInst::ICMP_NE)
+    return Cond;
+
+  SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
+  if (!Sel || !Sel->hasOneUse()) return Cond;
+
+  SCEVHandle BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+    return Cond;
+  SCEVHandle One = SE->getIntegerSCEV(1, BackedgeTakenCount->getType());
+
+  // Add one to the backedge-taken count to get the trip count.
+  SCEVHandle IterationCount = SE->getAddExpr(BackedgeTakenCount, One);
+
+  // Check for a max calculation that matches the pattern.
+  const SCEVSMaxExpr *SMax = dyn_cast<SCEVSMaxExpr>(IterationCount);
+  if (!SMax || SMax != SE->getSCEV(Sel)) return Cond;
+
+  SCEVHandle SMaxLHS = SMax->getOperand(0);
+  SCEVHandle SMaxRHS = SMax->getOperand(1);
+  if (!SMaxLHS || SMaxLHS != One) return Cond;
+
+  // Check the relevant induction variable for conformance to
+  // the pattern.
+  SCEVHandle IV = SE->getSCEV(Cond->getOperand(0));
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
+  if (!AR || !AR->isAffine() ||
+      AR->getStart() != One ||
+      AR->getStepRecurrence(*SE) != One)
+    return Cond;
+
+  assert(AR->getLoop() == L &&
+         "Loop condition operand is an addrec in a different loop!");
+
+  // Check the right operand of the select, and remember it, as it will
+  // be used in the new comparison instruction.
+  Value *NewRHS = 0;
+  if (SE->getSCEV(Sel->getOperand(1)) == SMaxRHS)
+    NewRHS = Sel->getOperand(1);
+  else if (SE->getSCEV(Sel->getOperand(2)) == SMaxRHS)
+    NewRHS = Sel->getOperand(2);
+  if (!NewRHS) return Cond;
+
+  // Ok, everything looks ok to change the condition into an SLT or SGE and
+  // delete the max calculation.
+  ICmpInst *NewCond =
+    new ICmpInst(Cond->getPredicate() == CmpInst::ICMP_NE ?
+                   CmpInst::ICMP_SLT :
+                   CmpInst::ICMP_SGE,
+                 Cond->getOperand(0), NewRHS, "scmp", Cond);
+
+  // Delete the max calculation instructions.
+  Cond->replaceAllUsesWith(NewCond);
+  CondUse->setUser(NewCond);
+  Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
+  Cond->eraseFromParent();
+  Sel->eraseFromParent();
+  if (Cmp->use_empty())
+    Cmp->eraseFromParent();
+  return NewCond;
+}
+
+/// OptimizeShadowIV - If IV is used in a int-to-float cast
+/// inside the loop then try to eliminate the cast opeation.
+void LoopStrengthReduce::OptimizeShadowIV(Loop *L) {
+
+  SCEVHandle BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+    return;
+
+  for (unsigned Stride = 0, e = IU->StrideOrder.size(); Stride != e;
+       ++Stride) {
+    std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
+      IU->IVUsesByStride.find(IU->StrideOrder[Stride]);
+    assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
+    if (!isa<SCEVConstant>(SI->first))
+      continue;
+
+    for (ilist<IVStrideUse>::iterator UI = SI->second->Users.begin(),
+           E = SI->second->Users.end(); UI != E; /* empty */) {
+      ilist<IVStrideUse>::iterator CandidateUI = UI;
+      ++UI;
+      Instruction *ShadowUse = CandidateUI->getUser();
+      const Type *DestTy = NULL;
+
+      /* If shadow use is a int->float cast then insert a second IV
+         to eliminate this cast.
+
+           for (unsigned i = 0; i < n; ++i) 
+             foo((double)i);
+
+         is transformed into
+
+           double d = 0.0;
+           for (unsigned i = 0; i < n; ++i, ++d) 
+             foo(d);
+      */
+      if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser()))
+        DestTy = UCast->getDestTy();
+      else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser()))
+        DestTy = SCast->getDestTy();
+      if (!DestTy) continue;
+
+      if (TLI) {
+        // If target does not support DestTy natively then do not apply
+        // this transformation.
+        MVT DVT = TLI->getValueType(DestTy);
+        if (!TLI->isTypeLegal(DVT)) continue;
+      }
+
+      PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
+      if (!PH) continue;
+      if (PH->getNumIncomingValues() != 2) continue;
+
+      const Type *SrcTy = PH->getType();
+      int Mantissa = DestTy->getFPMantissaWidth();
+      if (Mantissa == -1) continue; 
+      if ((int)SE->getTypeSizeInBits(SrcTy) > Mantissa)
+        continue;
+
+      unsigned Entry, Latch;
+      if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
+        Entry = 0;
+        Latch = 1;
+      } else {
+        Entry = 1;
+        Latch = 0;
+      }
+        
+      ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
+      if (!Init) continue;
+      ConstantFP *NewInit = ConstantFP::get(DestTy, Init->getZExtValue());
+
+      BinaryOperator *Incr = 
+        dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
+      if (!Incr) continue;
+      if (Incr->getOpcode() != Instruction::Add
+          && Incr->getOpcode() != Instruction::Sub)
+        continue;
+
+      /* Initialize new IV, double d = 0.0 in above example. */
+      ConstantInt *C = NULL;
+      if (Incr->getOperand(0) == PH)
+        C = dyn_cast<ConstantInt>(Incr->getOperand(1));
+      else if (Incr->getOperand(1) == PH)
+        C = dyn_cast<ConstantInt>(Incr->getOperand(0));
+      else
+        continue;
+
+      if (!C) continue;
+
+      /* Add new PHINode. */
+      PHINode *NewPH = PHINode::Create(DestTy, "IV.S.", PH);
+
+      /* create new increment. '++d' in above example. */
+      ConstantFP *CFP = ConstantFP::get(DestTy, C->getZExtValue());
+      BinaryOperator *NewIncr = 
+        BinaryOperator::Create(Incr->getOpcode(),
+                               NewPH, CFP, "IV.S.next.", Incr);
+
+      NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
+      NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
+
+      /* Remove cast operation */
+      ShadowUse->replaceAllUsesWith(NewPH);
+      ShadowUse->eraseFromParent();
+      NumShadow++;
+      break;
+    }
+  }
+}
+
+// OptimizeIndvars - Now that IVUsesByStride is set up with all of the indvar
+// uses in the loop, look to see if we can eliminate some, in favor of using
+// common indvars for the different uses.
+void LoopStrengthReduce::OptimizeIndvars(Loop *L) {
+  // TODO: implement optzns here.
+
+  OptimizeShadowIV(L);
+}
+
+/// OptimizeLoopTermCond - Change loop terminating condition to use the 
+/// postinc iv when possible.
+void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) {
+  // Finally, get the terminating condition for the loop if possible.  If we
+  // can, we want to change it to use a post-incremented version of its
+  // induction variable, to allow coalescing the live ranges for the IV into
+  // one register value.
+  BasicBlock *LatchBlock = L->getLoopLatch();
+  BasicBlock *ExitBlock = L->getExitingBlock();
+  if (!ExitBlock)
+    // Multiple exits, just look at the exit in the latch block if there is one.
+    ExitBlock = LatchBlock;
+  BranchInst *TermBr = dyn_cast<BranchInst>(ExitBlock->getTerminator());
+  if (!TermBr)
+    return;
+  if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
+    return;
+
+  // Search IVUsesByStride to find Cond's IVUse if there is one.
+  IVStrideUse *CondUse = 0;
+  const SCEVHandle *CondStride = 0;
+  ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
+  if (!FindIVUserForCond(Cond, CondUse, CondStride))
+    return; // setcc doesn't use the IV.
+
+  if (ExitBlock != LatchBlock) {
+    if (!Cond->hasOneUse())
+      // See below, we don't want the condition to be cloned.
+      return;
+
+    // If exiting block is the latch block, we know it's safe and profitable to
+    // transform the icmp to use post-inc iv. Otherwise do so only if it would
+    // not reuse another iv and its iv would be reused by other uses. We are
+    // optimizing for the case where the icmp is the only use of the iv.
+    IVUsersOfOneStride &StrideUses = *IU->IVUsesByStride[*CondStride];
+    for (ilist<IVStrideUse>::iterator I = StrideUses.Users.begin(),
+         E = StrideUses.Users.end(); I != E; ++I) {
+      if (I->getUser() == Cond)
+        continue;
+      if (!I->isUseOfPostIncrementedValue())
+        return;
+    }
+
+    // FIXME: This is expensive, and worse still ChangeCompareStride does a
+    // similar check. Can we perform all the icmp related transformations after
+    // StrengthReduceStridedIVUsers?
+    if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(*CondStride)) {
+      int64_t SInt = SC->getValue()->getSExtValue();
+      for (unsigned NewStride = 0, ee = IU->StrideOrder.size(); NewStride != ee;
+           ++NewStride) {
+        std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
+          IU->IVUsesByStride.find(IU->StrideOrder[NewStride]);
+        if (!isa<SCEVConstant>(SI->first) || SI->first == *CondStride)
+          continue;
+        int64_t SSInt =
+          cast<SCEVConstant>(SI->first)->getValue()->getSExtValue();
+        if (SSInt == SInt)
+          return; // This can definitely be reused.
+        if (unsigned(abs64(SSInt)) < SInt || (SSInt % SInt) != 0)
+          continue;
+        int64_t Scale = SSInt / SInt;
+        bool AllUsesAreAddresses = true;
+        bool AllUsesAreOutsideLoop = true;
+        std::vector<BasedUser> UsersToProcess;
+        SCEVHandle CommonExprs = CollectIVUsers(SI->first, *SI->second, L,
+                                                AllUsesAreAddresses,
+                                                AllUsesAreOutsideLoop,
+                                                UsersToProcess);
+        // Avoid rewriting the compare instruction with an iv of new stride
+        // if it's likely the new stride uses will be rewritten using the
+        // stride of the compare instruction.
+        if (AllUsesAreAddresses &&
+            ValidScale(!CommonExprs->isZero(), Scale, UsersToProcess))
+          return;
+      }
+    }
+
+    StrideNoReuse.insert(*CondStride);
+  }
+
+  // If the trip count is computed in terms of an smax (due to ScalarEvolution
+  // being unable to find a sufficient guard, for example), change the loop
+  // comparison to use SLT instead of NE.
+  Cond = OptimizeSMax(L, Cond, CondUse);
+
+  // If possible, change stride and operands of the compare instruction to
+  // eliminate one stride.
+  if (ExitBlock == LatchBlock)
+    Cond = ChangeCompareStride(L, Cond, CondUse, CondStride);
+
+  // It's possible for the setcc instruction to be anywhere in the loop, and
+  // possible for it to have multiple users.  If it is not immediately before
+  // the latch block branch, move it.
+  if (&*++BasicBlock::iterator(Cond) != (Instruction*)TermBr) {
+    if (Cond->hasOneUse()) {   // Condition has a single use, just move it.
+      Cond->moveBefore(TermBr);
+    } else {
+      // Otherwise, clone the terminating condition and insert into the loopend.
+      Cond = cast<ICmpInst>(Cond->clone());
+      Cond->setName(L->getHeader()->getName() + ".termcond");
+      LatchBlock->getInstList().insert(TermBr, Cond);
+      
+      // Clone the IVUse, as the old use still exists!
+      IU->IVUsesByStride[*CondStride]->addUser(CondUse->getOffset(), Cond,
+                                              CondUse->getOperandValToReplace(),
+                                               false);
+      CondUse = &IU->IVUsesByStride[*CondStride]->Users.back();
+    }
+  }
+
+  // If we get to here, we know that we can transform the setcc instruction to
+  // use the post-incremented version of the IV, allowing us to coalesce the
+  // live ranges for the IV correctly.
+  CondUse->setOffset(SE->getMinusSCEV(CondUse->getOffset(), *CondStride));
+  CondUse->setIsUseOfPostIncrementedValue(true);
+  Changed = true;
+
+  ++NumLoopCond;
+}
+
+// OptimizeLoopCountIV - If, after all sharing of IVs, the IV used for deciding
+// when to exit the loop is used only for that purpose, try to rearrange things
+// so it counts down to a test against zero.
+void LoopStrengthReduce::OptimizeLoopCountIV(Loop *L) {
+
+  // If the number of times the loop is executed isn't computable, give up.
+  SCEVHandle BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+    return;
+
+  // Get the terminating condition for the loop if possible (this isn't
+  // necessarily in the latch, or a block that's a predecessor of the header).
+  SmallVector<BasicBlock*, 8> ExitBlocks;
+  L->getExitBlocks(ExitBlocks);
+  if (ExitBlocks.size() != 1) return;
+
+  // Okay, there is one exit block.  Try to find the condition that causes the
+  // loop to be exited.
+  BasicBlock *ExitBlock = ExitBlocks[0];
+
+  BasicBlock *ExitingBlock = 0;
+  for (pred_iterator PI = pred_begin(ExitBlock), E = pred_end(ExitBlock);
+       PI != E; ++PI)
+    if (L->contains(*PI)) {
+      if (ExitingBlock == 0)
+        ExitingBlock = *PI;
+      else
+        return; // More than one block exiting!
+    }
+  assert(ExitingBlock && "No exits from loop, something is broken!");
+
+  // Okay, we've computed the exiting block.  See what condition causes us to
+  // exit.
+  //
+  // FIXME: we should be able to handle switch instructions (with a single exit)
+  BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
+  if (TermBr == 0) return;
+  assert(TermBr->isConditional() && "If unconditional, it can't be in loop!");
+  if (!isa<ICmpInst>(TermBr->getCondition()))
+    return;
+  ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
+
+  // Handle only tests for equality for the moment, and only stride 1.
+  if (Cond->getPredicate() != CmpInst::ICMP_EQ)
+    return;
+  SCEVHandle IV = SE->getSCEV(Cond->getOperand(0));
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
+  SCEVHandle One = SE->getIntegerSCEV(1, BackedgeTakenCount->getType());
+  if (!AR || !AR->isAffine() || AR->getStepRecurrence(*SE) != One)
+    return;
+  // If the RHS of the comparison is defined inside the loop, the rewrite
+  // cannot be done.
+  if (Instruction *CR = dyn_cast<Instruction>(Cond->getOperand(1)))
+    if (L->contains(CR->getParent()))
+      return;
+
+  // Make sure the IV is only used for counting.  Value may be preinc or
+  // postinc; 2 uses in either case.
+  if (!Cond->getOperand(0)->hasNUses(2))
+    return;
+  PHINode *phi = dyn_cast<PHINode>(Cond->getOperand(0));
+  Instruction *incr;
+  if (phi && phi->getParent()==L->getHeader()) {
+    // value tested is preinc.  Find the increment.
+    // A CmpInst is not a BinaryOperator; we depend on this.
+    Instruction::use_iterator UI = phi->use_begin();
+    incr = dyn_cast<BinaryOperator>(UI);
+    if (!incr)
+      incr = dyn_cast<BinaryOperator>(++UI);
+    // 1 use for postinc value, the phi.  Unnecessarily conservative?
+    if (!incr || !incr->hasOneUse() || incr->getOpcode()!=Instruction::Add)
+      return;
+  } else {
+    // Value tested is postinc.  Find the phi node.
+    incr = dyn_cast<BinaryOperator>(Cond->getOperand(0));
+    if (!incr || incr->getOpcode()!=Instruction::Add)
+      return;
+
+    Instruction::use_iterator UI = Cond->getOperand(0)->use_begin();
+    phi = dyn_cast<PHINode>(UI);
+    if (!phi)
+      phi = dyn_cast<PHINode>(++UI);
+    // 1 use for preinc value, the increment.
+    if (!phi || phi->getParent()!=L->getHeader() || !phi->hasOneUse())
+      return;
+  }
+
+  // Replace the increment with a decrement.
+  BinaryOperator *decr = 
+    BinaryOperator::Create(Instruction::Sub, incr->getOperand(0),
+                           incr->getOperand(1), "tmp", incr);
+  incr->replaceAllUsesWith(decr);
+  incr->eraseFromParent();
+
+  // Substitute endval-startval for the original startval, and 0 for the
+  // original endval.  Since we're only testing for equality this is OK even 
+  // if the computation wraps around.
+  BasicBlock  *Preheader = L->getLoopPreheader();
+  Instruction *PreInsertPt = Preheader->getTerminator();
+  int inBlock = L->contains(phi->getIncomingBlock(0)) ? 1 : 0;
+  Value *startVal = phi->getIncomingValue(inBlock);
+  Value *endVal = Cond->getOperand(1);
+  // FIXME check for case where both are constant
+  ConstantInt* Zero = ConstantInt::get(Cond->getOperand(1)->getType(), 0);
+  BinaryOperator *NewStartVal = 
+    BinaryOperator::Create(Instruction::Sub, endVal, startVal,
+                           "tmp", PreInsertPt);
+  phi->setIncomingValue(inBlock, NewStartVal);
+  Cond->setOperand(1, Zero);
+
+  Changed = true;
+}
+
+bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) {
+
+  IU = &getAnalysis<IVUsers>();
+  LI = &getAnalysis<LoopInfo>();
+  DT = &getAnalysis<DominatorTree>();
+  SE = &getAnalysis<ScalarEvolution>();
+  Changed = false;
+
+  if (!IU->IVUsesByStride.empty()) {
+#ifndef NDEBUG
+    DOUT << "\nLSR on \"" << L->getHeader()->getParent()->getNameStart()
+         << "\" ";
+    DEBUG(L->dump());
+#endif
+
+    // Sort the StrideOrder so we process larger strides first.
+    std::stable_sort(IU->StrideOrder.begin(), IU->StrideOrder.end(),
+                     StrideCompare(SE));
+
+    // Optimize induction variables.  Some indvar uses can be transformed to use
+    // strides that will be needed for other purposes.  A common example of this
+    // is the exit test for the loop, which can often be rewritten to use the
+    // computation of some other indvar to decide when to terminate the loop.
+    OptimizeIndvars(L);
+
+    // Change loop terminating condition to use the postinc iv when possible
+    // and optimize loop terminating compare. FIXME: Move this after
+    // StrengthReduceStridedIVUsers?
+    OptimizeLoopTermCond(L);
+
+    // FIXME: We can shrink overlarge IV's here.  e.g. if the code has
+    // computation in i64 values and the target doesn't support i64, demote
+    // the computation to 32-bit if safe.
+
+    // FIXME: Attempt to reuse values across multiple IV's.  In particular, we
+    // could have something like "for(i) { foo(i*8); bar(i*16) }", which should
+    // be codegened as "for (j = 0;; j+=8) { foo(j); bar(j+j); }" on X86/PPC.
+    // Need to be careful that IV's are all the same type.  Only works for
+    // intptr_t indvars.
+
+    // IVsByStride keeps IVs for one particular loop.
+    assert(IVsByStride.empty() && "Stale entries in IVsByStride?");
+
+    // Note: this processes each stride/type pair individually.  All users
+    // passed into StrengthReduceStridedIVUsers have the same type AND stride.
+    // Also, note that we iterate over IVUsesByStride indirectly by using
+    // StrideOrder. This extra layer of indirection makes the ordering of
+    // strides deterministic - not dependent on map order.
+    for (unsigned Stride = 0, e = IU->StrideOrder.size();
+         Stride != e; ++Stride) {
+      std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
+        IU->IVUsesByStride.find(IU->StrideOrder[Stride]);
+      assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
+      // FIXME: Generalize to non-affine IV's.
+      if (!SI->first->isLoopInvariant(L))
+        continue;
+      StrengthReduceStridedIVUsers(SI->first, *SI->second, L);
+    }
+  }
+
+  // After all sharing is done, see if we can adjust the loop to test against
+  // zero instead of counting up to a maximum.  This is usually faster.
+  OptimizeLoopCountIV(L);
+
+  // We're done analyzing this loop; release all the state we built up for it.
+  IVsByStride.clear();
+  StrideNoReuse.clear();
+
+  // Clean up after ourselves
+  if (!DeadInsts.empty())
+    DeleteTriviallyDeadInstructions();
+
+  // At this point, it is worth checking to see if any recurrence PHIs are also
+  // dead, so that we can remove them as well.
+  DeleteDeadPHIs(L->getHeader());
+
+  return Changed;
+}
diff --git a/lib/Transforms/Scalar/LoopUnroll.cpp b/lib/Transforms/Scalar/LoopUnroll.cpp
new file mode 100644
index 0000000..23757cd
--- /dev/null
+++ b/lib/Transforms/Scalar/LoopUnroll.cpp
@@ -0,0 +1,183 @@
+//===-- LoopUnroll.cpp - Loop unroller pass -------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements a simple loop unroller.  It works best when loops have
+// been canonicalized by the -indvars pass, allowing it to determine the trip
+// counts of loops easily.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-unroll"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <climits>
+
+using namespace llvm;
+
+static cl::opt<unsigned>
+UnrollThreshold("unroll-threshold", cl::init(100), cl::Hidden,
+  cl::desc("The cut-off point for automatic loop unrolling"));
+
+static cl::opt<unsigned>
+UnrollCount("unroll-count", cl::init(0), cl::Hidden,
+  cl::desc("Use this unroll count for all loops, for testing purposes"));
+
+static cl::opt<bool>
+UnrollAllowPartial("unroll-allow-partial", cl::init(false), cl::Hidden,
+  cl::desc("Allows loops to be partially unrolled until "
+           "-unroll-threshold loop size is reached."));
+
+namespace {
+  class VISIBILITY_HIDDEN LoopUnroll : public LoopPass {
+  public:
+    static char ID; // Pass ID, replacement for typeid
+    LoopUnroll() : LoopPass(&ID) {}
+
+    /// A magic value for use with the Threshold parameter to indicate
+    /// that the loop unroll should be performed regardless of how much
+    /// code expansion would result.
+    static const unsigned NoThreshold = UINT_MAX;
+
+    bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+    /// This transformation requires natural loop information & requires that
+    /// loop preheaders be inserted into the CFG...
+    ///
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequiredID(LoopSimplifyID);
+      AU.addRequiredID(LCSSAID);
+      AU.addRequired<LoopInfo>();
+      AU.addPreservedID(LCSSAID);
+      AU.addPreserved<LoopInfo>();
+      // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info.
+      // If loop unroll does not preserve dom info then LCSSA pass on next
+      // loop will receive invalid dom info.
+      // For now, recreate dom info, if loop is unrolled.
+      AU.addPreserved<DominatorTree>();
+      AU.addPreserved<DominanceFrontier>();
+    }
+  };
+}
+
+char LoopUnroll::ID = 0;
+static RegisterPass<LoopUnroll> X("loop-unroll", "Unroll loops");
+
+Pass *llvm::createLoopUnrollPass() { return new LoopUnroll(); }
+
+/// ApproximateLoopSize - Approximate the size of the loop.
+static unsigned ApproximateLoopSize(const Loop *L) {
+  unsigned Size = 0;
+  for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+       I != E; ++I) {
+    BasicBlock *BB = *I;
+    Instruction *Term = BB->getTerminator();
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+      if (isa<PHINode>(I) && BB == L->getHeader()) {
+        // Ignore PHI nodes in the header.
+      } else if (I->hasOneUse() && I->use_back() == Term) {
+        // Ignore instructions only used by the loop terminator.
+      } else if (isa<DbgInfoIntrinsic>(I)) {
+        // Ignore debug instructions
+      } else if (isa<GetElementPtrInst>(I) && I->hasOneUse()) {
+        // Ignore GEP as they generally are subsumed into a load or store.
+      } else if (isa<CallInst>(I)) {
+        // Estimate size overhead introduced by call instructions which
+        // is higher than other instructions. Here 3 and 10 are magic
+        // numbers that help one isolated test case from PR2067 without
+        // negatively impacting measured benchmarks.
+        if (isa<IntrinsicInst>(I))
+          Size = Size + 3;
+        else
+          Size = Size + 10;
+      } else {
+        ++Size;
+      }
+
+      // TODO: Ignore expressions derived from PHI and constants if inval of phi
+      // is a constant, or if operation is associative.  This will get induction
+      // variables.
+    }
+  }
+
+  return Size;
+}
+
+bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
+  assert(L->isLCSSAForm());
+  LoopInfo *LI = &getAnalysis<LoopInfo>();
+
+  BasicBlock *Header = L->getHeader();
+  DOUT << "Loop Unroll: F[" << Header->getParent()->getName()
+       << "] Loop %" << Header->getName() << "\n";
+
+  // Find trip count
+  unsigned TripCount = L->getSmallConstantTripCount();
+  unsigned Count = UnrollCount;
+ 
+  // Automatically select an unroll count.
+  if (Count == 0) {
+    // Conservative heuristic: if we know the trip count, see if we can
+    // completely unroll (subject to the threshold, checked below); otherwise
+    // try to find greatest modulo of the trip count which is still under 
+    // threshold value.
+    if (TripCount != 0) {
+      Count = TripCount;
+    } else {
+      return false;
+    }
+  }
+
+  // Enforce the threshold.
+  if (UnrollThreshold != NoThreshold) {
+    unsigned LoopSize = ApproximateLoopSize(L);
+    DOUT << "  Loop Size = " << LoopSize << "\n";
+    uint64_t Size = (uint64_t)LoopSize*Count;
+    if (TripCount != 1 && Size > UnrollThreshold) {
+      DOUT << "  Too large to fully unroll with count: " << Count
+           << " because size: " << Size << ">" << UnrollThreshold << "\n";
+      if (UnrollAllowPartial) {
+        // Reduce unroll count to be modulo of TripCount for partial unrolling
+        Count = UnrollThreshold / LoopSize;        
+        while (Count != 0 && TripCount%Count != 0) {
+          Count--;
+        }        
+        if (Count < 2) {
+          DOUT << "  could not unroll partially\n";
+          return false;
+        } else {
+          DOUT << "  partially unrolling with count: " << Count << "\n";
+        }
+      } else {
+        DOUT << "  will not try to unroll partially because "
+             << "-unroll-allow-partial not given\n";
+        return false;
+      }
+    }
+  }
+
+  // Unroll the loop.
+  Function *F = L->getHeader()->getParent();
+  if (!UnrollLoop(L, Count, LI, &LPM))
+    return false;
+
+  // FIXME: Reconstruct dom info, because it is not preserved properly.
+  DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>();
+  if (DT) {
+    DT->runOnFunction(*F);
+    DominanceFrontier *DF = getAnalysisIfAvailable<DominanceFrontier>();
+    if (DF)
+      DF->runOnFunction(*F);
+  }
+  return true;
+}
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
new file mode 100644
index 0000000..e3e881f
--- /dev/null
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -0,0 +1,1098 @@
+//===-- LoopUnswitch.cpp - Hoist loop-invariant conditionals in loop ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass transforms loops that contain branches on loop-invariant conditions
+// to have multiple loops.  For example, it turns the left into the right code:
+//
+//  for (...)                  if (lic)
+//    A                          for (...)
+//    if (lic)                     A; B; C
+//      B                      else
+//    C                          for (...)
+//                                 A; C
+//
+// This can increase the size of the code exponentially (doubling it every time
+// a loop is unswitched) so we only unswitch if the resultant code will be
+// smaller than a threshold.
+//
+// This pass expects LICM to be run before it to hoist invariant conditions out
+// of the loop, to make the unswitching opportunity obvious.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-unswitch"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include <algorithm>
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumBranches, "Number of branches unswitched");
+STATISTIC(NumSwitches, "Number of switches unswitched");
+STATISTIC(NumSelects , "Number of selects unswitched");
+STATISTIC(NumTrivial , "Number of unswitches that are trivial");
+STATISTIC(NumSimplify, "Number of simplifications of unswitched code");
+
+static cl::opt<unsigned>
+Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"),
+          cl::init(10), cl::Hidden);
+  
+namespace {
+  class VISIBILITY_HIDDEN LoopUnswitch : public LoopPass {
+    LoopInfo *LI;  // Loop information
+    LPPassManager *LPM;
+
+    // LoopProcessWorklist - Used to check if second loop needs processing
+    // after RewriteLoopBodyWithConditionConstant rewrites first loop.
+    std::vector<Loop*> LoopProcessWorklist;
+    SmallPtrSet<Value *,8> UnswitchedVals;
+    
+    bool OptimizeForSize;
+    bool redoLoop;
+
+    Loop *currentLoop;
+    DominanceFrontier *DF;
+    DominatorTree *DT;
+    BasicBlock *loopHeader;
+    BasicBlock *loopPreheader;
+    
+    // LoopBlocks contains all of the basic blocks of the loop, including the
+    // preheader of the loop, the body of the loop, and the exit blocks of the 
+    // loop, in that order.
+    std::vector<BasicBlock*> LoopBlocks;
+    // NewBlocks contained cloned copy of basic blocks from LoopBlocks.
+    std::vector<BasicBlock*> NewBlocks;
+
+  public:
+    static char ID; // Pass ID, replacement for typeid
+    explicit LoopUnswitch(bool Os = false) : 
+      LoopPass(&ID), OptimizeForSize(Os), redoLoop(false), 
+      currentLoop(NULL), DF(NULL), DT(NULL), loopHeader(NULL),
+      loopPreheader(NULL) {}
+
+    bool runOnLoop(Loop *L, LPPassManager &LPM);
+    bool processCurrentLoop();
+
+    /// This transformation requires natural loop information & requires that
+    /// loop preheaders be inserted into the CFG...
+    ///
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequiredID(LoopSimplifyID);
+      AU.addPreservedID(LoopSimplifyID);
+      AU.addRequired<LoopInfo>();
+      AU.addPreserved<LoopInfo>();
+      AU.addRequiredID(LCSSAID);
+      AU.addPreservedID(LCSSAID);
+      AU.addPreserved<DominatorTree>();
+      AU.addPreserved<DominanceFrontier>();
+    }
+
+  private:
+
+    /// RemoveLoopFromWorklist - If the specified loop is on the loop worklist,
+    /// remove it.
+    void RemoveLoopFromWorklist(Loop *L) {
+      std::vector<Loop*>::iterator I = std::find(LoopProcessWorklist.begin(),
+                                                 LoopProcessWorklist.end(), L);
+      if (I != LoopProcessWorklist.end())
+        LoopProcessWorklist.erase(I);
+    }
+
+    void initLoopData() {
+      loopHeader = currentLoop->getHeader();
+      loopPreheader = currentLoop->getLoopPreheader();
+    }
+
+    /// Split all of the edges from inside the loop to their exit blocks.
+    /// Update the appropriate Phi nodes as we do so.
+    void SplitExitEdges(Loop *L, const SmallVector<BasicBlock *, 8> &ExitBlocks);
+
+    bool UnswitchIfProfitable(Value *LoopCond, Constant *Val);
+    unsigned getLoopUnswitchCost(Value *LIC);
+    void UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
+                                  BasicBlock *ExitBlock);
+    void UnswitchNontrivialCondition(Value *LIC, Constant *OnVal, Loop *L);
+
+    void RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
+                                              Constant *Val, bool isEqual);
+
+    void EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
+                                        BasicBlock *TrueDest, 
+                                        BasicBlock *FalseDest,
+                                        Instruction *InsertPt);
+
+    void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L);
+    void RemoveBlockIfDead(BasicBlock *BB,
+                           std::vector<Instruction*> &Worklist, Loop *l);
+    void RemoveLoopFromHierarchy(Loop *L);
+    bool IsTrivialUnswitchCondition(Value *Cond, Constant **Val = 0,
+                                    BasicBlock **LoopExit = 0);
+
+  };
+}
+char LoopUnswitch::ID = 0;
+static RegisterPass<LoopUnswitch> X("loop-unswitch", "Unswitch loops");
+
+Pass *llvm::createLoopUnswitchPass(bool Os) { 
+  return new LoopUnswitch(Os); 
+}
+
+/// FindLIVLoopCondition - Cond is a condition that occurs in L.  If it is
+/// invariant in the loop, or has an invariant piece, return the invariant.
+/// Otherwise, return null.
+static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) {
+  // Constants should be folded, not unswitched on!
+  if (isa<Constant>(Cond)) return 0;
+
+  // TODO: Handle: br (VARIANT|INVARIANT).
+  // TODO: Hoist simple expressions out of loops.
+  if (L->isLoopInvariant(Cond)) return Cond;
+
+  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Cond))
+    if (BO->getOpcode() == Instruction::And ||
+        BO->getOpcode() == Instruction::Or) {
+      // If either the left or right side is invariant, we can unswitch on this,
+      // which will cause the branch to go away in one loop and the condition to
+      // simplify in the other one.
+      if (Value *LHS = FindLIVLoopCondition(BO->getOperand(0), L, Changed))
+        return LHS;
+      if (Value *RHS = FindLIVLoopCondition(BO->getOperand(1), L, Changed))
+        return RHS;
+    }
+  
+  return 0;
+}
+
+bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
+  LI = &getAnalysis<LoopInfo>();
+  LPM = &LPM_Ref;
+  DF = getAnalysisIfAvailable<DominanceFrontier>();
+  DT = getAnalysisIfAvailable<DominatorTree>();
+  currentLoop = L;
+  Function *F = currentLoop->getHeader()->getParent();
+  bool Changed = false;
+  do {
+    assert(currentLoop->isLCSSAForm());
+    redoLoop = false;
+    Changed |= processCurrentLoop();
+  } while(redoLoop);
+
+  if (Changed) {
+    // FIXME: Reconstruct dom info, because it is not preserved properly.
+    if (DT)
+      DT->runOnFunction(*F);
+    if (DF)
+      DF->runOnFunction(*F);
+  }
+  return Changed;
+}
+
+/// processCurrentLoop - Do actual work and unswitch loop if possible 
+/// and profitable.
+bool LoopUnswitch::processCurrentLoop() {
+  bool Changed = false;
+
+  // Loop over all of the basic blocks in the loop.  If we find an interior
+  // block that is branching on a loop-invariant condition, we can unswitch this
+  // loop.
+  for (Loop::block_iterator I = currentLoop->block_begin(), 
+         E = currentLoop->block_end();
+       I != E; ++I) {
+    TerminatorInst *TI = (*I)->getTerminator();
+    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+      // If this isn't branching on an invariant condition, we can't unswitch
+      // it.
+      if (BI->isConditional()) {
+        // See if this, or some part of it, is loop invariant.  If so, we can
+        // unswitch on it if we desire.
+        Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), 
+                                               currentLoop, Changed);
+        if (LoopCond && UnswitchIfProfitable(LoopCond, 
+                                             ConstantInt::getTrue())) {
+          ++NumBranches;
+          return true;
+        }
+      }      
+    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+      Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), 
+                                             currentLoop, Changed);
+      if (LoopCond && SI->getNumCases() > 1) {
+        // Find a value to unswitch on:
+        // FIXME: this should chose the most expensive case!
+        Constant *UnswitchVal = SI->getCaseValue(1);
+        // Do not process same value again and again.
+        if (!UnswitchedVals.insert(UnswitchVal))
+          continue;
+
+        if (UnswitchIfProfitable(LoopCond, UnswitchVal)) {
+          ++NumSwitches;
+          return true;
+        }
+      }
+    }
+    
+    // Scan the instructions to check for unswitchable values.
+    for (BasicBlock::iterator BBI = (*I)->begin(), E = (*I)->end(); 
+         BBI != E; ++BBI)
+      if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) {
+        Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), 
+                                               currentLoop, Changed);
+        if (LoopCond && UnswitchIfProfitable(LoopCond, 
+                                             ConstantInt::getTrue())) {
+          ++NumSelects;
+          return true;
+        }
+      }
+  }
+  return Changed;
+}
+
+/// isTrivialLoopExitBlock - Check to see if all paths from BB either:
+///   1. Exit the loop with no side effects.
+///   2. Branch to the latch block with no side-effects.
+///
+/// If these conditions are true, we return true and set ExitBB to the block we
+/// exit through.
+///
+static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB,
+                                         BasicBlock *&ExitBB,
+                                         std::set<BasicBlock*> &Visited) {
+  if (!Visited.insert(BB).second) {
+    // Already visited and Ok, end of recursion.
+    return true;
+  } else if (!L->contains(BB)) {
+    // Otherwise, this is a loop exit, this is fine so long as this is the
+    // first exit.
+    if (ExitBB != 0) return false;
+    ExitBB = BB;
+    return true;
+  }
+  
+  // Otherwise, this is an unvisited intra-loop node.  Check all successors.
+  for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI) {
+    // Check to see if the successor is a trivial loop exit.
+    if (!isTrivialLoopExitBlockHelper(L, *SI, ExitBB, Visited))
+      return false;
+  }
+
+  // Okay, everything after this looks good, check to make sure that this block
+  // doesn't include any side effects.
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+    if (I->mayHaveSideEffects())
+      return false;
+  
+  return true;
+}
+
+/// isTrivialLoopExitBlock - Return true if the specified block unconditionally
+/// leads to an exit from the specified loop, and has no side-effects in the 
+/// process.  If so, return the block that is exited to, otherwise return null.
+static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) {
+  std::set<BasicBlock*> Visited;
+  Visited.insert(L->getHeader());  // Branches to header are ok.
+  BasicBlock *ExitBB = 0;
+  if (isTrivialLoopExitBlockHelper(L, BB, ExitBB, Visited))
+    return ExitBB;
+  return 0;
+}
+
+/// IsTrivialUnswitchCondition - Check to see if this unswitch condition is
+/// trivial: that is, that the condition controls whether or not the loop does
+/// anything at all.  If this is a trivial condition, unswitching produces no
+/// code duplications (equivalently, it produces a simpler loop and a new empty
+/// loop, which gets deleted).
+///
+/// If this is a trivial condition, return true, otherwise return false.  When
+/// returning true, this sets Cond and Val to the condition that controls the
+/// trivial condition: when Cond dynamically equals Val, the loop is known to
+/// exit.  Finally, this sets LoopExit to the BB that the loop exits to when
+/// Cond == Val.
+///
+bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val,
+                                       BasicBlock **LoopExit) {
+  BasicBlock *Header = currentLoop->getHeader();
+  TerminatorInst *HeaderTerm = Header->getTerminator();
+  
+  BasicBlock *LoopExitBB = 0;
+  if (BranchInst *BI = dyn_cast<BranchInst>(HeaderTerm)) {
+    // If the header block doesn't end with a conditional branch on Cond, we
+    // can't handle it.
+    if (!BI->isConditional() || BI->getCondition() != Cond)
+      return false;
+  
+    // Check to see if a successor of the branch is guaranteed to go to the
+    // latch block or exit through a one exit block without having any 
+    // side-effects.  If so, determine the value of Cond that causes it to do
+    // this.
+    if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, 
+                                             BI->getSuccessor(0)))) {
+      if (Val) *Val = ConstantInt::getTrue();
+    } else if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, 
+                                                    BI->getSuccessor(1)))) {
+      if (Val) *Val = ConstantInt::getFalse();
+    }
+  } else if (SwitchInst *SI = dyn_cast<SwitchInst>(HeaderTerm)) {
+    // If this isn't a switch on Cond, we can't handle it.
+    if (SI->getCondition() != Cond) return false;
+    
+    // Check to see if a successor of the switch is guaranteed to go to the
+    // latch block or exit through a one exit block without having any 
+    // side-effects.  If so, determine the value of Cond that causes it to do
+    // this.  Note that we can't trivially unswitch on the default case.
+    for (unsigned i = 1, e = SI->getNumSuccessors(); i != e; ++i)
+      if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, 
+                                               SI->getSuccessor(i)))) {
+        // Okay, we found a trivial case, remember the value that is trivial.
+        if (Val) *Val = SI->getCaseValue(i);
+        break;
+      }
+  }
+
+  // If we didn't find a single unique LoopExit block, or if the loop exit block
+  // contains phi nodes, this isn't trivial.
+  if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin()))
+    return false;   // Can't handle this.
+  
+  if (LoopExit) *LoopExit = LoopExitBB;
+  
+  // We already know that nothing uses any scalar values defined inside of this
+  // loop.  As such, we just have to check to see if this loop will execute any
+  // side-effecting instructions (e.g. stores, calls, volatile loads) in the
+  // part of the loop that the code *would* execute.  We already checked the
+  // tail, check the header now.
+  for (BasicBlock::iterator I = Header->begin(), E = Header->end(); I != E; ++I)
+    if (I->mayHaveSideEffects())
+      return false;
+  return true;
+}
+
+/// getLoopUnswitchCost - Return the cost (code size growth) that will happen if
+/// we choose to unswitch current loop on the specified value.
+///
+unsigned LoopUnswitch::getLoopUnswitchCost(Value *LIC) {
+  // If the condition is trivial, always unswitch.  There is no code growth for
+  // this case.
+  if (IsTrivialUnswitchCondition(LIC))
+    return 0;
+  
+  // FIXME: This is really overly conservative.  However, more liberal 
+  // estimations have thus far resulted in excessive unswitching, which is bad
+  // both in compile time and in code size.  This should be replaced once
+  // someone figures out how a good estimation.
+  return currentLoop->getBlocks().size();
+  
+  unsigned Cost = 0;
+  // FIXME: this is brain dead.  It should take into consideration code
+  // shrinkage.
+  for (Loop::block_iterator I = currentLoop->block_begin(), 
+         E = currentLoop->block_end();
+       I != E; ++I) {
+    BasicBlock *BB = *I;
+    // Do not include empty blocks in the cost calculation.  This happen due to
+    // loop canonicalization and will be removed.
+    if (BB->begin() == BasicBlock::iterator(BB->getTerminator()))
+      continue;
+    
+    // Count basic blocks.
+    ++Cost;
+  }
+
+  return Cost;
+}
+
+/// UnswitchIfProfitable - We have found that we can unswitch currentLoop when
+/// LoopCond == Val to simplify the loop.  If we decide that this is profitable,
+/// unswitch the loop, reprocess the pieces, then return true.
+bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val){
+
+  initLoopData();
+  Function *F = loopHeader->getParent();
+
+
+  // Check to see if it would be profitable to unswitch current loop.
+  unsigned Cost = getLoopUnswitchCost(LoopCond);
+
+  // Do not do non-trivial unswitch while optimizing for size.
+  if (Cost && OptimizeForSize)
+    return false;
+  if (Cost && !F->isDeclaration() && F->hasFnAttr(Attribute::OptimizeForSize))
+    return false;
+
+  if (Cost > Threshold) {
+    // FIXME: this should estimate growth by the amount of code shared by the
+    // resultant unswitched loops.
+    //
+    DOUT << "NOT unswitching loop %"
+         << currentLoop->getHeader()->getName() << ", cost too high: "
+         << currentLoop->getBlocks().size() << "\n";
+    return false;
+  }
+
+  Constant *CondVal;
+  BasicBlock *ExitBlock;
+  if (IsTrivialUnswitchCondition(LoopCond, &CondVal, &ExitBlock)) {
+    UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, ExitBlock);
+  } else {
+    UnswitchNontrivialCondition(LoopCond, Val, currentLoop);
+  }
+
+  return true;
+}
+
+// RemapInstruction - Convert the instruction operands from referencing the
+// current values into those specified by ValueMap.
+//
+static inline void RemapInstruction(Instruction *I,
+                                    DenseMap<const Value *, Value*> &ValueMap) {
+  for (unsigned op = 0, E = I->getNumOperands(); op != E; ++op) {
+    Value *Op = I->getOperand(op);
+    DenseMap<const Value *, Value*>::iterator It = ValueMap.find(Op);
+    if (It != ValueMap.end()) Op = It->second;
+    I->setOperand(op, Op);
+  }
+}
+
+/// CloneLoop - Recursively clone the specified loop and all of its children,
+/// mapping the blocks with the specified map.
+static Loop *CloneLoop(Loop *L, Loop *PL, DenseMap<const Value*, Value*> &VM,
+                       LoopInfo *LI, LPPassManager *LPM) {
+  Loop *New = new Loop();
+
+  LPM->insertLoop(New, PL);
+
+  // Add all of the blocks in L to the new loop.
+  for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+       I != E; ++I)
+    if (LI->getLoopFor(*I) == L)
+      New->addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), LI->getBase());
+
+  // Add all of the subloops to the new loop.
+  for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+    CloneLoop(*I, New, VM, LI, LPM);
+
+  return New;
+}
+
+/// EmitPreheaderBranchOnCondition - Emit a conditional branch on two values
+/// if LIC == Val, branch to TrueDst, otherwise branch to FalseDest.  Insert the
+/// code immediately before InsertPt.
+void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
+                                                  BasicBlock *TrueDest,
+                                                  BasicBlock *FalseDest,
+                                                  Instruction *InsertPt) {
+  // Insert a conditional branch on LIC to the two preheaders.  The original
+  // code is the true version and the new code is the false version.
+  Value *BranchVal = LIC;
+  if (!isa<ConstantInt>(Val) || Val->getType() != Type::Int1Ty)
+    BranchVal = new ICmpInst(ICmpInst::ICMP_EQ, LIC, Val, "tmp", InsertPt);
+  else if (Val != ConstantInt::getTrue())
+    // We want to enter the new loop when the condition is true.
+    std::swap(TrueDest, FalseDest);
+
+  // Insert the new branch.
+  BranchInst::Create(TrueDest, FalseDest, BranchVal, InsertPt);
+}
+
+/// UnswitchTrivialCondition - Given a loop that has a trivial unswitchable
+/// condition in it (a cond branch from its header block to its latch block,
+/// where the path through the loop that doesn't execute its body has no 
+/// side-effects), unswitch it.  This doesn't involve any code duplication, just
+/// moving the conditional branch outside of the loop and updating loop info.
+void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, 
+                                            Constant *Val, 
+                                            BasicBlock *ExitBlock) {
+  DOUT << "loop-unswitch: Trivial-Unswitch loop %"
+       << loopHeader->getName() << " [" << L->getBlocks().size()
+       << " blocks] in Function " << L->getHeader()->getParent()->getName()
+       << " on cond: " << *Val << " == " << *Cond << "\n";
+  
+  // First step, split the preheader, so that we know that there is a safe place
+  // to insert the conditional branch.  We will change loopPreheader to have a
+  // conditional branch on Cond.
+  BasicBlock *NewPH = SplitEdge(loopPreheader, loopHeader, this);
+
+  // Now that we have a place to insert the conditional branch, create a place
+  // to branch to: this is the exit block out of the loop that we should
+  // short-circuit to.
+  
+  // Split this block now, so that the loop maintains its exit block, and so
+  // that the jump from the preheader can execute the contents of the exit block
+  // without actually branching to it (the exit block should be dominated by the
+  // loop header, not the preheader).
+  assert(!L->contains(ExitBlock) && "Exit block is in the loop?");
+  BasicBlock *NewExit = SplitBlock(ExitBlock, ExitBlock->begin(), this);
+    
+  // Okay, now we have a position to branch from and a position to branch to, 
+  // insert the new conditional branch.
+  EmitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH, 
+                                 loopPreheader->getTerminator());
+  LPM->deleteSimpleAnalysisValue(loopPreheader->getTerminator(), L);
+  loopPreheader->getTerminator()->eraseFromParent();
+
+  // We need to reprocess this loop, it could be unswitched again.
+  redoLoop = true;
+  
+  // Now that we know that the loop is never entered when this condition is a
+  // particular value, rewrite the loop with this info.  We know that this will
+  // at least eliminate the old branch.
+  RewriteLoopBodyWithConditionConstant(L, Cond, Val, false);
+  ++NumTrivial;
+}
+
+/// SplitExitEdges - Split all of the edges from inside the loop to their exit
+/// blocks.  Update the appropriate Phi nodes as we do so.
+void LoopUnswitch::SplitExitEdges(Loop *L, 
+                                const SmallVector<BasicBlock *, 8> &ExitBlocks) 
+{
+
+  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+    BasicBlock *ExitBlock = ExitBlocks[i];
+    std::vector<BasicBlock*> Preds(pred_begin(ExitBlock), pred_end(ExitBlock));
+
+    for (unsigned j = 0, e = Preds.size(); j != e; ++j) {
+      BasicBlock* NewExitBlock = SplitEdge(Preds[j], ExitBlock, this);
+      BasicBlock* StartBlock = Preds[j];
+      BasicBlock* EndBlock;
+      if (NewExitBlock->getSinglePredecessor() == ExitBlock) {
+        EndBlock = NewExitBlock;
+        NewExitBlock = EndBlock->getSinglePredecessor();
+      } else {
+        EndBlock = ExitBlock;
+      }
+      
+      std::set<PHINode*> InsertedPHIs;
+      PHINode* OldLCSSA = 0;
+      for (BasicBlock::iterator I = EndBlock->begin();
+           (OldLCSSA = dyn_cast<PHINode>(I)); ++I) {
+        Value* OldValue = OldLCSSA->getIncomingValueForBlock(NewExitBlock);
+        PHINode* NewLCSSA = PHINode::Create(OldLCSSA->getType(),
+                                            OldLCSSA->getName() + ".us-lcssa",
+                                            NewExitBlock->getTerminator());
+        NewLCSSA->addIncoming(OldValue, StartBlock);
+        OldLCSSA->setIncomingValue(OldLCSSA->getBasicBlockIndex(NewExitBlock),
+                                   NewLCSSA);
+        InsertedPHIs.insert(NewLCSSA);
+      }
+
+      BasicBlock::iterator InsertPt = EndBlock->getFirstNonPHI();
+      for (BasicBlock::iterator I = NewExitBlock->begin();
+         (OldLCSSA = dyn_cast<PHINode>(I)) && InsertedPHIs.count(OldLCSSA) == 0;
+         ++I) {
+        PHINode *NewLCSSA = PHINode::Create(OldLCSSA->getType(),
+                                            OldLCSSA->getName() + ".us-lcssa",
+                                            InsertPt);
+        OldLCSSA->replaceAllUsesWith(NewLCSSA);
+        NewLCSSA->addIncoming(OldLCSSA, NewExitBlock);
+      }
+
+    }    
+  }
+
+}
+
+/// UnswitchNontrivialCondition - We determined that the loop is profitable 
+/// to unswitch when LIC equal Val.  Split it into loop versions and test the 
+/// condition outside of either loop.  Return the loops created as Out1/Out2.
+void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, 
+                                               Loop *L) {
+  Function *F = loopHeader->getParent();
+  DOUT << "loop-unswitch: Unswitching loop %"
+       << loopHeader->getName() << " [" << L->getBlocks().size()
+       << " blocks] in Function " << F->getName()
+       << " when '" << *Val << "' == " << *LIC << "\n";
+
+  LoopBlocks.clear();
+  NewBlocks.clear();
+
+  // First step, split the preheader and exit blocks, and add these blocks to
+  // the LoopBlocks list.
+  BasicBlock *NewPreheader = SplitEdge(loopPreheader, loopHeader, this);
+  LoopBlocks.push_back(NewPreheader);
+
+  // We want the loop to come after the preheader, but before the exit blocks.
+  LoopBlocks.insert(LoopBlocks.end(), L->block_begin(), L->block_end());
+
+  SmallVector<BasicBlock*, 8> ExitBlocks;
+  L->getUniqueExitBlocks(ExitBlocks);
+
+  // Split all of the edges from inside the loop to their exit blocks.  Update
+  // the appropriate Phi nodes as we do so.
+  SplitExitEdges(L, ExitBlocks);
+
+  // The exit blocks may have been changed due to edge splitting, recompute.
+  ExitBlocks.clear();
+  L->getUniqueExitBlocks(ExitBlocks);
+
+  // Add exit blocks to the loop blocks.
+  LoopBlocks.insert(LoopBlocks.end(), ExitBlocks.begin(), ExitBlocks.end());
+
+  // Next step, clone all of the basic blocks that make up the loop (including
+  // the loop preheader and exit blocks), keeping track of the mapping between
+  // the instructions and blocks.
+  NewBlocks.reserve(LoopBlocks.size());
+  DenseMap<const Value*, Value*> ValueMap;
+  for (unsigned i = 0, e = LoopBlocks.size(); i != e; ++i) {
+    BasicBlock *New = CloneBasicBlock(LoopBlocks[i], ValueMap, ".us", F);
+    NewBlocks.push_back(New);
+    ValueMap[LoopBlocks[i]] = New;  // Keep the BB mapping.
+    LPM->cloneBasicBlockSimpleAnalysis(LoopBlocks[i], New, L);
+  }
+
+  // Splice the newly inserted blocks into the function right before the
+  // original preheader.
+  F->getBasicBlockList().splice(LoopBlocks[0], F->getBasicBlockList(),
+                                NewBlocks[0], F->end());
+
+  // Now we create the new Loop object for the versioned loop.
+  Loop *NewLoop = CloneLoop(L, L->getParentLoop(), ValueMap, LI, LPM);
+  Loop *ParentLoop = L->getParentLoop();
+  if (ParentLoop) {
+    // Make sure to add the cloned preheader and exit blocks to the parent loop
+    // as well.
+    ParentLoop->addBasicBlockToLoop(NewBlocks[0], LI->getBase());
+  }
+  
+  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+    BasicBlock *NewExit = cast<BasicBlock>(ValueMap[ExitBlocks[i]]);
+    // The new exit block should be in the same loop as the old one.
+    if (Loop *ExitBBLoop = LI->getLoopFor(ExitBlocks[i]))
+      ExitBBLoop->addBasicBlockToLoop(NewExit, LI->getBase());
+    
+    assert(NewExit->getTerminator()->getNumSuccessors() == 1 &&
+           "Exit block should have been split to have one successor!");
+    BasicBlock *ExitSucc = NewExit->getTerminator()->getSuccessor(0);
+
+    // If the successor of the exit block had PHI nodes, add an entry for
+    // NewExit.
+    PHINode *PN;
+    for (BasicBlock::iterator I = ExitSucc->begin();
+         (PN = dyn_cast<PHINode>(I)); ++I) {
+      Value *V = PN->getIncomingValueForBlock(ExitBlocks[i]);
+      DenseMap<const Value *, Value*>::iterator It = ValueMap.find(V);
+      if (It != ValueMap.end()) V = It->second;
+      PN->addIncoming(V, NewExit);
+    }
+  }
+
+  // Rewrite the code to refer to itself.
+  for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i)
+    for (BasicBlock::iterator I = NewBlocks[i]->begin(),
+           E = NewBlocks[i]->end(); I != E; ++I)
+      RemapInstruction(I, ValueMap);
+  
+  // Rewrite the original preheader to select between versions of the loop.
+  BranchInst *OldBR = cast<BranchInst>(loopPreheader->getTerminator());
+  assert(OldBR->isUnconditional() && OldBR->getSuccessor(0) == LoopBlocks[0] &&
+         "Preheader splitting did not work correctly!");
+
+  // Emit the new branch that selects between the two versions of this loop.
+  EmitPreheaderBranchOnCondition(LIC, Val, NewBlocks[0], LoopBlocks[0], OldBR);
+  LPM->deleteSimpleAnalysisValue(OldBR, L);
+  OldBR->eraseFromParent();
+
+  LoopProcessWorklist.push_back(NewLoop);
+  redoLoop = true;
+
+  // Now we rewrite the original code to know that the condition is true and the
+  // new code to know that the condition is false.
+  RewriteLoopBodyWithConditionConstant(L      , LIC, Val, false);
+  
+  // It's possible that simplifying one loop could cause the other to be
+  // deleted.  If so, don't simplify it.
+  if (!LoopProcessWorklist.empty() && LoopProcessWorklist.back() == NewLoop)
+    RewriteLoopBodyWithConditionConstant(NewLoop, LIC, Val, true);
+
+}
+
+/// RemoveFromWorklist - Remove all instances of I from the worklist vector
+/// specified.
+static void RemoveFromWorklist(Instruction *I, 
+                               std::vector<Instruction*> &Worklist) {
+  std::vector<Instruction*>::iterator WI = std::find(Worklist.begin(),
+                                                     Worklist.end(), I);
+  while (WI != Worklist.end()) {
+    unsigned Offset = WI-Worklist.begin();
+    Worklist.erase(WI);
+    WI = std::find(Worklist.begin()+Offset, Worklist.end(), I);
+  }
+}
+
+/// ReplaceUsesOfWith - When we find that I really equals V, remove I from the
+/// program, replacing all uses with V and update the worklist.
+static void ReplaceUsesOfWith(Instruction *I, Value *V, 
+                              std::vector<Instruction*> &Worklist,
+                              Loop *L, LPPassManager *LPM) {
+  DOUT << "Replace with '" << *V << "': " << *I;
+
+  // Add uses to the worklist, which may be dead now.
+  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+    if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i)))
+      Worklist.push_back(Use);
+
+  // Add users to the worklist which may be simplified now.
+  for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+       UI != E; ++UI)
+    Worklist.push_back(cast<Instruction>(*UI));
+  LPM->deleteSimpleAnalysisValue(I, L);
+  RemoveFromWorklist(I, Worklist);
+  I->replaceAllUsesWith(V);
+  I->eraseFromParent();
+  ++NumSimplify;
+}
+
+/// RemoveBlockIfDead - If the specified block is dead, remove it, update loop
+/// information, and remove any dead successors it has.
+///
+void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB,
+                                     std::vector<Instruction*> &Worklist,
+                                     Loop *L) {
+  if (pred_begin(BB) != pred_end(BB)) {
+    // This block isn't dead, since an edge to BB was just removed, see if there
+    // are any easy simplifications we can do now.
+    if (BasicBlock *Pred = BB->getSinglePredecessor()) {
+      // If it has one pred, fold phi nodes in BB.
+      while (isa<PHINode>(BB->begin()))
+        ReplaceUsesOfWith(BB->begin(), 
+                          cast<PHINode>(BB->begin())->getIncomingValue(0), 
+                          Worklist, L, LPM);
+      
+      // If this is the header of a loop and the only pred is the latch, we now
+      // have an unreachable loop.
+      if (Loop *L = LI->getLoopFor(BB))
+        if (loopHeader == BB && L->contains(Pred)) {
+          // Remove the branch from the latch to the header block, this makes
+          // the header dead, which will make the latch dead (because the header
+          // dominates the latch).
+          LPM->deleteSimpleAnalysisValue(Pred->getTerminator(), L);
+          Pred->getTerminator()->eraseFromParent();
+          new UnreachableInst(Pred);
+          
+          // The loop is now broken, remove it from LI.
+          RemoveLoopFromHierarchy(L);
+          
+          // Reprocess the header, which now IS dead.
+          RemoveBlockIfDead(BB, Worklist, L);
+          return;
+        }
+      
+      // If pred ends in a uncond branch, add uncond branch to worklist so that
+      // the two blocks will get merged.
+      if (BranchInst *BI = dyn_cast<BranchInst>(Pred->getTerminator()))
+        if (BI->isUnconditional())
+          Worklist.push_back(BI);
+    }
+    return;
+  }
+
+  DOUT << "Nuking dead block: " << *BB;
+  
+  // Remove the instructions in the basic block from the worklist.
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+    RemoveFromWorklist(I, Worklist);
+    
+    // Anything that uses the instructions in this basic block should have their
+    // uses replaced with undefs.
+    if (!I->use_empty())
+      I->replaceAllUsesWith(UndefValue::get(I->getType()));
+  }
+  
+  // If this is the edge to the header block for a loop, remove the loop and
+  // promote all subloops.
+  if (Loop *BBLoop = LI->getLoopFor(BB)) {
+    if (BBLoop->getLoopLatch() == BB)
+      RemoveLoopFromHierarchy(BBLoop);
+  }
+
+  // Remove the block from the loop info, which removes it from any loops it
+  // was in.
+  LI->removeBlock(BB);
+  
+  
+  // Remove phi node entries in successors for this block.
+  TerminatorInst *TI = BB->getTerminator();
+  SmallVector<BasicBlock*, 4> Succs;
+  for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) {
+    Succs.push_back(TI->getSuccessor(i));
+    TI->getSuccessor(i)->removePredecessor(BB);
+  }
+  
+  // Unique the successors, remove anything with multiple uses.
+  array_pod_sort(Succs.begin(), Succs.end());
+  Succs.erase(std::unique(Succs.begin(), Succs.end()), Succs.end());
+  
+  // Remove the basic block, including all of the instructions contained in it.
+  LPM->deleteSimpleAnalysisValue(BB, L);  
+  BB->eraseFromParent();
+  // Remove successor blocks here that are not dead, so that we know we only
+  // have dead blocks in this list.  Nondead blocks have a way of becoming dead,
+  // then getting removed before we revisit them, which is badness.
+  //
+  for (unsigned i = 0; i != Succs.size(); ++i)
+    if (pred_begin(Succs[i]) != pred_end(Succs[i])) {
+      // One exception is loop headers.  If this block was the preheader for a
+      // loop, then we DO want to visit the loop so the loop gets deleted.
+      // We know that if the successor is a loop header, that this loop had to
+      // be the preheader: the case where this was the latch block was handled
+      // above and headers can only have two predecessors.
+      if (!LI->isLoopHeader(Succs[i])) {
+        Succs.erase(Succs.begin()+i);
+        --i;
+      }
+    }
+  
+  for (unsigned i = 0, e = Succs.size(); i != e; ++i)
+    RemoveBlockIfDead(Succs[i], Worklist, L);
+}
+
+/// RemoveLoopFromHierarchy - We have discovered that the specified loop has
+/// become unwrapped, either because the backedge was deleted, or because the
+/// edge into the header was removed.  If the edge into the header from the
+/// latch block was removed, the loop is unwrapped but subloops are still alive,
+/// so they just reparent loops.  If the loops are actually dead, they will be
+/// removed later.
+void LoopUnswitch::RemoveLoopFromHierarchy(Loop *L) {
+  LPM->deleteLoopFromQueue(L);
+  RemoveLoopFromWorklist(L);
+}
+
+// RewriteLoopBodyWithConditionConstant - We know either that the value LIC has
+// the value specified by Val in the specified loop, or we know it does NOT have
+// that value.  Rewrite any uses of LIC or of properties correlated to it.
+void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
+                                                        Constant *Val,
+                                                        bool IsEqual) {
+  assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?");
+  
+  // FIXME: Support correlated properties, like:
+  //  for (...)
+  //    if (li1 < li2)
+  //      ...
+  //    if (li1 > li2)
+  //      ...
+  
+  // FOLD boolean conditions (X|LIC), (X&LIC).  Fold conditional branches,
+  // selects, switches.
+  std::vector<User*> Users(LIC->use_begin(), LIC->use_end());
+  std::vector<Instruction*> Worklist;
+
+  // If we know that LIC == Val, or that LIC == NotVal, just replace uses of LIC
+  // in the loop with the appropriate one directly.
+  if (IsEqual || (isa<ConstantInt>(Val) && Val->getType() == Type::Int1Ty)) {
+    Value *Replacement;
+    if (IsEqual)
+      Replacement = Val;
+    else
+      Replacement = ConstantInt::get(Type::Int1Ty, 
+                                     !cast<ConstantInt>(Val)->getZExtValue());
+    
+    for (unsigned i = 0, e = Users.size(); i != e; ++i)
+      if (Instruction *U = cast<Instruction>(Users[i])) {
+        if (!L->contains(U->getParent()))
+          continue;
+        U->replaceUsesOfWith(LIC, Replacement);
+        Worklist.push_back(U);
+      }
+  } else {
+    // Otherwise, we don't know the precise value of LIC, but we do know that it
+    // is certainly NOT "Val".  As such, simplify any uses in the loop that we
+    // can.  This case occurs when we unswitch switch statements.
+    for (unsigned i = 0, e = Users.size(); i != e; ++i)
+      if (Instruction *U = cast<Instruction>(Users[i])) {
+        if (!L->contains(U->getParent()))
+          continue;
+
+        Worklist.push_back(U);
+
+        // If we know that LIC is not Val, use this info to simplify code.
+        if (SwitchInst *SI = dyn_cast<SwitchInst>(U)) {
+          for (unsigned i = 1, e = SI->getNumCases(); i != e; ++i) {
+            if (SI->getCaseValue(i) == Val) {
+              // Found a dead case value.  Don't remove PHI nodes in the 
+              // successor if they become single-entry, those PHI nodes may
+              // be in the Users list.
+              
+              // FIXME: This is a hack.  We need to keep the successor around
+              // and hooked up so as to preserve the loop structure, because
+              // trying to update it is complicated.  So instead we preserve the
+              // loop structure and put the block on an dead code path.
+              
+              BasicBlock *SISucc = SI->getSuccessor(i);
+              BasicBlock* Old = SI->getParent();
+              BasicBlock* Split = SplitBlock(Old, SI, this);
+              
+              Instruction* OldTerm = Old->getTerminator();
+              BranchInst::Create(Split, SISucc,
+                                 ConstantInt::getTrue(), OldTerm);
+
+              LPM->deleteSimpleAnalysisValue(Old->getTerminator(), L);
+              Old->getTerminator()->eraseFromParent();
+              
+              PHINode *PN;
+              for (BasicBlock::iterator II = SISucc->begin();
+                   (PN = dyn_cast<PHINode>(II)); ++II) {
+                Value *InVal = PN->removeIncomingValue(Split, false);
+                PN->addIncoming(InVal, Old);
+              }
+
+              SI->removeCase(i);
+              break;
+            }
+          }
+        }
+        
+        // TODO: We could do other simplifications, for example, turning 
+        // LIC == Val -> false.
+      }
+  }
+  
+  SimplifyCode(Worklist, L);
+}
+
+/// SimplifyCode - Okay, now that we have simplified some instructions in the 
+/// loop, walk over it and constant prop, dce, and fold control flow where
+/// possible.  Note that this is effectively a very simple loop-structure-aware
+/// optimizer.  During processing of this loop, L could very well be deleted, so
+/// it must not be used.
+///
+/// FIXME: When the loop optimizer is more mature, separate this out to a new
+/// pass.
+///
+void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.back();
+    Worklist.pop_back();
+    
+    // Simple constant folding.
+    if (Constant *C = ConstantFoldInstruction(I)) {
+      ReplaceUsesOfWith(I, C, Worklist, L, LPM);
+      continue;
+    }
+    
+    // Simple DCE.
+    if (isInstructionTriviallyDead(I)) {
+      DOUT << "Remove dead instruction '" << *I;
+      
+      // Add uses to the worklist, which may be dead now.
+      for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+        if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i)))
+          Worklist.push_back(Use);
+      LPM->deleteSimpleAnalysisValue(I, L);
+      RemoveFromWorklist(I, Worklist);
+      I->eraseFromParent();
+      ++NumSimplify;
+      continue;
+    }
+    
+    // Special case hacks that appear commonly in unswitched code.
+    switch (I->getOpcode()) {
+    case Instruction::Select:
+      if (ConstantInt *CB = dyn_cast<ConstantInt>(I->getOperand(0))) {
+        ReplaceUsesOfWith(I, I->getOperand(!CB->getZExtValue()+1), Worklist, L,
+                          LPM);
+        continue;
+      }
+      break;
+    case Instruction::And:
+      if (isa<ConstantInt>(I->getOperand(0)) && 
+          I->getOperand(0)->getType() == Type::Int1Ty)   // constant -> RHS
+        cast<BinaryOperator>(I)->swapOperands();
+      if (ConstantInt *CB = dyn_cast<ConstantInt>(I->getOperand(1))) 
+        if (CB->getType() == Type::Int1Ty) {
+          if (CB->isOne())      // X & 1 -> X
+            ReplaceUsesOfWith(I, I->getOperand(0), Worklist, L, LPM);
+          else                  // X & 0 -> 0
+            ReplaceUsesOfWith(I, I->getOperand(1), Worklist, L, LPM);
+          continue;
+        }
+      break;
+    case Instruction::Or:
+      if (isa<ConstantInt>(I->getOperand(0)) &&
+          I->getOperand(0)->getType() == Type::Int1Ty)   // constant -> RHS
+        cast<BinaryOperator>(I)->swapOperands();
+      if (ConstantInt *CB = dyn_cast<ConstantInt>(I->getOperand(1)))
+        if (CB->getType() == Type::Int1Ty) {
+          if (CB->isOne())   // X | 1 -> 1
+            ReplaceUsesOfWith(I, I->getOperand(1), Worklist, L, LPM);
+          else                  // X | 0 -> X
+            ReplaceUsesOfWith(I, I->getOperand(0), Worklist, L, LPM);
+          continue;
+        }
+      break;
+    case Instruction::Br: {
+      BranchInst *BI = cast<BranchInst>(I);
+      if (BI->isUnconditional()) {
+        // If BI's parent is the only pred of the successor, fold the two blocks
+        // together.
+        BasicBlock *Pred = BI->getParent();
+        BasicBlock *Succ = BI->getSuccessor(0);
+        BasicBlock *SinglePred = Succ->getSinglePredecessor();
+        if (!SinglePred) continue;  // Nothing to do.
+        assert(SinglePred == Pred && "CFG broken");
+
+        DOUT << "Merging blocks: " << Pred->getName() << " <- " 
+             << Succ->getName() << "\n";
+        
+        // Resolve any single entry PHI nodes in Succ.
+        while (PHINode *PN = dyn_cast<PHINode>(Succ->begin()))
+          ReplaceUsesOfWith(PN, PN->getIncomingValue(0), Worklist, L, LPM);
+        
+        // Move all of the successor contents from Succ to Pred.
+        Pred->getInstList().splice(BI, Succ->getInstList(), Succ->begin(),
+                                   Succ->end());
+        LPM->deleteSimpleAnalysisValue(BI, L);
+        BI->eraseFromParent();
+        RemoveFromWorklist(BI, Worklist);
+        
+        // If Succ has any successors with PHI nodes, update them to have
+        // entries coming from Pred instead of Succ.
+        Succ->replaceAllUsesWith(Pred);
+        
+        // Remove Succ from the loop tree.
+        LI->removeBlock(Succ);
+        LPM->deleteSimpleAnalysisValue(Succ, L);
+        Succ->eraseFromParent();
+        ++NumSimplify;
+      } else if (ConstantInt *CB = dyn_cast<ConstantInt>(BI->getCondition())){
+        // Conditional branch.  Turn it into an unconditional branch, then
+        // remove dead blocks.
+        break;  // FIXME: Enable.
+
+        DOUT << "Folded branch: " << *BI;
+        BasicBlock *DeadSucc = BI->getSuccessor(CB->getZExtValue());
+        BasicBlock *LiveSucc = BI->getSuccessor(!CB->getZExtValue());
+        DeadSucc->removePredecessor(BI->getParent(), true);
+        Worklist.push_back(BranchInst::Create(LiveSucc, BI));
+        LPM->deleteSimpleAnalysisValue(BI, L);
+        BI->eraseFromParent();
+        RemoveFromWorklist(BI, Worklist);
+        ++NumSimplify;
+
+        RemoveBlockIfDead(DeadSucc, Worklist, L);
+      }
+      break;
+    }
+    }
+  }
+}
diff --git a/lib/Transforms/Scalar/Makefile b/lib/Transforms/Scalar/Makefile
new file mode 100644
index 0000000..cc42fd0
--- /dev/null
+++ b/lib/Transforms/Scalar/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Transforms/Scalar/Makefile ----------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMScalarOpts
+BUILD_ARCHIVE = 1
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
new file mode 100644
index 0000000..5cf0518
--- /dev/null
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -0,0 +1,741 @@
+//===- MemCpyOptimizer.cpp - Optimize use of memcpy and friends -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs various transformations related to eliminating memcpy
+// calls, or transforming sets of stores into memset's.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "memcpyopt"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Instructions.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Target/TargetData.h"
+#include <list>
+using namespace llvm;
+
+STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
+STATISTIC(NumMemSetInfer, "Number of memsets inferred");
+
+/// isBytewiseValue - If the specified value can be set by repeating the same
+/// byte in memory, return the i8 value that it is represented with.  This is
+/// true for all i8 values obviously, but is also true for i32 0, i32 -1,
+/// i16 0xF0F0, double 0.0 etc.  If the value can't be handled with a repeated
+/// byte store (e.g. i16 0x1234), return null.
+static Value *isBytewiseValue(Value *V) {
+  // All byte-wide stores are splatable, even of arbitrary variables.
+  if (V->getType() == Type::Int8Ty) return V;
+  
+  // Constant float and double values can be handled as integer values if the
+  // corresponding integer value is "byteable".  An important case is 0.0. 
+  if (ConstantFP *CFP = dyn_cast<ConstantFP>(V)) {
+    if (CFP->getType() == Type::FloatTy)
+      V = ConstantExpr::getBitCast(CFP, Type::Int32Ty);
+    if (CFP->getType() == Type::DoubleTy)
+      V = ConstantExpr::getBitCast(CFP, Type::Int64Ty);
+    // Don't handle long double formats, which have strange constraints.
+  }
+  
+  // We can handle constant integers that are power of two in size and a 
+  // multiple of 8 bits.
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+    unsigned Width = CI->getBitWidth();
+    if (isPowerOf2_32(Width) && Width > 8) {
+      // We can handle this value if the recursive binary decomposition is the
+      // same at all levels.
+      APInt Val = CI->getValue();
+      APInt Val2;
+      while (Val.getBitWidth() != 8) {
+        unsigned NextWidth = Val.getBitWidth()/2;
+        Val2  = Val.lshr(NextWidth);
+        Val2.trunc(Val.getBitWidth()/2);
+        Val.trunc(Val.getBitWidth()/2);
+
+        // If the top/bottom halves aren't the same, reject it.
+        if (Val != Val2)
+          return 0;
+      }
+      return ConstantInt::get(Val);
+    }
+  }
+  
+  // Conceptually, we could handle things like:
+  //   %a = zext i8 %X to i16
+  //   %b = shl i16 %a, 8
+  //   %c = or i16 %a, %b
+  // but until there is an example that actually needs this, it doesn't seem
+  // worth worrying about.
+  return 0;
+}
+
+static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx,
+                                  bool &VariableIdxFound, TargetData &TD) {
+  // Skip over the first indices.
+  gep_type_iterator GTI = gep_type_begin(GEP);
+  for (unsigned i = 1; i != Idx; ++i, ++GTI)
+    /*skip along*/;
+  
+  // Compute the offset implied by the rest of the indices.
+  int64_t Offset = 0;
+  for (unsigned i = Idx, e = GEP->getNumOperands(); i != e; ++i, ++GTI) {
+    ConstantInt *OpC = dyn_cast<ConstantInt>(GEP->getOperand(i));
+    if (OpC == 0)
+      return VariableIdxFound = true;
+    if (OpC->isZero()) continue;  // No offset.
+
+    // Handle struct indices, which add their field offset to the pointer.
+    if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+      Offset += TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue());
+      continue;
+    }
+    
+    // Otherwise, we have a sequential type like an array or vector.  Multiply
+    // the index by the ElementSize.
+    uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType());
+    Offset += Size*OpC->getSExtValue();
+  }
+
+  return Offset;
+}
+
+/// IsPointerOffset - Return true if Ptr1 is provably equal to Ptr2 plus a
+/// constant offset, and return that constant offset.  For example, Ptr1 might
+/// be &A[42], and Ptr2 might be &A[40].  In this case offset would be -8.
+static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
+                            TargetData &TD) {
+  // Right now we handle the case when Ptr1/Ptr2 are both GEPs with an identical
+  // base.  After that base, they may have some number of common (and
+  // potentially variable) indices.  After that they handle some constant
+  // offset, which determines their offset from each other.  At this point, we
+  // handle no other case.
+  GetElementPtrInst *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
+  GetElementPtrInst *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
+  if (!GEP1 || !GEP2 || GEP1->getOperand(0) != GEP2->getOperand(0))
+    return false;
+  
+  // Skip any common indices and track the GEP types.
+  unsigned Idx = 1;
+  for (; Idx != GEP1->getNumOperands() && Idx != GEP2->getNumOperands(); ++Idx)
+    if (GEP1->getOperand(Idx) != GEP2->getOperand(Idx))
+      break;
+
+  bool VariableIdxFound = false;
+  int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, TD);
+  int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, TD);
+  if (VariableIdxFound) return false;
+  
+  Offset = Offset2-Offset1;
+  return true;
+}
+
+
+/// MemsetRange - Represents a range of memset'd bytes with the ByteVal value.
+/// This allows us to analyze stores like:
+///   store 0 -> P+1
+///   store 0 -> P+0
+///   store 0 -> P+3
+///   store 0 -> P+2
+/// which sometimes happens with stores to arrays of structs etc.  When we see
+/// the first store, we make a range [1, 2).  The second store extends the range
+/// to [0, 2).  The third makes a new range [2, 3).  The fourth store joins the
+/// two ranges into [0, 3) which is memset'able.
+namespace {
+struct MemsetRange {
+  // Start/End - A semi range that describes the span that this range covers.
+  // The range is closed at the start and open at the end: [Start, End).  
+  int64_t Start, End;
+
+  /// StartPtr - The getelementptr instruction that points to the start of the
+  /// range.
+  Value *StartPtr;
+  
+  /// Alignment - The known alignment of the first store.
+  unsigned Alignment;
+  
+  /// TheStores - The actual stores that make up this range.
+  SmallVector<StoreInst*, 16> TheStores;
+  
+  bool isProfitableToUseMemset(const TargetData &TD) const;
+
+};
+} // end anon namespace
+
+bool MemsetRange::isProfitableToUseMemset(const TargetData &TD) const {
+  // If we found more than 8 stores to merge or 64 bytes, use memset.
+  if (TheStores.size() >= 8 || End-Start >= 64) return true;
+  
+  // Assume that the code generator is capable of merging pairs of stores
+  // together if it wants to.
+  if (TheStores.size() <= 2) return false;
+  
+  // If we have fewer than 8 stores, it can still be worthwhile to do this.
+  // For example, merging 4 i8 stores into an i32 store is useful almost always.
+  // However, merging 2 32-bit stores isn't useful on a 32-bit architecture (the
+  // memset will be split into 2 32-bit stores anyway) and doing so can
+  // pessimize the llvm optimizer.
+  //
+  // Since we don't have perfect knowledge here, make some assumptions: assume
+  // the maximum GPR width is the same size as the pointer size and assume that
+  // this width can be stored.  If so, check to see whether we will end up
+  // actually reducing the number of stores used.
+  unsigned Bytes = unsigned(End-Start);
+  unsigned NumPointerStores = Bytes/TD.getPointerSize();
+  
+  // Assume the remaining bytes if any are done a byte at a time.
+  unsigned NumByteStores = Bytes - NumPointerStores*TD.getPointerSize();
+  
+  // If we will reduce the # stores (according to this heuristic), do the
+  // transformation.  This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
+  // etc.
+  return TheStores.size() > NumPointerStores+NumByteStores;
+}    
+
+
+namespace {
+class MemsetRanges {
+  /// Ranges - A sorted list of the memset ranges.  We use std::list here
+  /// because each element is relatively large and expensive to copy.
+  std::list<MemsetRange> Ranges;
+  typedef std::list<MemsetRange>::iterator range_iterator;
+  TargetData &TD;
+public:
+  MemsetRanges(TargetData &td) : TD(td) {}
+  
+  typedef std::list<MemsetRange>::const_iterator const_iterator;
+  const_iterator begin() const { return Ranges.begin(); }
+  const_iterator end() const { return Ranges.end(); }
+  bool empty() const { return Ranges.empty(); }
+  
+  void addStore(int64_t OffsetFromFirst, StoreInst *SI);
+};
+  
+} // end anon namespace
+
+
+/// addStore - Add a new store to the MemsetRanges data structure.  This adds a
+/// new range for the specified store at the specified offset, merging into
+/// existing ranges as appropriate.
+void MemsetRanges::addStore(int64_t Start, StoreInst *SI) {
+  int64_t End = Start+TD.getTypeStoreSize(SI->getOperand(0)->getType());
+  
+  // Do a linear search of the ranges to see if this can be joined and/or to
+  // find the insertion point in the list.  We keep the ranges sorted for
+  // simplicity here.  This is a linear search of a linked list, which is ugly,
+  // however the number of ranges is limited, so this won't get crazy slow.
+  range_iterator I = Ranges.begin(), E = Ranges.end();
+  
+  while (I != E && Start > I->End)
+    ++I;
+  
+  // We now know that I == E, in which case we didn't find anything to merge
+  // with, or that Start <= I->End.  If End < I->Start or I == E, then we need
+  // to insert a new range.  Handle this now.
+  if (I == E || End < I->Start) {
+    MemsetRange &R = *Ranges.insert(I, MemsetRange());
+    R.Start        = Start;
+    R.End          = End;
+    R.StartPtr     = SI->getPointerOperand();
+    R.Alignment    = SI->getAlignment();
+    R.TheStores.push_back(SI);
+    return;
+  }
+
+  // This store overlaps with I, add it.
+  I->TheStores.push_back(SI);
+  
+  // At this point, we may have an interval that completely contains our store.
+  // If so, just add it to the interval and return.
+  if (I->Start <= Start && I->End >= End)
+    return;
+  
+  // Now we know that Start <= I->End and End >= I->Start so the range overlaps
+  // but is not entirely contained within the range.
+  
+  // See if the range extends the start of the range.  In this case, it couldn't
+  // possibly cause it to join the prior range, because otherwise we would have
+  // stopped on *it*.
+  if (Start < I->Start) {
+    I->Start = Start;
+    I->StartPtr = SI->getPointerOperand();
+  }
+    
+  // Now we know that Start <= I->End and Start >= I->Start (so the startpoint
+  // is in or right at the end of I), and that End >= I->Start.  Extend I out to
+  // End.
+  if (End > I->End) {
+    I->End = End;
+    range_iterator NextI = I;
+    while (++NextI != E && End >= NextI->Start) {
+      // Merge the range in.
+      I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end());
+      if (NextI->End > I->End)
+        I->End = NextI->End;
+      Ranges.erase(NextI);
+      NextI = I;
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                         MemCpyOpt Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+  class VISIBILITY_HIDDEN MemCpyOpt : public FunctionPass {
+    bool runOnFunction(Function &F);
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    MemCpyOpt() : FunctionPass(&ID) {}
+
+  private:
+    // This transformation requires dominator postdominator info
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<DominatorTree>();
+      AU.addRequired<MemoryDependenceAnalysis>();
+      AU.addRequired<AliasAnalysis>();
+      AU.addRequired<TargetData>();
+      AU.addPreserved<AliasAnalysis>();
+      AU.addPreserved<MemoryDependenceAnalysis>();
+      AU.addPreserved<TargetData>();
+    }
+  
+    // Helper fuctions
+    bool processStore(StoreInst *SI, BasicBlock::iterator& BBI);
+    bool processMemCpy(MemCpyInst* M);
+    bool performCallSlotOptzn(MemCpyInst* cpy, CallInst* C);
+    bool iterateOnFunction(Function &F);
+  };
+  
+  char MemCpyOpt::ID = 0;
+}
+
+// createMemCpyOptPass - The public interface to this file...
+FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); }
+
+static RegisterPass<MemCpyOpt> X("memcpyopt",
+                                 "MemCpy Optimization");
+
+
+
+/// processStore - When GVN is scanning forward over instructions, we look for
+/// some other patterns to fold away.  In particular, this looks for stores to
+/// neighboring locations of memory.  If it sees enough consequtive ones
+/// (currently 4) it attempts to merge them together into a memcpy/memset.
+bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator& BBI) {
+  if (SI->isVolatile()) return false;
+  
+  // There are two cases that are interesting for this code to handle: memcpy
+  // and memset.  Right now we only handle memset.
+  
+  // Ensure that the value being stored is something that can be memset'able a
+  // byte at a time like "0" or "-1" or any width, as well as things like
+  // 0xA0A0A0A0 and 0.0.
+  Value *ByteVal = isBytewiseValue(SI->getOperand(0));
+  if (!ByteVal)
+    return false;
+
+  TargetData &TD = getAnalysis<TargetData>();
+  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+
+  // Okay, so we now have a single store that can be splatable.  Scan to find
+  // all subsequent stores of the same value to offset from the same pointer.
+  // Join these together into ranges, so we can decide whether contiguous blocks
+  // are stored.
+  MemsetRanges Ranges(TD);
+  
+  Value *StartPtr = SI->getPointerOperand();
+  
+  BasicBlock::iterator BI = SI;
+  for (++BI; !isa<TerminatorInst>(BI); ++BI) {
+    if (isa<CallInst>(BI) || isa<InvokeInst>(BI)) { 
+      // If the call is readnone, ignore it, otherwise bail out.  We don't even
+      // allow readonly here because we don't want something like:
+      // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A).
+      if (AA.getModRefBehavior(CallSite::get(BI)) ==
+            AliasAnalysis::DoesNotAccessMemory)
+        continue;
+      
+      // TODO: If this is a memset, try to join it in.
+      
+      break;
+    } else if (isa<VAArgInst>(BI) || isa<LoadInst>(BI))
+      break;
+
+    // If this is a non-store instruction it is fine, ignore it.
+    StoreInst *NextStore = dyn_cast<StoreInst>(BI);
+    if (NextStore == 0) continue;
+    
+    // If this is a store, see if we can merge it in.
+    if (NextStore->isVolatile()) break;
+    
+    // Check to see if this stored value is of the same byte-splattable value.
+    if (ByteVal != isBytewiseValue(NextStore->getOperand(0)))
+      break;
+
+    // Check to see if this store is to a constant offset from the start ptr.
+    int64_t Offset;
+    if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset, TD))
+      break;
+
+    Ranges.addStore(Offset, NextStore);
+  }
+
+  // If we have no ranges, then we just had a single store with nothing that
+  // could be merged in.  This is a very common case of course.
+  if (Ranges.empty())
+    return false;
+  
+  // If we had at least one store that could be merged in, add the starting
+  // store as well.  We try to avoid this unless there is at least something
+  // interesting as a small compile-time optimization.
+  Ranges.addStore(0, SI);
+
+  
+  Function *MemSetF = 0;
+  
+  // Now that we have full information about ranges, loop over the ranges and
+  // emit memset's for anything big enough to be worthwhile.
+  bool MadeChange = false;
+  for (MemsetRanges::const_iterator I = Ranges.begin(), E = Ranges.end();
+       I != E; ++I) {
+    const MemsetRange &Range = *I;
+
+    if (Range.TheStores.size() == 1) continue;
+    
+    // If it is profitable to lower this range to memset, do so now.
+    if (!Range.isProfitableToUseMemset(TD))
+      continue;
+    
+    // Otherwise, we do want to transform this!  Create a new memset.  We put
+    // the memset right before the first instruction that isn't part of this
+    // memset block.  This ensure that the memset is dominated by any addressing
+    // instruction needed by the start of the block.
+    BasicBlock::iterator InsertPt = BI;
+  
+    if (MemSetF == 0) {
+      const Type *Tys[] = {Type::Int64Ty};
+      MemSetF = Intrinsic::getDeclaration(SI->getParent()->getParent()
+                                          ->getParent(), Intrinsic::memset,
+                                          Tys, 1);
+   }
+    
+    // Get the starting pointer of the block.
+    StartPtr = Range.StartPtr;
+  
+    // Cast the start ptr to be i8* as memset requires.
+    const Type *i8Ptr = PointerType::getUnqual(Type::Int8Ty);
+    if (StartPtr->getType() != i8Ptr)
+      StartPtr = new BitCastInst(StartPtr, i8Ptr, StartPtr->getNameStart(),
+                                 InsertPt);
+  
+    Value *Ops[] = {
+      StartPtr, ByteVal,   // Start, value
+      ConstantInt::get(Type::Int64Ty, Range.End-Range.Start),  // size
+      ConstantInt::get(Type::Int32Ty, Range.Alignment)   // align
+    };
+    Value *C = CallInst::Create(MemSetF, Ops, Ops+4, "", InsertPt);
+    DEBUG(cerr << "Replace stores:\n";
+          for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i)
+            cerr << *Range.TheStores[i];
+          cerr << "With: " << *C); C=C;
+  
+    // Don't invalidate the iterator
+    BBI = BI;
+  
+    // Zap all the stores.
+    for (SmallVector<StoreInst*, 16>::const_iterator SI = Range.TheStores.begin(),
+         SE = Range.TheStores.end(); SI != SE; ++SI)
+      (*SI)->eraseFromParent();
+    ++NumMemSetInfer;
+    MadeChange = true;
+  }
+  
+  return MadeChange;
+}
+
+
+/// performCallSlotOptzn - takes a memcpy and a call that it depends on,
+/// and checks for the possibility of a call slot optimization by having
+/// the call write its result directly into the destination of the memcpy.
+bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) {
+  // The general transformation to keep in mind is
+  //
+  //   call @func(..., src, ...)
+  //   memcpy(dest, src, ...)
+  //
+  // ->
+  //
+  //   memcpy(dest, src, ...)
+  //   call @func(..., dest, ...)
+  //
+  // Since moving the memcpy is technically awkward, we additionally check that
+  // src only holds uninitialized values at the moment of the call, meaning that
+  // the memcpy can be discarded rather than moved.
+
+  // Deliberately get the source and destination with bitcasts stripped away,
+  // because we'll need to do type comparisons based on the underlying type.
+  Value* cpyDest = cpy->getDest();
+  Value* cpySrc = cpy->getSource();
+  CallSite CS = CallSite::get(C);
+
+  // We need to be able to reason about the size of the memcpy, so we require
+  // that it be a constant.
+  ConstantInt* cpyLength = dyn_cast<ConstantInt>(cpy->getLength());
+  if (!cpyLength)
+    return false;
+
+  // Require that src be an alloca.  This simplifies the reasoning considerably.
+  AllocaInst* srcAlloca = dyn_cast<AllocaInst>(cpySrc);
+  if (!srcAlloca)
+    return false;
+
+  // Check that all of src is copied to dest.
+  TargetData& TD = getAnalysis<TargetData>();
+
+  ConstantInt* srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize());
+  if (!srcArraySize)
+    return false;
+
+  uint64_t srcSize = TD.getTypeAllocSize(srcAlloca->getAllocatedType()) *
+    srcArraySize->getZExtValue();
+
+  if (cpyLength->getZExtValue() < srcSize)
+    return false;
+
+  // Check that accessing the first srcSize bytes of dest will not cause a
+  // trap.  Otherwise the transform is invalid since it might cause a trap
+  // to occur earlier than it otherwise would.
+  if (AllocaInst* A = dyn_cast<AllocaInst>(cpyDest)) {
+    // The destination is an alloca.  Check it is larger than srcSize.
+    ConstantInt* destArraySize = dyn_cast<ConstantInt>(A->getArraySize());
+    if (!destArraySize)
+      return false;
+
+    uint64_t destSize = TD.getTypeAllocSize(A->getAllocatedType()) *
+      destArraySize->getZExtValue();
+
+    if (destSize < srcSize)
+      return false;
+  } else if (Argument* A = dyn_cast<Argument>(cpyDest)) {
+    // If the destination is an sret parameter then only accesses that are
+    // outside of the returned struct type can trap.
+    if (!A->hasStructRetAttr())
+      return false;
+
+    const Type* StructTy = cast<PointerType>(A->getType())->getElementType();
+    uint64_t destSize = TD.getTypeAllocSize(StructTy);
+
+    if (destSize < srcSize)
+      return false;
+  } else {
+    return false;
+  }
+
+  // Check that src is not accessed except via the call and the memcpy.  This
+  // guarantees that it holds only undefined values when passed in (so the final
+  // memcpy can be dropped), that it is not read or written between the call and
+  // the memcpy, and that writing beyond the end of it is undefined.
+  SmallVector<User*, 8> srcUseList(srcAlloca->use_begin(),
+                                   srcAlloca->use_end());
+  while (!srcUseList.empty()) {
+    User* UI = srcUseList.back();
+    srcUseList.pop_back();
+
+    if (isa<BitCastInst>(UI)) {
+      for (User::use_iterator I = UI->use_begin(), E = UI->use_end();
+           I != E; ++I)
+        srcUseList.push_back(*I);
+    } else if (GetElementPtrInst* G = dyn_cast<GetElementPtrInst>(UI)) {
+      if (G->hasAllZeroIndices())
+        for (User::use_iterator I = UI->use_begin(), E = UI->use_end();
+             I != E; ++I)
+          srcUseList.push_back(*I);
+      else
+        return false;
+    } else if (UI != C && UI != cpy) {
+      return false;
+    }
+  }
+
+  // Since we're changing the parameter to the callsite, we need to make sure
+  // that what would be the new parameter dominates the callsite.
+  DominatorTree& DT = getAnalysis<DominatorTree>();
+  if (Instruction* cpyDestInst = dyn_cast<Instruction>(cpyDest))
+    if (!DT.dominates(cpyDestInst, C))
+      return false;
+
+  // In addition to knowing that the call does not access src in some
+  // unexpected manner, for example via a global, which we deduce from
+  // the use analysis, we also need to know that it does not sneakily
+  // access dest.  We rely on AA to figure this out for us.
+  AliasAnalysis& AA = getAnalysis<AliasAnalysis>();
+  if (AA.getModRefInfo(C, cpy->getRawDest(), srcSize) !=
+      AliasAnalysis::NoModRef)
+    return false;
+
+  // All the checks have passed, so do the transformation.
+  bool changedArgument = false;
+  for (unsigned i = 0; i < CS.arg_size(); ++i)
+    if (CS.getArgument(i)->stripPointerCasts() == cpySrc) {
+      if (cpySrc->getType() != cpyDest->getType())
+        cpyDest = CastInst::CreatePointerCast(cpyDest, cpySrc->getType(),
+                                              cpyDest->getName(), C);
+      changedArgument = true;
+      if (CS.getArgument(i)->getType() != cpyDest->getType())
+        CS.setArgument(i, CastInst::CreatePointerCast(cpyDest, 
+                       CS.getArgument(i)->getType(), cpyDest->getName(), C));
+      else
+        CS.setArgument(i, cpyDest);
+    }
+
+  if (!changedArgument)
+    return false;
+
+  // Drop any cached information about the call, because we may have changed
+  // its dependence information by changing its parameter.
+  MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
+  MD.removeInstruction(C);
+
+  // Remove the memcpy
+  MD.removeInstruction(cpy);
+  cpy->eraseFromParent();
+  NumMemCpyInstr++;
+
+  return true;
+}
+
+/// processMemCpy - perform simplication of memcpy's.  If we have memcpy A which
+/// copies X to Y, and memcpy B which copies Y to Z, then we can rewrite B to be
+/// a memcpy from X to Z (or potentially a memmove, depending on circumstances).
+///  This allows later passes to remove the first memcpy altogether.
+bool MemCpyOpt::processMemCpy(MemCpyInst* M) {
+  MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
+
+  // The are two possible optimizations we can do for memcpy:
+  //   a) memcpy-memcpy xform which exposes redundance for DSE
+  //   b) call-memcpy xform for return slot optimization
+  MemDepResult dep = MD.getDependency(M);
+  if (!dep.isClobber())
+    return false;
+  if (!isa<MemCpyInst>(dep.getInst())) {
+    if (CallInst* C = dyn_cast<CallInst>(dep.getInst()))
+      return performCallSlotOptzn(M, C);
+    return false;
+  }
+  
+  MemCpyInst* MDep = cast<MemCpyInst>(dep.getInst());
+  
+  // We can only transforms memcpy's where the dest of one is the source of the
+  // other
+  if (M->getSource() != MDep->getDest())
+    return false;
+  
+  // Second, the length of the memcpy's must be the same, or the preceeding one
+  // must be larger than the following one.
+  ConstantInt* C1 = dyn_cast<ConstantInt>(MDep->getLength());
+  ConstantInt* C2 = dyn_cast<ConstantInt>(M->getLength());
+  if (!C1 || !C2)
+    return false;
+  
+  uint64_t DepSize = C1->getValue().getZExtValue();
+  uint64_t CpySize = C2->getValue().getZExtValue();
+  
+  if (DepSize < CpySize)
+    return false;
+  
+  // Finally, we have to make sure that the dest of the second does not
+  // alias the source of the first
+  AliasAnalysis& AA = getAnalysis<AliasAnalysis>();
+  if (AA.alias(M->getRawDest(), CpySize, MDep->getRawSource(), DepSize) !=
+      AliasAnalysis::NoAlias)
+    return false;
+  else if (AA.alias(M->getRawDest(), CpySize, M->getRawSource(), CpySize) !=
+           AliasAnalysis::NoAlias)
+    return false;
+  else if (AA.alias(MDep->getRawDest(), DepSize, MDep->getRawSource(), DepSize)
+           != AliasAnalysis::NoAlias)
+    return false;
+  
+  // If all checks passed, then we can transform these memcpy's
+  const Type *Tys[1];
+  Tys[0] = M->getLength()->getType();
+  Function* MemCpyFun = Intrinsic::getDeclaration(
+                                 M->getParent()->getParent()->getParent(),
+                                 M->getIntrinsicID(), Tys, 1);
+    
+  Value *Args[4] = {
+    M->getRawDest(), MDep->getRawSource(), M->getLength(), M->getAlignmentCst()
+  };
+  
+  CallInst* C = CallInst::Create(MemCpyFun, Args, Args+4, "", M);
+  
+  
+  // If C and M don't interfere, then this is a valid transformation.  If they
+  // did, this would mean that the two sources overlap, which would be bad.
+  if (MD.getDependency(C) == dep) {
+    MD.removeInstruction(M);
+    M->eraseFromParent();
+    NumMemCpyInstr++;
+    return true;
+  }
+  
+  // Otherwise, there was no point in doing this, so we remove the call we
+  // inserted and act like nothing happened.
+  MD.removeInstruction(C);
+  C->eraseFromParent();
+  return false;
+}
+
+// MemCpyOpt::runOnFunction - This is the main transformation entry point for a
+// function.
+//
+bool MemCpyOpt::runOnFunction(Function& F) {
+  
+  bool changed = false;
+  bool shouldContinue = true;
+  
+  while (shouldContinue) {
+    shouldContinue = iterateOnFunction(F);
+    changed |= shouldContinue;
+  }
+  
+  return changed;
+}
+
+
+// MemCpyOpt::iterateOnFunction - Executes one iteration of GVN
+bool MemCpyOpt::iterateOnFunction(Function &F) {
+  bool changed_function = false;
+
+  // Walk all instruction in the function
+  for (Function::iterator BB = F.begin(), BBE = F.end(); BB != BBE; ++BB) {
+    for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
+         BI != BE;) {
+      // Avoid invalidating the iterator
+      Instruction* I = BI++;
+      
+      if (StoreInst *SI = dyn_cast<StoreInst>(I))
+        changed_function |= processStore(SI, BI);
+      else if (MemCpyInst* M = dyn_cast<MemCpyInst>(I)) {
+        changed_function |= processMemCpy(M);
+      }
+    }
+  }
+  
+  return changed_function;
+}
diff --git a/lib/Transforms/Scalar/PredicateSimplifier.cpp b/lib/Transforms/Scalar/PredicateSimplifier.cpp
new file mode 100644
index 0000000..a7e4d6e
--- /dev/null
+++ b/lib/Transforms/Scalar/PredicateSimplifier.cpp
@@ -0,0 +1,2725 @@
+//===-- PredicateSimplifier.cpp - Path Sensitive Simplifier ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Path-sensitive optimizer. In a branch where x == y, replace uses of
+// x with y. Permits further optimization, such as the elimination of
+// the unreachable call:
+//
+// void test(int *p, int *q)
+// {
+//   if (p != q)
+//     return;
+// 
+//   if (*p != *q)
+//     foo(); // unreachable
+// }
+//
+//===----------------------------------------------------------------------===//
+//
+// The InequalityGraph focusses on four properties; equals, not equals,
+// less-than and less-than-or-equals-to. The greater-than forms are also held
+// just to allow walking from a lesser node to a greater one. These properties
+// are stored in a lattice; LE can become LT or EQ, NE can become LT or GT.
+//
+// These relationships define a graph between values of the same type. Each
+// Value is stored in a map table that retrieves the associated Node. This
+// is how EQ relationships are stored; the map contains pointers from equal
+// Value to the same node. The node contains a most canonical Value* form
+// and the list of known relationships with other nodes.
+//
+// If two nodes are known to be inequal, then they will contain pointers to
+// each other with an "NE" relationship. If node getNode(%x) is less than
+// getNode(%y), then the %x node will contain <%y, GT> and %y will contain
+// <%x, LT>. This allows us to tie nodes together into a graph like this:
+//
+//   %a < %b < %c < %d
+//
+// with four nodes representing the properties. The InequalityGraph provides
+// querying with "isRelatedBy" and mutators "addEquality" and "addInequality".
+// To find a relationship, we start with one of the nodes any binary search
+// through its list to find where the relationships with the second node start.
+// Then we iterate through those to find the first relationship that dominates
+// our context node.
+//
+// To create these properties, we wait until a branch or switch instruction
+// implies that a particular value is true (or false). The VRPSolver is
+// responsible for analyzing the variable and seeing what new inferences
+// can be made from each property. For example:
+//
+//   %P = icmp ne i32* %ptr, null
+//   %a = and i1 %P, %Q
+//   br i1 %a label %cond_true, label %cond_false
+//
+// For the true branch, the VRPSolver will start with %a EQ true and look at
+// the definition of %a and find that it can infer that %P and %Q are both
+// true. From %P being true, it can infer that %ptr NE null. For the false
+// branch it can't infer anything from the "and" instruction.
+//
+// Besides branches, we can also infer properties from instruction that may
+// have undefined behaviour in certain cases. For example, the dividend of
+// a division may never be zero. After the division instruction, we may assume
+// that the dividend is not equal to zero.
+//
+//===----------------------------------------------------------------------===//
+//
+// The ValueRanges class stores the known integer bounds of a Value. When we
+// encounter i8 %a u< %b, the ValueRanges stores that %a = [1, 255] and
+// %b = [0, 254].
+//
+// It never stores an empty range, because that means that the code is
+// unreachable. It never stores a single-element range since that's an equality
+// relationship and better stored in the InequalityGraph, nor an empty range
+// since that is better stored in UnreachableBlocks.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "predsimplify"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ConstantRange.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/InstVisitor.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <deque>
+#include <stack>
+using namespace llvm;
+
+STATISTIC(NumVarsReplaced, "Number of argument substitutions");
+STATISTIC(NumInstruction , "Number of instructions removed");
+STATISTIC(NumSimple      , "Number of simple replacements");
+STATISTIC(NumBlocks      , "Number of blocks marked unreachable");
+STATISTIC(NumSnuggle     , "Number of comparisons snuggled");
+
+namespace {
+  class DomTreeDFS {
+  public:
+    class Node {
+      friend class DomTreeDFS;
+    public:
+      typedef std::vector<Node *>::iterator       iterator;
+      typedef std::vector<Node *>::const_iterator const_iterator;
+
+      unsigned getDFSNumIn()  const { return DFSin;  }
+      unsigned getDFSNumOut() const { return DFSout; }
+
+      BasicBlock *getBlock() const { return BB; }
+
+      iterator begin() { return Children.begin(); }
+      iterator end()   { return Children.end();   }
+
+      const_iterator begin() const { return Children.begin(); }
+      const_iterator end()   const { return Children.end();   }
+
+      bool dominates(const Node *N) const {
+        return DFSin <= N->DFSin && DFSout >= N->DFSout;
+      }
+
+      bool DominatedBy(const Node *N) const {
+        return N->dominates(this);
+      }
+
+      /// Sorts by the number of descendants. With this, you can iterate
+      /// through a sorted list and the first matching entry is the most
+      /// specific match for your basic block. The order provided is stable;
+      /// DomTreeDFS::Nodes with the same number of descendants are sorted by
+      /// DFS in number.
+      bool operator<(const Node &N) const {
+        unsigned   spread =   DFSout -   DFSin;
+        unsigned N_spread = N.DFSout - N.DFSin;
+        if (spread == N_spread) return DFSin < N.DFSin;
+        return spread < N_spread;
+      }
+      bool operator>(const Node &N) const { return N < *this; }
+
+    private:
+      unsigned DFSin, DFSout;
+      BasicBlock *BB;
+
+      std::vector<Node *> Children;
+    };
+
+    // XXX: this may be slow. Instead of using "new" for each node, consider
+    // putting them in a vector to keep them contiguous.
+    explicit DomTreeDFS(DominatorTree *DT) {
+      std::stack<std::pair<Node *, DomTreeNode *> > S;
+
+      Entry = new Node;
+      Entry->BB = DT->getRootNode()->getBlock();
+      S.push(std::make_pair(Entry, DT->getRootNode()));
+
+      NodeMap[Entry->BB] = Entry;
+
+      while (!S.empty()) {
+        std::pair<Node *, DomTreeNode *> &Pair = S.top();
+        Node *N = Pair.first;
+        DomTreeNode *DTNode = Pair.second;
+        S.pop();
+
+        for (DomTreeNode::iterator I = DTNode->begin(), E = DTNode->end();
+             I != E; ++I) {
+          Node *NewNode = new Node;
+          NewNode->BB = (*I)->getBlock();
+          N->Children.push_back(NewNode);
+          S.push(std::make_pair(NewNode, *I));
+
+          NodeMap[NewNode->BB] = NewNode;
+        }
+      }
+
+      renumber();
+
+#ifndef NDEBUG
+      DEBUG(dump());
+#endif
+    }
+
+#ifndef NDEBUG
+    virtual
+#endif
+    ~DomTreeDFS() {
+      std::stack<Node *> S;
+
+      S.push(Entry);
+      while (!S.empty()) {
+        Node *N = S.top(); S.pop();
+
+        for (Node::iterator I = N->begin(), E = N->end(); I != E; ++I)
+          S.push(*I);
+
+        delete N;
+      }
+    }
+
+    /// getRootNode - This returns the entry node for the CFG of the function.
+    Node *getRootNode() const { return Entry; }
+
+    /// getNodeForBlock - return the node for the specified basic block.
+    Node *getNodeForBlock(BasicBlock *BB) const {
+      if (!NodeMap.count(BB)) return 0;
+      return const_cast<DomTreeDFS*>(this)->NodeMap[BB];
+    }
+
+    /// dominates - returns true if the basic block for I1 dominates that of
+    /// the basic block for I2. If the instructions belong to the same basic
+    /// block, the instruction first instruction sequentially in the block is
+    /// considered dominating.
+    bool dominates(Instruction *I1, Instruction *I2) {
+      BasicBlock *BB1 = I1->getParent(),
+                 *BB2 = I2->getParent();
+      if (BB1 == BB2) {
+        if (isa<TerminatorInst>(I1)) return false;
+        if (isa<TerminatorInst>(I2)) return true;
+        if ( isa<PHINode>(I1) && !isa<PHINode>(I2)) return true;
+        if (!isa<PHINode>(I1) &&  isa<PHINode>(I2)) return false;
+
+        for (BasicBlock::const_iterator I = BB2->begin(), E = BB2->end();
+             I != E; ++I) {
+          if (&*I == I1) return true;
+          else if (&*I == I2) return false;
+        }
+        assert(!"Instructions not found in parent BasicBlock?");
+      } else {
+        Node *Node1 = getNodeForBlock(BB1),
+             *Node2 = getNodeForBlock(BB2);
+        return Node1 && Node2 && Node1->dominates(Node2);
+      }
+      return false; // Not reached
+    }
+
+  private:
+    /// renumber - calculates the depth first search numberings and applies
+    /// them onto the nodes.
+    void renumber() {
+      std::stack<std::pair<Node *, Node::iterator> > S;
+      unsigned n = 0;
+
+      Entry->DFSin = ++n;
+      S.push(std::make_pair(Entry, Entry->begin()));
+
+      while (!S.empty()) {
+        std::pair<Node *, Node::iterator> &Pair = S.top();
+        Node *N = Pair.first;
+        Node::iterator &I = Pair.second;
+
+        if (I == N->end()) {
+          N->DFSout = ++n;
+          S.pop();
+        } else {
+          Node *Next = *I++;
+          Next->DFSin = ++n;
+          S.push(std::make_pair(Next, Next->begin()));
+        }
+      }
+    }
+
+#ifndef NDEBUG
+    virtual void dump() const {
+      dump(*cerr.stream());
+    }
+
+    void dump(std::ostream &os) const {
+      os << "Predicate simplifier DomTreeDFS: \n";
+      dump(Entry, 0, os);
+      os << "\n\n";
+    }
+
+    void dump(Node *N, int depth, std::ostream &os) const {
+      ++depth;
+      for (int i = 0; i < depth; ++i) { os << " "; }
+      os << "[" << depth << "] ";
+
+      os << N->getBlock()->getName() << " (" << N->getDFSNumIn()
+         << ", " << N->getDFSNumOut() << ")\n";
+
+      for (Node::iterator I = N->begin(), E = N->end(); I != E; ++I)
+        dump(*I, depth, os);
+    }
+#endif
+
+    Node *Entry;
+    std::map<BasicBlock *, Node *> NodeMap;
+  };
+
+  // SLT SGT ULT UGT EQ
+  //   0   1   0   1  0 -- GT                  10
+  //   0   1   0   1  1 -- GE                  11
+  //   0   1   1   0  0 -- SGTULT              12
+  //   0   1   1   0  1 -- SGEULE              13
+  //   0   1   1   1  0 -- SGT                 14
+  //   0   1   1   1  1 -- SGE                 15
+  //   1   0   0   1  0 -- SLTUGT              18
+  //   1   0   0   1  1 -- SLEUGE              19
+  //   1   0   1   0  0 -- LT                  20
+  //   1   0   1   0  1 -- LE                  21
+  //   1   0   1   1  0 -- SLT                 22
+  //   1   0   1   1  1 -- SLE                 23
+  //   1   1   0   1  0 -- UGT                 26
+  //   1   1   0   1  1 -- UGE                 27
+  //   1   1   1   0  0 -- ULT                 28
+  //   1   1   1   0  1 -- ULE                 29
+  //   1   1   1   1  0 -- NE                  30
+  enum LatticeBits {
+    EQ_BIT = 1, UGT_BIT = 2, ULT_BIT = 4, SGT_BIT = 8, SLT_BIT = 16
+  };
+  enum LatticeVal {
+    GT = SGT_BIT | UGT_BIT,
+    GE = GT | EQ_BIT,
+    LT = SLT_BIT | ULT_BIT,
+    LE = LT | EQ_BIT,
+    NE = SLT_BIT | SGT_BIT | ULT_BIT | UGT_BIT,
+    SGTULT = SGT_BIT | ULT_BIT,
+    SGEULE = SGTULT | EQ_BIT,
+    SLTUGT = SLT_BIT | UGT_BIT,
+    SLEUGE = SLTUGT | EQ_BIT,
+    ULT = SLT_BIT | SGT_BIT | ULT_BIT,
+    UGT = SLT_BIT | SGT_BIT | UGT_BIT,
+    SLT = SLT_BIT | ULT_BIT | UGT_BIT,
+    SGT = SGT_BIT | ULT_BIT | UGT_BIT,
+    SLE = SLT | EQ_BIT,
+    SGE = SGT | EQ_BIT,
+    ULE = ULT | EQ_BIT,
+    UGE = UGT | EQ_BIT
+  };
+
+#ifndef NDEBUG
+  /// validPredicate - determines whether a given value is actually a lattice
+  /// value. Only used in assertions or debugging.
+  static bool validPredicate(LatticeVal LV) {
+    switch (LV) {
+      case GT: case GE: case LT: case LE: case NE:
+      case SGTULT: case SGT: case SGEULE:
+      case SLTUGT: case SLT: case SLEUGE:
+      case ULT: case UGT:
+      case SLE: case SGE: case ULE: case UGE:
+        return true;
+      default:
+        return false;
+    }
+  }
+#endif
+
+  /// reversePredicate - reverse the direction of the inequality
+  static LatticeVal reversePredicate(LatticeVal LV) {
+    unsigned reverse = LV ^ (SLT_BIT|SGT_BIT|ULT_BIT|UGT_BIT); //preserve EQ_BIT
+
+    if ((reverse & (SLT_BIT|SGT_BIT)) == 0)
+      reverse |= (SLT_BIT|SGT_BIT);
+
+    if ((reverse & (ULT_BIT|UGT_BIT)) == 0)
+      reverse |= (ULT_BIT|UGT_BIT);
+
+    LatticeVal Rev = static_cast<LatticeVal>(reverse);
+    assert(validPredicate(Rev) && "Failed reversing predicate.");
+    return Rev;
+  }
+
+  /// ValueNumbering stores the scope-specific value numbers for a given Value.
+  class VISIBILITY_HIDDEN ValueNumbering {
+
+    /// VNPair is a tuple of {Value, index number, DomTreeDFS::Node}. It
+    /// includes the comparison operators necessary to allow you to store it
+    /// in a sorted vector.
+    class VISIBILITY_HIDDEN VNPair {
+    public:
+      Value *V;
+      unsigned index;
+      DomTreeDFS::Node *Subtree;
+
+      VNPair(Value *V, unsigned index, DomTreeDFS::Node *Subtree)
+        : V(V), index(index), Subtree(Subtree) {}
+
+      bool operator==(const VNPair &RHS) const {
+        return V == RHS.V && Subtree == RHS.Subtree;
+      }
+
+      bool operator<(const VNPair &RHS) const {
+        if (V != RHS.V) return V < RHS.V;
+        return *Subtree < *RHS.Subtree;
+      }
+
+      bool operator<(Value *RHS) const {
+        return V < RHS;
+      }
+
+      bool operator>(Value *RHS) const {
+        return V > RHS;
+      }
+
+      friend bool operator<(Value *RHS, const VNPair &pair) {
+        return pair.operator>(RHS);
+      }
+    };
+
+    typedef std::vector<VNPair> VNMapType;
+    VNMapType VNMap;
+
+    /// The canonical choice for value number at index.
+    std::vector<Value *> Values;
+
+    DomTreeDFS *DTDFS;
+
+  public:
+#ifndef NDEBUG
+    virtual ~ValueNumbering() {}
+    virtual void dump() {
+      dump(*cerr.stream());
+    }
+
+    void dump(std::ostream &os) {
+      for (unsigned i = 1; i <= Values.size(); ++i) {
+        os << i << " = ";
+        WriteAsOperand(os, Values[i-1]);
+        os << " {";
+        for (unsigned j = 0; j < VNMap.size(); ++j) {
+          if (VNMap[j].index == i) {
+            WriteAsOperand(os, VNMap[j].V);
+            os << " (" << VNMap[j].Subtree->getDFSNumIn() << ")  ";
+          }
+        }
+        os << "}\n";
+      }
+    }
+#endif
+
+    /// compare - returns true if V1 is a better canonical value than V2.
+    bool compare(Value *V1, Value *V2) const {
+      if (isa<Constant>(V1))
+        return !isa<Constant>(V2);
+      else if (isa<Constant>(V2))
+        return false;
+      else if (isa<Argument>(V1))
+        return !isa<Argument>(V2);
+      else if (isa<Argument>(V2))
+        return false;
+
+      Instruction *I1 = dyn_cast<Instruction>(V1);
+      Instruction *I2 = dyn_cast<Instruction>(V2);
+
+      if (!I1 || !I2)
+        return V1->getNumUses() < V2->getNumUses();
+
+      return DTDFS->dominates(I1, I2);
+    }
+
+    ValueNumbering(DomTreeDFS *DTDFS) : DTDFS(DTDFS) {}
+
+    /// valueNumber - finds the value number for V under the Subtree. If
+    /// there is no value number, returns zero.
+    unsigned valueNumber(Value *V, DomTreeDFS::Node *Subtree) {
+      if (!(isa<Constant>(V) || isa<Argument>(V) || isa<Instruction>(V))
+          || V->getType() == Type::VoidTy) return 0;
+
+      VNMapType::iterator E = VNMap.end();
+      VNPair pair(V, 0, Subtree);
+      VNMapType::iterator I = std::lower_bound(VNMap.begin(), E, pair);
+      while (I != E && I->V == V) {
+        if (I->Subtree->dominates(Subtree))
+          return I->index;
+        ++I;
+      }
+      return 0;
+    }
+
+    /// getOrInsertVN - always returns a value number, creating it if necessary.
+    unsigned getOrInsertVN(Value *V, DomTreeDFS::Node *Subtree) {
+      if (unsigned n = valueNumber(V, Subtree))
+        return n;
+      else
+        return newVN(V);
+    }
+
+    /// newVN - creates a new value number. Value V must not already have a
+    /// value number assigned.
+    unsigned newVN(Value *V) {
+      assert((isa<Constant>(V) || isa<Argument>(V) || isa<Instruction>(V)) &&
+             "Bad Value for value numbering.");
+      assert(V->getType() != Type::VoidTy && "Won't value number a void value");
+
+      Values.push_back(V);
+
+      VNPair pair = VNPair(V, Values.size(), DTDFS->getRootNode());
+      VNMapType::iterator I = std::lower_bound(VNMap.begin(), VNMap.end(), pair);
+      assert((I == VNMap.end() || value(I->index) != V) &&
+             "Attempt to create a duplicate value number.");
+      VNMap.insert(I, pair);
+
+      return Values.size();
+    }
+
+    /// value - returns the Value associated with a value number.
+    Value *value(unsigned index) const {
+      assert(index != 0 && "Zero index is reserved for not found.");
+      assert(index <= Values.size() && "Index out of range.");
+      return Values[index-1];
+    }
+
+    /// canonicalize - return a Value that is equal to V under Subtree.
+    Value *canonicalize(Value *V, DomTreeDFS::Node *Subtree) {
+      if (isa<Constant>(V)) return V;
+
+      if (unsigned n = valueNumber(V, Subtree))
+        return value(n);
+      else
+        return V;
+    }
+
+    /// addEquality - adds that value V belongs to the set of equivalent
+    /// values defined by value number n under Subtree.
+    void addEquality(unsigned n, Value *V, DomTreeDFS::Node *Subtree) {
+      assert(canonicalize(value(n), Subtree) == value(n) &&
+             "Node's 'canonical' choice isn't best within this subtree.");
+
+      // Suppose that we are given "%x -> node #1 (%y)". The problem is that
+      // we may already have "%z -> node #2 (%x)" somewhere above us in the
+      // graph. We need to find those edges and add "%z -> node #1 (%y)"
+      // to keep the lookups canonical.
+
+      std::vector<Value *> ToRepoint(1, V);
+
+      if (unsigned Conflict = valueNumber(V, Subtree)) {
+        for (VNMapType::iterator I = VNMap.begin(), E = VNMap.end();
+             I != E; ++I) {
+          if (I->index == Conflict && I->Subtree->dominates(Subtree))
+            ToRepoint.push_back(I->V);
+        }
+      }
+
+      for (std::vector<Value *>::iterator VI = ToRepoint.begin(),
+           VE = ToRepoint.end(); VI != VE; ++VI) {
+        Value *V = *VI;
+
+        VNPair pair(V, n, Subtree);
+        VNMapType::iterator B = VNMap.begin(), E = VNMap.end();
+        VNMapType::iterator I = std::lower_bound(B, E, pair);
+        if (I != E && I->V == V && I->Subtree == Subtree)
+          I->index = n; // Update best choice
+        else
+          VNMap.insert(I, pair); // New Value
+
+        // XXX: we currently don't have to worry about updating values with
+        // more specific Subtrees, but we will need to for PHI node support.
+
+#ifndef NDEBUG
+        Value *V_n = value(n);
+        if (isa<Constant>(V) && isa<Constant>(V_n)) {
+          assert(V == V_n && "Constant equals different constant?");
+        }
+#endif
+      }
+    }
+
+    /// remove - removes all references to value V.
+    void remove(Value *V) {
+      VNMapType::iterator B = VNMap.begin(), E = VNMap.end();
+      VNPair pair(V, 0, DTDFS->getRootNode());
+      VNMapType::iterator J = std::upper_bound(B, E, pair);
+      VNMapType::iterator I = J;
+
+      while (I != B && (I == E || I->V == V)) --I;
+
+      VNMap.erase(I, J);
+    }
+  };
+
+  /// The InequalityGraph stores the relationships between values.
+  /// Each Value in the graph is assigned to a Node. Nodes are pointer
+  /// comparable for equality. The caller is expected to maintain the logical
+  /// consistency of the system.
+  ///
+  /// The InequalityGraph class may invalidate Node*s after any mutator call.
+  /// @brief The InequalityGraph stores the relationships between values.
+  class VISIBILITY_HIDDEN InequalityGraph {
+    ValueNumbering &VN;
+    DomTreeDFS::Node *TreeRoot;
+
+    InequalityGraph();                  // DO NOT IMPLEMENT
+    InequalityGraph(InequalityGraph &); // DO NOT IMPLEMENT
+  public:
+    InequalityGraph(ValueNumbering &VN, DomTreeDFS::Node *TreeRoot)
+      : VN(VN), TreeRoot(TreeRoot) {}
+
+    class Node;
+
+    /// An Edge is contained inside a Node making one end of the edge implicit
+    /// and contains a pointer to the other end. The edge contains a lattice
+    /// value specifying the relationship and an DomTreeDFS::Node specifying
+    /// the root in the dominator tree to which this edge applies.
+    class VISIBILITY_HIDDEN Edge {
+    public:
+      Edge(unsigned T, LatticeVal V, DomTreeDFS::Node *ST)
+        : To(T), LV(V), Subtree(ST) {}
+
+      unsigned To;
+      LatticeVal LV;
+      DomTreeDFS::Node *Subtree;
+
+      bool operator<(const Edge &edge) const {
+        if (To != edge.To) return To < edge.To;
+        return *Subtree < *edge.Subtree;
+      }
+
+      bool operator<(unsigned to) const {
+        return To < to;
+      }
+
+      bool operator>(unsigned to) const {
+        return To > to;
+      }
+
+      friend bool operator<(unsigned to, const Edge &edge) {
+        return edge.operator>(to);
+      }
+    };
+
+    /// A single node in the InequalityGraph. This stores the canonical Value
+    /// for the node, as well as the relationships with the neighbours.
+    ///
+    /// @brief A single node in the InequalityGraph.
+    class VISIBILITY_HIDDEN Node {
+      friend class InequalityGraph;
+
+      typedef SmallVector<Edge, 4> RelationsType;
+      RelationsType Relations;
+
+      // TODO: can this idea improve performance?
+      //friend class std::vector<Node>;
+      //Node(Node &N) { RelationsType.swap(N.RelationsType); }
+
+    public:
+      typedef RelationsType::iterator       iterator;
+      typedef RelationsType::const_iterator const_iterator;
+
+#ifndef NDEBUG
+      virtual ~Node() {}
+      virtual void dump() const {
+        dump(*cerr.stream());
+      }
+    private:
+      void dump(std::ostream &os) const {
+        static const std::string names[32] =
+          { "000000", "000001", "000002", "000003", "000004", "000005",
+            "000006", "000007", "000008", "000009", "     >", "    >=",
+            "  s>u<", "s>=u<=", "    s>", "   s>=", "000016", "000017",
+            "  s<u>", "s<=u>=", "     <", "    <=", "    s<", "   s<=",
+            "000024", "000025", "    u>", "   u>=", "    u<", "   u<=",
+            "    !=", "000031" };
+        for (Node::const_iterator NI = begin(), NE = end(); NI != NE; ++NI) {
+          os << names[NI->LV] << " " << NI->To
+             << " (" << NI->Subtree->getDFSNumIn() << "), ";
+        }
+      }
+    public:
+#endif
+
+      iterator begin()             { return Relations.begin(); }
+      iterator end()               { return Relations.end();   }
+      const_iterator begin() const { return Relations.begin(); }
+      const_iterator end()   const { return Relations.end();   }
+
+      iterator find(unsigned n, DomTreeDFS::Node *Subtree) {
+        iterator E = end();
+        for (iterator I = std::lower_bound(begin(), E, n);
+             I != E && I->To == n; ++I) {
+          if (Subtree->DominatedBy(I->Subtree))
+            return I;
+        }
+        return E;
+      }
+
+      const_iterator find(unsigned n, DomTreeDFS::Node *Subtree) const {
+        const_iterator E = end();
+        for (const_iterator I = std::lower_bound(begin(), E, n);
+             I != E && I->To == n; ++I) {
+          if (Subtree->DominatedBy(I->Subtree))
+            return I;
+        }
+        return E;
+      }
+
+      /// update - updates the lattice value for a given node, creating a new
+      /// entry if one doesn't exist. The new lattice value must not be
+      /// inconsistent with any previously existing value.
+      void update(unsigned n, LatticeVal R, DomTreeDFS::Node *Subtree) {
+        assert(validPredicate(R) && "Invalid predicate.");
+
+        Edge edge(n, R, Subtree);
+        iterator B = begin(), E = end();
+        iterator I = std::lower_bound(B, E, edge);
+
+        iterator J = I;
+        while (J != E && J->To == n) {
+          if (Subtree->DominatedBy(J->Subtree))
+            break;
+          ++J;
+        }
+
+        if (J != E && J->To == n) {
+          edge.LV = static_cast<LatticeVal>(J->LV & R);
+          assert(validPredicate(edge.LV) && "Invalid union of lattice values.");
+
+          if (edge.LV == J->LV)
+            return; // This update adds nothing new.
+        }
+
+        if (I != B) {
+          // We also have to tighten any edge beneath our update.
+          for (iterator K = I - 1; K->To == n; --K) {
+            if (K->Subtree->DominatedBy(Subtree)) {
+              LatticeVal LV = static_cast<LatticeVal>(K->LV & edge.LV);
+              assert(validPredicate(LV) && "Invalid union of lattice values");
+              K->LV = LV;
+            }
+            if (K == B) break;
+          }
+        }
+
+        // Insert new edge at Subtree if it isn't already there.
+        if (I == E || I->To != n || Subtree != I->Subtree)
+          Relations.insert(I, edge);
+      }
+    };
+
+  private:
+
+    std::vector<Node> Nodes;
+
+  public:
+    /// node - returns the node object at a given value number. The pointer
+    /// returned may be invalidated on the next call to node().
+    Node *node(unsigned index) {
+      assert(VN.value(index)); // This triggers the necessary checks.
+      if (Nodes.size() < index) Nodes.resize(index);
+      return &Nodes[index-1];
+    }
+
+    /// isRelatedBy - true iff n1 op n2
+    bool isRelatedBy(unsigned n1, unsigned n2, DomTreeDFS::Node *Subtree,
+                     LatticeVal LV) {
+      if (n1 == n2) return LV & EQ_BIT;
+
+      Node *N1 = node(n1);
+      Node::iterator I = N1->find(n2, Subtree), E = N1->end();
+      if (I != E) return (I->LV & LV) == I->LV;
+
+      return false;
+    }
+
+    // The add* methods assume that your input is logically valid and may 
+    // assertion-fail or infinitely loop if you attempt a contradiction.
+
+    /// addInequality - Sets n1 op n2.
+    /// It is also an error to call this on an inequality that is already true.
+    void addInequality(unsigned n1, unsigned n2, DomTreeDFS::Node *Subtree,
+                       LatticeVal LV1) {
+      assert(n1 != n2 && "A node can't be inequal to itself.");
+
+      if (LV1 != NE)
+        assert(!isRelatedBy(n1, n2, Subtree, reversePredicate(LV1)) &&
+               "Contradictory inequality.");
+
+      // Suppose we're adding %n1 < %n2. Find all the %a < %n1 and
+      // add %a < %n2 too. This keeps the graph fully connected.
+      if (LV1 != NE) {
+        // Break up the relationship into signed and unsigned comparison parts.
+        // If the signed parts of %a op1 %n1 match that of %n1 op2 %n2, and
+        // op1 and op2 aren't NE, then add %a op3 %n2. The new relationship
+        // should have the EQ_BIT iff it's set for both op1 and op2.
+
+        unsigned LV1_s = LV1 & (SLT_BIT|SGT_BIT);
+        unsigned LV1_u = LV1 & (ULT_BIT|UGT_BIT);
+
+        for (Node::iterator I = node(n1)->begin(), E = node(n1)->end(); I != E; ++I) {
+          if (I->LV != NE && I->To != n2) {
+
+            DomTreeDFS::Node *Local_Subtree = NULL;
+            if (Subtree->DominatedBy(I->Subtree))
+              Local_Subtree = Subtree;
+            else if (I->Subtree->DominatedBy(Subtree))
+              Local_Subtree = I->Subtree;
+
+            if (Local_Subtree) {
+              unsigned new_relationship = 0;
+              LatticeVal ILV = reversePredicate(I->LV);
+              unsigned ILV_s = ILV & (SLT_BIT|SGT_BIT);
+              unsigned ILV_u = ILV & (ULT_BIT|UGT_BIT);
+
+              if (LV1_s != (SLT_BIT|SGT_BIT) && ILV_s == LV1_s)
+                new_relationship |= ILV_s;
+              if (LV1_u != (ULT_BIT|UGT_BIT) && ILV_u == LV1_u)
+                new_relationship |= ILV_u;
+
+              if (new_relationship) {
+                if ((new_relationship & (SLT_BIT|SGT_BIT)) == 0)
+                  new_relationship |= (SLT_BIT|SGT_BIT);
+                if ((new_relationship & (ULT_BIT|UGT_BIT)) == 0)
+                  new_relationship |= (ULT_BIT|UGT_BIT);
+                if ((LV1 & EQ_BIT) && (ILV & EQ_BIT))
+                  new_relationship |= EQ_BIT;
+
+                LatticeVal NewLV = static_cast<LatticeVal>(new_relationship);
+
+                node(I->To)->update(n2, NewLV, Local_Subtree);
+                node(n2)->update(I->To, reversePredicate(NewLV), Local_Subtree);
+              }
+            }
+          }
+        }
+
+        for (Node::iterator I = node(n2)->begin(), E = node(n2)->end(); I != E; ++I) {
+          if (I->LV != NE && I->To != n1) {
+            DomTreeDFS::Node *Local_Subtree = NULL;
+            if (Subtree->DominatedBy(I->Subtree))
+              Local_Subtree = Subtree;
+            else if (I->Subtree->DominatedBy(Subtree))
+              Local_Subtree = I->Subtree;
+
+            if (Local_Subtree) {
+              unsigned new_relationship = 0;
+              unsigned ILV_s = I->LV & (SLT_BIT|SGT_BIT);
+              unsigned ILV_u = I->LV & (ULT_BIT|UGT_BIT);
+
+              if (LV1_s != (SLT_BIT|SGT_BIT) && ILV_s == LV1_s)
+                new_relationship |= ILV_s;
+
+              if (LV1_u != (ULT_BIT|UGT_BIT) && ILV_u == LV1_u)
+                new_relationship |= ILV_u;
+
+              if (new_relationship) {
+                if ((new_relationship & (SLT_BIT|SGT_BIT)) == 0)
+                  new_relationship |= (SLT_BIT|SGT_BIT);
+                if ((new_relationship & (ULT_BIT|UGT_BIT)) == 0)
+                  new_relationship |= (ULT_BIT|UGT_BIT);
+                if ((LV1 & EQ_BIT) && (I->LV & EQ_BIT))
+                  new_relationship |= EQ_BIT;
+
+                LatticeVal NewLV = static_cast<LatticeVal>(new_relationship);
+
+                node(n1)->update(I->To, NewLV, Local_Subtree);
+                node(I->To)->update(n1, reversePredicate(NewLV), Local_Subtree);
+              }
+            }
+          }
+        }
+      }
+
+      node(n1)->update(n2, LV1, Subtree);
+      node(n2)->update(n1, reversePredicate(LV1), Subtree);
+    }
+
+    /// remove - removes a node from the graph by removing all references to
+    /// and from it.
+    void remove(unsigned n) {
+      Node *N = node(n);
+      for (Node::iterator NI = N->begin(), NE = N->end(); NI != NE; ++NI) {
+        Node::iterator Iter = node(NI->To)->find(n, TreeRoot);
+        do {
+          node(NI->To)->Relations.erase(Iter);
+          Iter = node(NI->To)->find(n, TreeRoot);
+        } while (Iter != node(NI->To)->end());
+      }
+      N->Relations.clear();
+    }
+
+#ifndef NDEBUG
+    virtual ~InequalityGraph() {}
+    virtual void dump() {
+      dump(*cerr.stream());
+    }
+
+    void dump(std::ostream &os) {
+      for (unsigned i = 1; i <= Nodes.size(); ++i) {
+        os << i << " = {";
+        node(i)->dump(os);
+        os << "}\n";
+      }
+    }
+#endif
+  };
+
+  class VRPSolver;
+
+  /// ValueRanges tracks the known integer ranges and anti-ranges of the nodes
+  /// in the InequalityGraph.
+  class VISIBILITY_HIDDEN ValueRanges {
+    ValueNumbering &VN;
+    TargetData *TD;
+
+    class VISIBILITY_HIDDEN ScopedRange {
+      typedef std::vector<std::pair<DomTreeDFS::Node *, ConstantRange> >
+              RangeListType;
+      RangeListType RangeList;
+
+      static bool swo(const std::pair<DomTreeDFS::Node *, ConstantRange> &LHS,
+                      const std::pair<DomTreeDFS::Node *, ConstantRange> &RHS) {
+        return *LHS.first < *RHS.first;
+      }
+
+    public:
+#ifndef NDEBUG
+      virtual ~ScopedRange() {}
+      virtual void dump() const {
+        dump(*cerr.stream());
+      }
+
+      void dump(std::ostream &os) const {
+        os << "{";
+        for (const_iterator I = begin(), E = end(); I != E; ++I) {
+          os << &I->second << " (" << I->first->getDFSNumIn() << "), ";
+        }
+        os << "}";
+      }
+#endif
+
+      typedef RangeListType::iterator       iterator;
+      typedef RangeListType::const_iterator const_iterator;
+
+      iterator begin() { return RangeList.begin(); }
+      iterator end()   { return RangeList.end(); }
+      const_iterator begin() const { return RangeList.begin(); }
+      const_iterator end()   const { return RangeList.end(); }
+
+      iterator find(DomTreeDFS::Node *Subtree) {
+        static ConstantRange empty(1, false);
+        iterator E = end();
+        iterator I = std::lower_bound(begin(), E,
+                                      std::make_pair(Subtree, empty), swo);
+
+        while (I != E && !I->first->dominates(Subtree)) ++I;
+        return I;
+      }
+
+      const_iterator find(DomTreeDFS::Node *Subtree) const {
+        static const ConstantRange empty(1, false);
+        const_iterator E = end();
+        const_iterator I = std::lower_bound(begin(), E,
+                                            std::make_pair(Subtree, empty), swo);
+
+        while (I != E && !I->first->dominates(Subtree)) ++I;
+        return I;
+      }
+
+      void update(const ConstantRange &CR, DomTreeDFS::Node *Subtree) {
+        assert(!CR.isEmptySet() && "Empty ConstantRange.");
+        assert(!CR.isSingleElement() && "Refusing to store single element.");
+
+        static ConstantRange empty(1, false);
+        iterator E = end();
+        iterator I =
+            std::lower_bound(begin(), E, std::make_pair(Subtree, empty), swo);
+
+        if (I != end() && I->first == Subtree) {
+          ConstantRange CR2 = I->second.maximalIntersectWith(CR);
+          assert(!CR2.isEmptySet() && !CR2.isSingleElement() &&
+                 "Invalid union of ranges.");
+          I->second = CR2;
+        } else
+          RangeList.insert(I, std::make_pair(Subtree, CR));
+      }
+    };
+
+    std::vector<ScopedRange> Ranges;
+
+    void update(unsigned n, const ConstantRange &CR, DomTreeDFS::Node *Subtree){
+      if (CR.isFullSet()) return;
+      if (Ranges.size() < n) Ranges.resize(n);
+      Ranges[n-1].update(CR, Subtree);
+    }
+
+    /// create - Creates a ConstantRange that matches the given LatticeVal
+    /// relation with a given integer.
+    ConstantRange create(LatticeVal LV, const ConstantRange &CR) {
+      assert(!CR.isEmptySet() && "Can't deal with empty set.");
+
+      if (LV == NE)
+        return makeConstantRange(ICmpInst::ICMP_NE, CR);
+
+      unsigned LV_s = LV & (SGT_BIT|SLT_BIT);
+      unsigned LV_u = LV & (UGT_BIT|ULT_BIT);
+      bool hasEQ = LV & EQ_BIT;
+
+      ConstantRange Range(CR.getBitWidth());
+
+      if (LV_s == SGT_BIT) {
+        Range = Range.maximalIntersectWith(makeConstantRange(
+                    hasEQ ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_SGT, CR));
+      } else if (LV_s == SLT_BIT) {
+        Range = Range.maximalIntersectWith(makeConstantRange(
+                    hasEQ ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_SLT, CR));
+      }
+
+      if (LV_u == UGT_BIT) {
+        Range = Range.maximalIntersectWith(makeConstantRange(
+                    hasEQ ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_UGT, CR));
+      } else if (LV_u == ULT_BIT) {
+        Range = Range.maximalIntersectWith(makeConstantRange(
+                    hasEQ ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT, CR));
+      }
+
+      return Range;
+    }
+
+    /// makeConstantRange - Creates a ConstantRange representing the set of all
+    /// value that match the ICmpInst::Predicate with any of the values in CR.
+    ConstantRange makeConstantRange(ICmpInst::Predicate ICmpOpcode,
+                                    const ConstantRange &CR) {
+      uint32_t W = CR.getBitWidth();
+      switch (ICmpOpcode) {
+        default: assert(!"Invalid ICmp opcode to makeConstantRange()");
+        case ICmpInst::ICMP_EQ:
+          return ConstantRange(CR.getLower(), CR.getUpper());
+        case ICmpInst::ICMP_NE:
+          if (CR.isSingleElement())
+            return ConstantRange(CR.getUpper(), CR.getLower());
+          return ConstantRange(W);
+        case ICmpInst::ICMP_ULT:
+          return ConstantRange(APInt::getMinValue(W), CR.getUnsignedMax());
+        case ICmpInst::ICMP_SLT:
+          return ConstantRange(APInt::getSignedMinValue(W), CR.getSignedMax());
+        case ICmpInst::ICMP_ULE: {
+          APInt UMax(CR.getUnsignedMax());
+          if (UMax.isMaxValue())
+            return ConstantRange(W);
+          return ConstantRange(APInt::getMinValue(W), UMax + 1);
+        }
+        case ICmpInst::ICMP_SLE: {
+          APInt SMax(CR.getSignedMax());
+          if (SMax.isMaxSignedValue() || (SMax+1).isMaxSignedValue())
+            return ConstantRange(W);
+          return ConstantRange(APInt::getSignedMinValue(W), SMax + 1);
+        }
+        case ICmpInst::ICMP_UGT:
+          return ConstantRange(CR.getUnsignedMin() + 1, APInt::getNullValue(W));
+        case ICmpInst::ICMP_SGT:
+          return ConstantRange(CR.getSignedMin() + 1,
+                               APInt::getSignedMinValue(W));
+        case ICmpInst::ICMP_UGE: {
+          APInt UMin(CR.getUnsignedMin());
+          if (UMin.isMinValue())
+            return ConstantRange(W);
+          return ConstantRange(UMin, APInt::getNullValue(W));
+        }
+        case ICmpInst::ICMP_SGE: {
+          APInt SMin(CR.getSignedMin());
+          if (SMin.isMinSignedValue())
+            return ConstantRange(W);
+          return ConstantRange(SMin, APInt::getSignedMinValue(W));
+        }
+      }
+    }
+
+#ifndef NDEBUG
+    bool isCanonical(Value *V, DomTreeDFS::Node *Subtree) {
+      return V == VN.canonicalize(V, Subtree);
+    }
+#endif
+
+  public:
+
+    ValueRanges(ValueNumbering &VN, TargetData *TD) : VN(VN), TD(TD) {}
+
+#ifndef NDEBUG
+    virtual ~ValueRanges() {}
+
+    virtual void dump() const {
+      dump(*cerr.stream());
+    }
+
+    void dump(std::ostream &os) const {
+      for (unsigned i = 0, e = Ranges.size(); i != e; ++i) {
+        os << (i+1) << " = ";
+        Ranges[i].dump(os);
+        os << "\n";
+      }
+    }
+#endif
+
+    /// range - looks up the ConstantRange associated with a value number.
+    ConstantRange range(unsigned n, DomTreeDFS::Node *Subtree) {
+      assert(VN.value(n)); // performs range checks
+
+      if (n <= Ranges.size()) {
+        ScopedRange::iterator I = Ranges[n-1].find(Subtree);
+        if (I != Ranges[n-1].end()) return I->second;
+      }
+
+      Value *V = VN.value(n);
+      ConstantRange CR = range(V);
+      return CR;
+    }
+
+    /// range - determine a range from a Value without performing any lookups.
+    ConstantRange range(Value *V) const {
+      if (ConstantInt *C = dyn_cast<ConstantInt>(V))
+        return ConstantRange(C->getValue());
+      else if (isa<ConstantPointerNull>(V))
+        return ConstantRange(APInt::getNullValue(typeToWidth(V->getType())));
+      else
+        return ConstantRange(typeToWidth(V->getType()));
+    }
+
+    // typeToWidth - returns the number of bits necessary to store a value of
+    // this type, or zero if unknown.
+    uint32_t typeToWidth(const Type *Ty) const {
+      if (TD)
+        return TD->getTypeSizeInBits(Ty);
+      else
+        return Ty->getPrimitiveSizeInBits();
+    }
+
+    static bool isRelatedBy(const ConstantRange &CR1, const ConstantRange &CR2,
+                            LatticeVal LV) {
+      switch (LV) {
+      default: assert(!"Impossible lattice value!");
+      case NE:
+        return CR1.maximalIntersectWith(CR2).isEmptySet();
+      case ULT:
+        return CR1.getUnsignedMax().ult(CR2.getUnsignedMin());
+      case ULE:
+        return CR1.getUnsignedMax().ule(CR2.getUnsignedMin());
+      case UGT:
+        return CR1.getUnsignedMin().ugt(CR2.getUnsignedMax());
+      case UGE:
+        return CR1.getUnsignedMin().uge(CR2.getUnsignedMax());
+      case SLT:
+        return CR1.getSignedMax().slt(CR2.getSignedMin());
+      case SLE:
+        return CR1.getSignedMax().sle(CR2.getSignedMin());
+      case SGT:
+        return CR1.getSignedMin().sgt(CR2.getSignedMax());
+      case SGE:
+        return CR1.getSignedMin().sge(CR2.getSignedMax());
+      case LT:
+        return CR1.getUnsignedMax().ult(CR2.getUnsignedMin()) &&
+               CR1.getSignedMax().slt(CR2.getUnsignedMin());
+      case LE:
+        return CR1.getUnsignedMax().ule(CR2.getUnsignedMin()) &&
+               CR1.getSignedMax().sle(CR2.getUnsignedMin());
+      case GT:
+        return CR1.getUnsignedMin().ugt(CR2.getUnsignedMax()) &&
+               CR1.getSignedMin().sgt(CR2.getSignedMax());
+      case GE:
+        return CR1.getUnsignedMin().uge(CR2.getUnsignedMax()) &&
+               CR1.getSignedMin().sge(CR2.getSignedMax());
+      case SLTUGT:
+        return CR1.getSignedMax().slt(CR2.getSignedMin()) &&
+               CR1.getUnsignedMin().ugt(CR2.getUnsignedMax());
+      case SLEUGE:
+        return CR1.getSignedMax().sle(CR2.getSignedMin()) &&
+               CR1.getUnsignedMin().uge(CR2.getUnsignedMax());
+      case SGTULT:
+        return CR1.getSignedMin().sgt(CR2.getSignedMax()) &&
+               CR1.getUnsignedMax().ult(CR2.getUnsignedMin());
+      case SGEULE:
+        return CR1.getSignedMin().sge(CR2.getSignedMax()) &&
+               CR1.getUnsignedMax().ule(CR2.getUnsignedMin());
+      }
+    }
+
+    bool isRelatedBy(unsigned n1, unsigned n2, DomTreeDFS::Node *Subtree,
+                     LatticeVal LV) {
+      ConstantRange CR1 = range(n1, Subtree);
+      ConstantRange CR2 = range(n2, Subtree);
+
+      // True iff all values in CR1 are LV to all values in CR2.
+      return isRelatedBy(CR1, CR2, LV);
+    }
+
+    void addToWorklist(Value *V, Constant *C, ICmpInst::Predicate Pred,
+                       VRPSolver *VRP);
+    void markBlock(VRPSolver *VRP);
+
+    void mergeInto(Value **I, unsigned n, unsigned New,
+                   DomTreeDFS::Node *Subtree, VRPSolver *VRP) {
+      ConstantRange CR_New = range(New, Subtree);
+      ConstantRange Merged = CR_New;
+
+      for (; n != 0; ++I, --n) {
+        unsigned i = VN.valueNumber(*I, Subtree);
+        ConstantRange CR_Kill = i ? range(i, Subtree) : range(*I);
+        if (CR_Kill.isFullSet()) continue;
+        Merged = Merged.maximalIntersectWith(CR_Kill);
+      }
+
+      if (Merged.isFullSet() || Merged == CR_New) return;
+
+      applyRange(New, Merged, Subtree, VRP);
+    }
+
+    void applyRange(unsigned n, const ConstantRange &CR,
+                    DomTreeDFS::Node *Subtree, VRPSolver *VRP) {
+      ConstantRange Merged = CR.maximalIntersectWith(range(n, Subtree));
+      if (Merged.isEmptySet()) {
+        markBlock(VRP);
+        return;
+      }
+
+      if (const APInt *I = Merged.getSingleElement()) {
+        Value *V = VN.value(n); // XXX: redesign worklist.
+        const Type *Ty = V->getType();
+        if (Ty->isInteger()) {
+          addToWorklist(V, ConstantInt::get(*I), ICmpInst::ICMP_EQ, VRP);
+          return;
+        } else if (const PointerType *PTy = dyn_cast<PointerType>(Ty)) {
+          assert(*I == 0 && "Pointer is null but not zero?");
+          addToWorklist(V, ConstantPointerNull::get(PTy),
+                        ICmpInst::ICMP_EQ, VRP);
+          return;
+        }
+      }
+
+      update(n, Merged, Subtree);
+    }
+
+    void addNotEquals(unsigned n1, unsigned n2, DomTreeDFS::Node *Subtree,
+                      VRPSolver *VRP) {
+      ConstantRange CR1 = range(n1, Subtree);
+      ConstantRange CR2 = range(n2, Subtree);
+
+      uint32_t W = CR1.getBitWidth();
+
+      if (const APInt *I = CR1.getSingleElement()) {
+        if (CR2.isFullSet()) {
+          ConstantRange NewCR2(CR1.getUpper(), CR1.getLower());
+          applyRange(n2, NewCR2, Subtree, VRP);
+        } else if (*I == CR2.getLower()) {
+          APInt NewLower(CR2.getLower() + 1),
+                NewUpper(CR2.getUpper());
+          if (NewLower == NewUpper)
+            NewLower = NewUpper = APInt::getMinValue(W);
+
+          ConstantRange NewCR2(NewLower, NewUpper);
+          applyRange(n2, NewCR2, Subtree, VRP);
+        } else if (*I == CR2.getUpper() - 1) {
+          APInt NewLower(CR2.getLower()),
+                NewUpper(CR2.getUpper() - 1);
+          if (NewLower == NewUpper)
+            NewLower = NewUpper = APInt::getMinValue(W);
+
+          ConstantRange NewCR2(NewLower, NewUpper);
+          applyRange(n2, NewCR2, Subtree, VRP);
+        }
+      }
+
+      if (const APInt *I = CR2.getSingleElement()) {
+        if (CR1.isFullSet()) {
+          ConstantRange NewCR1(CR2.getUpper(), CR2.getLower());
+          applyRange(n1, NewCR1, Subtree, VRP);
+        } else if (*I == CR1.getLower()) {
+          APInt NewLower(CR1.getLower() + 1),
+                NewUpper(CR1.getUpper());
+          if (NewLower == NewUpper)
+            NewLower = NewUpper = APInt::getMinValue(W);
+
+          ConstantRange NewCR1(NewLower, NewUpper);
+          applyRange(n1, NewCR1, Subtree, VRP);
+        } else if (*I == CR1.getUpper() - 1) {
+          APInt NewLower(CR1.getLower()),
+                NewUpper(CR1.getUpper() - 1);
+          if (NewLower == NewUpper)
+            NewLower = NewUpper = APInt::getMinValue(W);
+
+          ConstantRange NewCR1(NewLower, NewUpper);
+          applyRange(n1, NewCR1, Subtree, VRP);
+        }
+      }
+    }
+
+    void addInequality(unsigned n1, unsigned n2, DomTreeDFS::Node *Subtree,
+                       LatticeVal LV, VRPSolver *VRP) {
+      assert(!isRelatedBy(n1, n2, Subtree, LV) && "Asked to do useless work.");
+
+      if (LV == NE) {
+        addNotEquals(n1, n2, Subtree, VRP);
+        return;
+      }
+
+      ConstantRange CR1 = range(n1, Subtree);
+      ConstantRange CR2 = range(n2, Subtree);
+
+      if (!CR1.isSingleElement()) {
+        ConstantRange NewCR1 = CR1.maximalIntersectWith(create(LV, CR2));
+        if (NewCR1 != CR1)
+          applyRange(n1, NewCR1, Subtree, VRP);
+      }
+
+      if (!CR2.isSingleElement()) {
+        ConstantRange NewCR2 = CR2.maximalIntersectWith(
+                                       create(reversePredicate(LV), CR1));
+        if (NewCR2 != CR2)
+          applyRange(n2, NewCR2, Subtree, VRP);
+      }
+    }
+  };
+
+  /// UnreachableBlocks keeps tracks of blocks that are for one reason or
+  /// another discovered to be unreachable. This is used to cull the graph when
+  /// analyzing instructions, and to mark blocks with the "unreachable"
+  /// terminator instruction after the function has executed.
+  class VISIBILITY_HIDDEN UnreachableBlocks {
+  private:
+    std::vector<BasicBlock *> DeadBlocks;
+
+  public:
+    /// mark - mark a block as dead
+    void mark(BasicBlock *BB) {
+      std::vector<BasicBlock *>::iterator E = DeadBlocks.end();
+      std::vector<BasicBlock *>::iterator I =
+        std::lower_bound(DeadBlocks.begin(), E, BB);
+
+      if (I == E || *I != BB) DeadBlocks.insert(I, BB);
+    }
+
+    /// isDead - returns whether a block is known to be dead already
+    bool isDead(BasicBlock *BB) {
+      std::vector<BasicBlock *>::iterator E = DeadBlocks.end();
+      std::vector<BasicBlock *>::iterator I =
+        std::lower_bound(DeadBlocks.begin(), E, BB);
+
+      return I != E && *I == BB;
+    }
+
+    /// kill - replace the dead blocks' terminator with an UnreachableInst.
+    bool kill() {
+      bool modified = false;
+      for (std::vector<BasicBlock *>::iterator I = DeadBlocks.begin(),
+           E = DeadBlocks.end(); I != E; ++I) {
+        BasicBlock *BB = *I;
+
+        DOUT << "unreachable block: " << BB->getName() << "\n";
+
+        for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB);
+             SI != SE; ++SI) {
+          BasicBlock *Succ = *SI;
+          Succ->removePredecessor(BB);
+        }
+
+        TerminatorInst *TI = BB->getTerminator();
+        TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
+        TI->eraseFromParent();
+        new UnreachableInst(BB);
+        ++NumBlocks;
+        modified = true;
+      }
+      DeadBlocks.clear();
+      return modified;
+    }
+  };
+
+  /// VRPSolver keeps track of how changes to one variable affect other
+  /// variables, and forwards changes along to the InequalityGraph. It
+  /// also maintains the correct choice for "canonical" in the IG.
+  /// @brief VRPSolver calculates inferences from a new relationship.
+  class VISIBILITY_HIDDEN VRPSolver {
+  private:
+    friend class ValueRanges;
+
+    struct Operation {
+      Value *LHS, *RHS;
+      ICmpInst::Predicate Op;
+
+      BasicBlock *ContextBB; // XXX use a DomTreeDFS::Node instead
+      Instruction *ContextInst;
+    };
+    std::deque<Operation> WorkList;
+
+    ValueNumbering &VN;
+    InequalityGraph &IG;
+    UnreachableBlocks &UB;
+    ValueRanges &VR;
+    DomTreeDFS *DTDFS;
+    DomTreeDFS::Node *Top;
+    BasicBlock *TopBB;
+    Instruction *TopInst;
+    bool &modified;
+
+    typedef InequalityGraph::Node Node;
+
+    // below - true if the Instruction is dominated by the current context
+    // block or instruction
+    bool below(Instruction *I) {
+      BasicBlock *BB = I->getParent();
+      if (TopInst && TopInst->getParent() == BB) {
+        if (isa<TerminatorInst>(TopInst)) return false;
+        if (isa<TerminatorInst>(I)) return true;
+        if ( isa<PHINode>(TopInst) && !isa<PHINode>(I)) return true;
+        if (!isa<PHINode>(TopInst) &&  isa<PHINode>(I)) return false;
+
+        for (BasicBlock::const_iterator Iter = BB->begin(), E = BB->end();
+             Iter != E; ++Iter) {
+          if (&*Iter == TopInst) return true;
+          else if (&*Iter == I) return false;
+        }
+        assert(!"Instructions not found in parent BasicBlock?");
+      } else {
+        DomTreeDFS::Node *Node = DTDFS->getNodeForBlock(BB);
+        if (!Node) return false;
+        return Top->dominates(Node);
+      }
+      return false; // Not reached
+    }
+
+    // aboveOrBelow - true if the Instruction either dominates or is dominated
+    // by the current context block or instruction
+    bool aboveOrBelow(Instruction *I) {
+      BasicBlock *BB = I->getParent();
+      DomTreeDFS::Node *Node = DTDFS->getNodeForBlock(BB);
+      if (!Node) return false;
+
+      return Top == Node || Top->dominates(Node) || Node->dominates(Top);
+    }
+
+    bool makeEqual(Value *V1, Value *V2) {
+      DOUT << "makeEqual(" << *V1 << ", " << *V2 << ")\n";
+      DOUT << "context is ";
+      if (TopInst) DOUT << "I: " << *TopInst << "\n";
+      else DOUT << "BB: " << TopBB->getName()
+                << "(" << Top->getDFSNumIn() << ")\n";
+
+      assert(V1->getType() == V2->getType() &&
+             "Can't make two values with different types equal.");
+
+      if (V1 == V2) return true;
+
+      if (isa<Constant>(V1) && isa<Constant>(V2))
+        return false;
+
+      unsigned n1 = VN.valueNumber(V1, Top), n2 = VN.valueNumber(V2, Top);
+
+      if (n1 && n2) {
+        if (n1 == n2) return true;
+        if (IG.isRelatedBy(n1, n2, Top, NE)) return false;
+      }
+
+      if (n1) assert(V1 == VN.value(n1) && "Value isn't canonical.");
+      if (n2) assert(V2 == VN.value(n2) && "Value isn't canonical.");
+
+      assert(!VN.compare(V2, V1) && "Please order parameters to makeEqual.");
+
+      assert(!isa<Constant>(V2) && "Tried to remove a constant.");
+
+      SetVector<unsigned> Remove;
+      if (n2) Remove.insert(n2);
+
+      if (n1 && n2) {
+        // Suppose we're being told that %x == %y, and %x <= %z and %y >= %z.
+        // We can't just merge %x and %y because the relationship with %z would
+        // be EQ and that's invalid. What we're doing is looking for any nodes
+        // %z such that %x <= %z and %y >= %z, and vice versa.
+
+        Node::iterator end = IG.node(n2)->end();
+
+        // Find the intersection between N1 and N2 which is dominated by
+        // Top. If we find %x where N1 <= %x <= N2 (or >=) then add %x to
+        // Remove.
+        for (Node::iterator I = IG.node(n1)->begin(), E = IG.node(n1)->end();
+             I != E; ++I) {
+          if (!(I->LV & EQ_BIT) || !Top->DominatedBy(I->Subtree)) continue;
+
+          unsigned ILV_s = I->LV & (SLT_BIT|SGT_BIT);
+          unsigned ILV_u = I->LV & (ULT_BIT|UGT_BIT);
+          Node::iterator NI = IG.node(n2)->find(I->To, Top);
+          if (NI != end) {
+            LatticeVal NILV = reversePredicate(NI->LV);
+            unsigned NILV_s = NILV & (SLT_BIT|SGT_BIT);
+            unsigned NILV_u = NILV & (ULT_BIT|UGT_BIT);
+
+            if ((ILV_s != (SLT_BIT|SGT_BIT) && ILV_s == NILV_s) ||
+                (ILV_u != (ULT_BIT|UGT_BIT) && ILV_u == NILV_u))
+              Remove.insert(I->To);
+          }
+        }
+
+        // See if one of the nodes about to be removed is actually a better
+        // canonical choice than n1.
+        unsigned orig_n1 = n1;
+        SetVector<unsigned>::iterator DontRemove = Remove.end();
+        for (SetVector<unsigned>::iterator I = Remove.begin()+1 /* skip n2 */,
+             E = Remove.end(); I != E; ++I) {
+          unsigned n = *I;
+          Value *V = VN.value(n);
+          if (VN.compare(V, V1)) {
+            V1 = V;
+            n1 = n;
+            DontRemove = I;
+          }
+        }
+        if (DontRemove != Remove.end()) {
+          unsigned n = *DontRemove;
+          Remove.remove(n);
+          Remove.insert(orig_n1);
+        }
+      }
+
+      // We'd like to allow makeEqual on two values to perform a simple
+      // substitution without creating nodes in the IG whenever possible.
+      //
+      // The first iteration through this loop operates on V2 before going
+      // through the Remove list and operating on those too. If all of the
+      // iterations performed simple replacements then we exit early.
+      bool mergeIGNode = false;
+      unsigned i = 0;
+      for (Value *R = V2; i == 0 || i < Remove.size(); ++i) {
+        if (i) R = VN.value(Remove[i]); // skip n2.
+
+        // Try to replace the whole instruction. If we can, we're done.
+        Instruction *I2 = dyn_cast<Instruction>(R);
+        if (I2 && below(I2)) {
+          std::vector<Instruction *> ToNotify;
+          for (Value::use_iterator UI = R->use_begin(), UE = R->use_end();
+               UI != UE;) {
+            Use &TheUse = UI.getUse();
+            ++UI;
+            if (Instruction *I = dyn_cast<Instruction>(TheUse.getUser()))
+              ToNotify.push_back(I);
+          }
+
+          DOUT << "Simply removing " << *I2
+               << ", replacing with " << *V1 << "\n";
+          I2->replaceAllUsesWith(V1);
+          // leave it dead; it'll get erased later.
+          ++NumInstruction;
+          modified = true;
+
+          for (std::vector<Instruction *>::iterator II = ToNotify.begin(),
+               IE = ToNotify.end(); II != IE; ++II) {
+            opsToDef(*II);
+          }
+
+          continue;
+        }
+
+        // Otherwise, replace all dominated uses.
+        for (Value::use_iterator UI = R->use_begin(), UE = R->use_end();
+             UI != UE;) {
+          Use &TheUse = UI.getUse();
+          ++UI;
+          if (Instruction *I = dyn_cast<Instruction>(TheUse.getUser())) {
+            if (below(I)) {
+              TheUse.set(V1);
+              modified = true;
+              ++NumVarsReplaced;
+              opsToDef(I);
+            }
+          }
+        }
+
+        // If that killed the instruction, stop here.
+        if (I2 && isInstructionTriviallyDead(I2)) {
+          DOUT << "Killed all uses of " << *I2
+               << ", replacing with " << *V1 << "\n";
+          continue;
+        }
+
+        // If we make it to here, then we will need to create a node for N1.
+        // Otherwise, we can skip out early!
+        mergeIGNode = true;
+      }
+
+      if (!isa<Constant>(V1)) {
+        if (Remove.empty()) {
+          VR.mergeInto(&V2, 1, VN.getOrInsertVN(V1, Top), Top, this);
+        } else {
+          std::vector<Value*> RemoveVals;
+          RemoveVals.reserve(Remove.size());
+
+          for (SetVector<unsigned>::iterator I = Remove.begin(),
+               E = Remove.end(); I != E; ++I) {
+            Value *V = VN.value(*I);
+            if (!V->use_empty())
+              RemoveVals.push_back(V);
+          }
+          VR.mergeInto(&RemoveVals[0], RemoveVals.size(), 
+                       VN.getOrInsertVN(V1, Top), Top, this);
+        }
+      }
+
+      if (mergeIGNode) {
+        // Create N1.
+        if (!n1) n1 = VN.getOrInsertVN(V1, Top);
+        IG.node(n1); // Ensure that IG.Nodes won't get resized
+
+        // Migrate relationships from removed nodes to N1.
+        for (SetVector<unsigned>::iterator I = Remove.begin(), E = Remove.end();
+             I != E; ++I) {
+          unsigned n = *I;
+          for (Node::iterator NI = IG.node(n)->begin(), NE = IG.node(n)->end();
+               NI != NE; ++NI) {
+            if (NI->Subtree->DominatedBy(Top)) {
+              if (NI->To == n1) {
+                assert((NI->LV & EQ_BIT) && "Node inequal to itself.");
+                continue;
+              }
+              if (Remove.count(NI->To))
+                continue;
+
+              IG.node(NI->To)->update(n1, reversePredicate(NI->LV), Top);
+              IG.node(n1)->update(NI->To, NI->LV, Top);
+            }
+          }
+        }
+
+        // Point V2 (and all items in Remove) to N1.
+        if (!n2)
+          VN.addEquality(n1, V2, Top);
+        else {
+          for (SetVector<unsigned>::iterator I = Remove.begin(),
+               E = Remove.end(); I != E; ++I) {
+            VN.addEquality(n1, VN.value(*I), Top);
+          }
+        }
+
+        // If !Remove.empty() then V2 = Remove[0]->getValue().
+        // Even when Remove is empty, we still want to process V2.
+        i = 0;
+        for (Value *R = V2; i == 0 || i < Remove.size(); ++i) {
+          if (i) R = VN.value(Remove[i]); // skip n2.
+
+          if (Instruction *I2 = dyn_cast<Instruction>(R)) {
+            if (aboveOrBelow(I2))
+            defToOps(I2);
+          }
+          for (Value::use_iterator UI = V2->use_begin(), UE = V2->use_end();
+               UI != UE;) {
+            Use &TheUse = UI.getUse();
+            ++UI;
+            if (Instruction *I = dyn_cast<Instruction>(TheUse.getUser())) {
+              if (aboveOrBelow(I))
+                opsToDef(I);
+            }
+          }
+        }
+      }
+
+      // re-opsToDef all dominated users of V1.
+      if (Instruction *I = dyn_cast<Instruction>(V1)) {
+        for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
+             UI != UE;) {
+          Use &TheUse = UI.getUse();
+          ++UI;
+          Value *V = TheUse.getUser();
+          if (!V->use_empty()) {
+            if (Instruction *Inst = dyn_cast<Instruction>(V)) {
+              if (aboveOrBelow(Inst))
+                opsToDef(Inst);
+            }
+          }
+        }
+      }
+
+      return true;
+    }
+
+    /// cmpInstToLattice - converts an CmpInst::Predicate to lattice value
+    /// Requires that the lattice value be valid; does not accept ICMP_EQ.
+    static LatticeVal cmpInstToLattice(ICmpInst::Predicate Pred) {
+      switch (Pred) {
+        case ICmpInst::ICMP_EQ:
+          assert(!"No matching lattice value.");
+          return static_cast<LatticeVal>(EQ_BIT);
+        default:
+          assert(!"Invalid 'icmp' predicate.");
+        case ICmpInst::ICMP_NE:
+          return NE;
+        case ICmpInst::ICMP_UGT:
+          return UGT;
+        case ICmpInst::ICMP_UGE:
+          return UGE;
+        case ICmpInst::ICMP_ULT:
+          return ULT;
+        case ICmpInst::ICMP_ULE:
+          return ULE;
+        case ICmpInst::ICMP_SGT:
+          return SGT;
+        case ICmpInst::ICMP_SGE:
+          return SGE;
+        case ICmpInst::ICMP_SLT:
+          return SLT;
+        case ICmpInst::ICMP_SLE:
+          return SLE;
+      }
+    }
+
+  public:
+    VRPSolver(ValueNumbering &VN, InequalityGraph &IG, UnreachableBlocks &UB,
+              ValueRanges &VR, DomTreeDFS *DTDFS, bool &modified,
+              BasicBlock *TopBB)
+      : VN(VN),
+        IG(IG),
+        UB(UB),
+        VR(VR),
+        DTDFS(DTDFS),
+        Top(DTDFS->getNodeForBlock(TopBB)),
+        TopBB(TopBB),
+        TopInst(NULL),
+        modified(modified)
+    {
+      assert(Top && "VRPSolver created for unreachable basic block.");
+    }
+
+    VRPSolver(ValueNumbering &VN, InequalityGraph &IG, UnreachableBlocks &UB,
+              ValueRanges &VR, DomTreeDFS *DTDFS, bool &modified,
+              Instruction *TopInst)
+      : VN(VN),
+        IG(IG),
+        UB(UB),
+        VR(VR),
+        DTDFS(DTDFS),
+        Top(DTDFS->getNodeForBlock(TopInst->getParent())),
+        TopBB(TopInst->getParent()),
+        TopInst(TopInst),
+        modified(modified)
+    {
+      assert(Top && "VRPSolver created for unreachable basic block.");
+      assert(Top->getBlock() == TopInst->getParent() && "Context mismatch.");
+    }
+
+    bool isRelatedBy(Value *V1, Value *V2, ICmpInst::Predicate Pred) const {
+      if (Constant *C1 = dyn_cast<Constant>(V1))
+        if (Constant *C2 = dyn_cast<Constant>(V2))
+          return ConstantExpr::getCompare(Pred, C1, C2) ==
+                 ConstantInt::getTrue();
+
+      unsigned n1 = VN.valueNumber(V1, Top);
+      unsigned n2 = VN.valueNumber(V2, Top);
+
+      if (n1 && n2) {
+        if (n1 == n2) return Pred == ICmpInst::ICMP_EQ ||
+                             Pred == ICmpInst::ICMP_ULE ||
+                             Pred == ICmpInst::ICMP_UGE ||
+                             Pred == ICmpInst::ICMP_SLE ||
+                             Pred == ICmpInst::ICMP_SGE;
+        if (Pred == ICmpInst::ICMP_EQ) return false;
+        if (IG.isRelatedBy(n1, n2, Top, cmpInstToLattice(Pred))) return true;
+        if (VR.isRelatedBy(n1, n2, Top, cmpInstToLattice(Pred))) return true;
+      }
+
+      if ((n1 && !n2 && isa<Constant>(V2)) ||
+          (n2 && !n1 && isa<Constant>(V1))) {
+        ConstantRange CR1 = n1 ? VR.range(n1, Top) : VR.range(V1);
+        ConstantRange CR2 = n2 ? VR.range(n2, Top) : VR.range(V2);
+
+        if (Pred == ICmpInst::ICMP_EQ)
+          return CR1.isSingleElement() &&
+                 CR1.getSingleElement() == CR2.getSingleElement();
+
+        return VR.isRelatedBy(CR1, CR2, cmpInstToLattice(Pred));
+      }
+      if (Pred == ICmpInst::ICMP_EQ) return V1 == V2;
+      return false;
+    }
+
+    /// add - adds a new property to the work queue
+    void add(Value *V1, Value *V2, ICmpInst::Predicate Pred,
+             Instruction *I = NULL) {
+      DOUT << "adding " << *V1 << " " << Pred << " " << *V2;
+      if (I) DOUT << " context: " << *I;
+      else DOUT << " default context (" << Top->getDFSNumIn() << ")";
+      DOUT << "\n";
+
+      assert(V1->getType() == V2->getType() &&
+             "Can't relate two values with different types.");
+
+      WorkList.push_back(Operation());
+      Operation &O = WorkList.back();
+      O.LHS = V1, O.RHS = V2, O.Op = Pred, O.ContextInst = I;
+      O.ContextBB = I ? I->getParent() : TopBB;
+    }
+
+    /// defToOps - Given an instruction definition that we've learned something
+    /// new about, find any new relationships between its operands.
+    void defToOps(Instruction *I) {
+      Instruction *NewContext = below(I) ? I : TopInst;
+      Value *Canonical = VN.canonicalize(I, Top);
+
+      if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
+        const Type *Ty = BO->getType();
+        assert(!Ty->isFPOrFPVector() && "Float in work queue!");
+
+        Value *Op0 = VN.canonicalize(BO->getOperand(0), Top);
+        Value *Op1 = VN.canonicalize(BO->getOperand(1), Top);
+
+        // TODO: "and i32 -1, %x" EQ %y then %x EQ %y.
+
+        switch (BO->getOpcode()) {
+          case Instruction::And: {
+            // "and i32 %a, %b" EQ -1 then %a EQ -1 and %b EQ -1
+            ConstantInt *CI = ConstantInt::getAllOnesValue(Ty);
+            if (Canonical == CI) {
+              add(CI, Op0, ICmpInst::ICMP_EQ, NewContext);
+              add(CI, Op1, ICmpInst::ICMP_EQ, NewContext);
+            }
+          } break;
+          case Instruction::Or: {
+            // "or i32 %a, %b" EQ 0 then %a EQ 0 and %b EQ 0
+            Constant *Zero = Constant::getNullValue(Ty);
+            if (Canonical == Zero) {
+              add(Zero, Op0, ICmpInst::ICMP_EQ, NewContext);
+              add(Zero, Op1, ICmpInst::ICMP_EQ, NewContext);
+            }
+          } break;
+          case Instruction::Xor: {
+            // "xor i32 %c, %a" EQ %b then %a EQ %c ^ %b
+            // "xor i32 %c, %a" EQ %c then %a EQ 0
+            // "xor i32 %c, %a" NE %c then %a NE 0
+            // Repeat the above, with order of operands reversed.
+            Value *LHS = Op0;
+            Value *RHS = Op1;
+            if (!isa<Constant>(LHS)) std::swap(LHS, RHS);
+
+            if (ConstantInt *CI = dyn_cast<ConstantInt>(Canonical)) {
+              if (ConstantInt *Arg = dyn_cast<ConstantInt>(LHS)) {
+                add(RHS, ConstantInt::get(CI->getValue() ^ Arg->getValue()),
+                    ICmpInst::ICMP_EQ, NewContext);
+              }
+            }
+            if (Canonical == LHS) {
+              if (isa<ConstantInt>(Canonical))
+                add(RHS, Constant::getNullValue(Ty), ICmpInst::ICMP_EQ,
+                    NewContext);
+            } else if (isRelatedBy(LHS, Canonical, ICmpInst::ICMP_NE)) {
+              add(RHS, Constant::getNullValue(Ty), ICmpInst::ICMP_NE,
+                  NewContext);
+            }
+          } break;
+          default:
+            break;
+        }
+      } else if (ICmpInst *IC = dyn_cast<ICmpInst>(I)) {
+        // "icmp ult i32 %a, %y" EQ true then %a u< y
+        // etc.
+
+        if (Canonical == ConstantInt::getTrue()) {
+          add(IC->getOperand(0), IC->getOperand(1), IC->getPredicate(),
+              NewContext);
+        } else if (Canonical == ConstantInt::getFalse()) {
+          add(IC->getOperand(0), IC->getOperand(1),
+              ICmpInst::getInversePredicate(IC->getPredicate()), NewContext);
+        }
+      } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
+        if (I->getType()->isFPOrFPVector()) return;
+
+        // Given: "%a = select i1 %x, i32 %b, i32 %c"
+        // %a EQ %b and %b NE %c then %x EQ true
+        // %a EQ %c and %b NE %c then %x EQ false
+
+        Value *True  = SI->getTrueValue();
+        Value *False = SI->getFalseValue();
+        if (isRelatedBy(True, False, ICmpInst::ICMP_NE)) {
+          if (Canonical == VN.canonicalize(True, Top) ||
+              isRelatedBy(Canonical, False, ICmpInst::ICMP_NE))
+            add(SI->getCondition(), ConstantInt::getTrue(),
+                ICmpInst::ICMP_EQ, NewContext);
+          else if (Canonical == VN.canonicalize(False, Top) ||
+                   isRelatedBy(Canonical, True, ICmpInst::ICMP_NE))
+            add(SI->getCondition(), ConstantInt::getFalse(),
+                ICmpInst::ICMP_EQ, NewContext);
+        }
+      } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
+        for (GetElementPtrInst::op_iterator OI = GEPI->idx_begin(),
+             OE = GEPI->idx_end(); OI != OE; ++OI) {
+          ConstantInt *Op = dyn_cast<ConstantInt>(VN.canonicalize(*OI, Top));
+          if (!Op || !Op->isZero()) return;
+        }
+        // TODO: The GEPI indices are all zero. Copy from definition to operand,
+        // jumping the type plane as needed.
+        if (isRelatedBy(GEPI, Constant::getNullValue(GEPI->getType()),
+                        ICmpInst::ICMP_NE)) {
+          Value *Ptr = GEPI->getPointerOperand();
+          add(Ptr, Constant::getNullValue(Ptr->getType()), ICmpInst::ICMP_NE,
+              NewContext);
+        }
+      } else if (CastInst *CI = dyn_cast<CastInst>(I)) {
+        const Type *SrcTy = CI->getSrcTy();
+
+        unsigned ci = VN.getOrInsertVN(CI, Top);
+        uint32_t W = VR.typeToWidth(SrcTy);
+        if (!W) return;
+        ConstantRange CR = VR.range(ci, Top);
+
+        if (CR.isFullSet()) return;
+
+        switch (CI->getOpcode()) {
+          default: break;
+          case Instruction::ZExt:
+          case Instruction::SExt:
+            VR.applyRange(VN.getOrInsertVN(CI->getOperand(0), Top),
+                          CR.truncate(W), Top, this);
+            break;
+          case Instruction::BitCast:
+            VR.applyRange(VN.getOrInsertVN(CI->getOperand(0), Top),
+                          CR, Top, this);
+            break;
+        }
+      }
+    }
+
+    /// opsToDef - A new relationship was discovered involving one of this
+    /// instruction's operands. Find any new relationship involving the
+    /// definition, or another operand.
+    void opsToDef(Instruction *I) {
+      Instruction *NewContext = below(I) ? I : TopInst;
+
+      if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
+        Value *Op0 = VN.canonicalize(BO->getOperand(0), Top);
+        Value *Op1 = VN.canonicalize(BO->getOperand(1), Top);
+
+        if (ConstantInt *CI0 = dyn_cast<ConstantInt>(Op0))
+          if (ConstantInt *CI1 = dyn_cast<ConstantInt>(Op1)) {
+            add(BO, ConstantExpr::get(BO->getOpcode(), CI0, CI1),
+                ICmpInst::ICMP_EQ, NewContext);
+            return;
+          }
+
+        // "%y = and i1 true, %x" then %x EQ %y
+        // "%y = or i1 false, %x" then %x EQ %y
+        // "%x = add i32 %y, 0" then %x EQ %y
+        // "%x = mul i32 %y, 0" then %x EQ 0
+
+        Instruction::BinaryOps Opcode = BO->getOpcode();
+        const Type *Ty = BO->getType();
+        assert(!Ty->isFPOrFPVector() && "Float in work queue!");
+
+        Constant *Zero = Constant::getNullValue(Ty);
+        Constant *One = ConstantInt::get(Ty, 1);
+        ConstantInt *AllOnes = ConstantInt::getAllOnesValue(Ty);
+
+        switch (Opcode) {
+          default: break;
+          case Instruction::LShr:
+          case Instruction::AShr:
+          case Instruction::Shl:
+            if (Op1 == Zero) {
+              add(BO, Op0, ICmpInst::ICMP_EQ, NewContext);
+              return;
+            }
+            break;
+          case Instruction::Sub:
+            if (Op1 == Zero) {
+              add(BO, Op0, ICmpInst::ICMP_EQ, NewContext);
+              return;
+            }
+            if (ConstantInt *CI0 = dyn_cast<ConstantInt>(Op0)) {
+              unsigned n_ci0 = VN.getOrInsertVN(Op1, Top);
+              ConstantRange CR = VR.range(n_ci0, Top);
+              if (!CR.isFullSet()) {
+                CR.subtract(CI0->getValue());
+                unsigned n_bo = VN.getOrInsertVN(BO, Top);
+                VR.applyRange(n_bo, CR, Top, this);
+                return;
+              }
+            }
+            if (ConstantInt *CI1 = dyn_cast<ConstantInt>(Op1)) {
+              unsigned n_ci1 = VN.getOrInsertVN(Op0, Top);
+              ConstantRange CR = VR.range(n_ci1, Top);
+              if (!CR.isFullSet()) {
+                CR.subtract(CI1->getValue());
+                unsigned n_bo = VN.getOrInsertVN(BO, Top);
+                VR.applyRange(n_bo, CR, Top, this);
+                return;
+              }
+            }
+            break;
+          case Instruction::Or:
+            if (Op0 == AllOnes || Op1 == AllOnes) {
+              add(BO, AllOnes, ICmpInst::ICMP_EQ, NewContext);
+              return;
+            }
+            if (Op0 == Zero) {
+              add(BO, Op1, ICmpInst::ICMP_EQ, NewContext);
+              return;
+            } else if (Op1 == Zero) {
+              add(BO, Op0, ICmpInst::ICMP_EQ, NewContext);
+              return;
+            }
+            break;
+          case Instruction::Add:
+            if (ConstantInt *CI0 = dyn_cast<ConstantInt>(Op0)) {
+              unsigned n_ci0 = VN.getOrInsertVN(Op1, Top);
+              ConstantRange CR = VR.range(n_ci0, Top);
+              if (!CR.isFullSet()) {
+                CR.subtract(-CI0->getValue());
+                unsigned n_bo = VN.getOrInsertVN(BO, Top);
+                VR.applyRange(n_bo, CR, Top, this);
+                return;
+              }
+            }
+            if (ConstantInt *CI1 = dyn_cast<ConstantInt>(Op1)) {
+              unsigned n_ci1 = VN.getOrInsertVN(Op0, Top);
+              ConstantRange CR = VR.range(n_ci1, Top);
+              if (!CR.isFullSet()) {
+                CR.subtract(-CI1->getValue());
+                unsigned n_bo = VN.getOrInsertVN(BO, Top);
+                VR.applyRange(n_bo, CR, Top, this);
+                return;
+              }
+            }
+            // fall-through
+          case Instruction::Xor:
+            if (Op0 == Zero) {
+              add(BO, Op1, ICmpInst::ICMP_EQ, NewContext);
+              return;
+            } else if (Op1 == Zero) {
+              add(BO, Op0, ICmpInst::ICMP_EQ, NewContext);
+              return;
+            }
+            break;
+          case Instruction::And:
+            if (Op0 == AllOnes) {
+              add(BO, Op1, ICmpInst::ICMP_EQ, NewContext);
+              return;
+            } else if (Op1 == AllOnes) {
+              add(BO, Op0, ICmpInst::ICMP_EQ, NewContext);
+              return;
+            }
+            if (Op0 == Zero || Op1 == Zero) {
+              add(BO, Zero, ICmpInst::ICMP_EQ, NewContext);
+              return;
+            }
+            break;
+          case Instruction::Mul:
+            if (Op0 == Zero || Op1 == Zero) {
+              add(BO, Zero, ICmpInst::ICMP_EQ, NewContext);
+              return;
+            }
+            if (Op0 == One) {
+              add(BO, Op1, ICmpInst::ICMP_EQ, NewContext);
+              return;
+            } else if (Op1 == One) {
+              add(BO, Op0, ICmpInst::ICMP_EQ, NewContext);
+              return;
+            }
+            break;
+        }
+
+        // "%x = add i32 %y, %z" and %x EQ %y then %z EQ 0
+        // "%x = add i32 %y, %z" and %x EQ %z then %y EQ 0
+        // "%x = shl i32 %y, %z" and %x EQ %y and %y NE 0 then %z EQ 0
+        // "%x = udiv i32 %y, %z" and %x EQ %y and %y NE 0 then %z EQ 1
+
+        Value *Known = Op0, *Unknown = Op1,
+              *TheBO = VN.canonicalize(BO, Top);
+        if (Known != TheBO) std::swap(Known, Unknown);
+        if (Known == TheBO) {
+          switch (Opcode) {
+            default: break;
+            case Instruction::LShr:
+            case Instruction::AShr:
+            case Instruction::Shl:
+              if (!isRelatedBy(Known, Zero, ICmpInst::ICMP_NE)) break;
+              // otherwise, fall-through.
+            case Instruction::Sub:
+              if (Unknown == Op0) break;
+              // otherwise, fall-through.
+            case Instruction::Xor:
+            case Instruction::Add:
+              add(Unknown, Zero, ICmpInst::ICMP_EQ, NewContext);
+              break;
+            case Instruction::UDiv:
+            case Instruction::SDiv:
+              if (Unknown == Op1) break;
+              if (isRelatedBy(Known, Zero, ICmpInst::ICMP_NE))
+                add(Unknown, One, ICmpInst::ICMP_EQ, NewContext);
+              break;
+          }
+        }
+
+        // TODO: "%a = add i32 %b, 1" and %b > %z then %a >= %z.
+
+      } else if (ICmpInst *IC = dyn_cast<ICmpInst>(I)) {
+        // "%a = icmp ult i32 %b, %c" and %b u<  %c then %a EQ true
+        // "%a = icmp ult i32 %b, %c" and %b u>= %c then %a EQ false
+        // etc.
+
+        Value *Op0 = VN.canonicalize(IC->getOperand(0), Top);
+        Value *Op1 = VN.canonicalize(IC->getOperand(1), Top);
+
+        ICmpInst::Predicate Pred = IC->getPredicate();
+        if (isRelatedBy(Op0, Op1, Pred))
+          add(IC, ConstantInt::getTrue(), ICmpInst::ICMP_EQ, NewContext);
+        else if (isRelatedBy(Op0, Op1, ICmpInst::getInversePredicate(Pred)))
+          add(IC, ConstantInt::getFalse(), ICmpInst::ICMP_EQ, NewContext);
+
+      } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
+        if (I->getType()->isFPOrFPVector()) return;
+
+        // Given: "%a = select i1 %x, i32 %b, i32 %c"
+        // %x EQ true  then %a EQ %b
+        // %x EQ false then %a EQ %c
+        // %b EQ %c then %a EQ %b
+
+        Value *Canonical = VN.canonicalize(SI->getCondition(), Top);
+        if (Canonical == ConstantInt::getTrue()) {
+          add(SI, SI->getTrueValue(), ICmpInst::ICMP_EQ, NewContext);
+        } else if (Canonical == ConstantInt::getFalse()) {
+          add(SI, SI->getFalseValue(), ICmpInst::ICMP_EQ, NewContext);
+        } else if (VN.canonicalize(SI->getTrueValue(), Top) ==
+                   VN.canonicalize(SI->getFalseValue(), Top)) {
+          add(SI, SI->getTrueValue(), ICmpInst::ICMP_EQ, NewContext);
+        }
+      } else if (CastInst *CI = dyn_cast<CastInst>(I)) {
+        const Type *DestTy = CI->getDestTy();
+        if (DestTy->isFPOrFPVector()) return;
+
+        Value *Op = VN.canonicalize(CI->getOperand(0), Top);
+        Instruction::CastOps Opcode = CI->getOpcode();
+
+        if (Constant *C = dyn_cast<Constant>(Op)) {
+          add(CI, ConstantExpr::getCast(Opcode, C, DestTy),
+              ICmpInst::ICMP_EQ, NewContext);
+        }
+
+        uint32_t W = VR.typeToWidth(DestTy);
+        unsigned ci = VN.getOrInsertVN(CI, Top);
+        ConstantRange CR = VR.range(VN.getOrInsertVN(Op, Top), Top);
+
+        if (!CR.isFullSet()) {
+          switch (Opcode) {
+            default: break;
+            case Instruction::ZExt:
+              VR.applyRange(ci, CR.zeroExtend(W), Top, this);
+              break;
+            case Instruction::SExt:
+              VR.applyRange(ci, CR.signExtend(W), Top, this);
+              break;
+            case Instruction::Trunc: {
+              ConstantRange Result = CR.truncate(W);
+              if (!Result.isFullSet())
+                VR.applyRange(ci, Result, Top, this);
+            } break;
+            case Instruction::BitCast:
+              VR.applyRange(ci, CR, Top, this);
+              break;
+            // TODO: other casts?
+          }
+        }
+      } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
+        for (GetElementPtrInst::op_iterator OI = GEPI->idx_begin(),
+             OE = GEPI->idx_end(); OI != OE; ++OI) {
+          ConstantInt *Op = dyn_cast<ConstantInt>(VN.canonicalize(*OI, Top));
+          if (!Op || !Op->isZero()) return;
+        }
+        // TODO: The GEPI indices are all zero. Copy from operand to definition,
+        // jumping the type plane as needed.
+        Value *Ptr = GEPI->getPointerOperand();
+        if (isRelatedBy(Ptr, Constant::getNullValue(Ptr->getType()),
+                        ICmpInst::ICMP_NE)) {
+          add(GEPI, Constant::getNullValue(GEPI->getType()), ICmpInst::ICMP_NE,
+              NewContext);
+        }
+      }
+    }
+
+    /// solve - process the work queue
+    void solve() {
+      //DOUT << "WorkList entry, size: " << WorkList.size() << "\n";
+      while (!WorkList.empty()) {
+        //DOUT << "WorkList size: " << WorkList.size() << "\n";
+
+        Operation &O = WorkList.front();
+        TopInst = O.ContextInst;
+        TopBB = O.ContextBB;
+        Top = DTDFS->getNodeForBlock(TopBB); // XXX move this into Context
+
+        O.LHS = VN.canonicalize(O.LHS, Top);
+        O.RHS = VN.canonicalize(O.RHS, Top);
+
+        assert(O.LHS == VN.canonicalize(O.LHS, Top) && "Canonicalize isn't.");
+        assert(O.RHS == VN.canonicalize(O.RHS, Top) && "Canonicalize isn't.");
+
+        DOUT << "solving " << *O.LHS << " " << O.Op << " " << *O.RHS;
+        if (O.ContextInst) DOUT << " context inst: " << *O.ContextInst;
+        else DOUT << " context block: " << O.ContextBB->getName();
+        DOUT << "\n";
+
+        DEBUG(VN.dump());
+        DEBUG(IG.dump());
+        DEBUG(VR.dump());
+
+        // If they're both Constant, skip it. Check for contradiction and mark
+        // the BB as unreachable if so.
+        if (Constant *CI_L = dyn_cast<Constant>(O.LHS)) {
+          if (Constant *CI_R = dyn_cast<Constant>(O.RHS)) {
+            if (ConstantExpr::getCompare(O.Op, CI_L, CI_R) ==
+                ConstantInt::getFalse())
+              UB.mark(TopBB);
+
+            WorkList.pop_front();
+            continue;
+          }
+        }
+
+        if (VN.compare(O.LHS, O.RHS)) {
+          std::swap(O.LHS, O.RHS);
+          O.Op = ICmpInst::getSwappedPredicate(O.Op);
+        }
+
+        if (O.Op == ICmpInst::ICMP_EQ) {
+          if (!makeEqual(O.RHS, O.LHS))
+            UB.mark(TopBB);
+        } else {
+          LatticeVal LV = cmpInstToLattice(O.Op);
+
+          if ((LV & EQ_BIT) &&
+              isRelatedBy(O.LHS, O.RHS, ICmpInst::getSwappedPredicate(O.Op))) {
+            if (!makeEqual(O.RHS, O.LHS))
+              UB.mark(TopBB);
+          } else {
+            if (isRelatedBy(O.LHS, O.RHS, ICmpInst::getInversePredicate(O.Op))){
+              UB.mark(TopBB);
+              WorkList.pop_front();
+              continue;
+            }
+
+            unsigned n1 = VN.getOrInsertVN(O.LHS, Top);
+            unsigned n2 = VN.getOrInsertVN(O.RHS, Top);
+
+            if (n1 == n2) {
+              if (O.Op != ICmpInst::ICMP_UGE && O.Op != ICmpInst::ICMP_ULE &&
+                  O.Op != ICmpInst::ICMP_SGE && O.Op != ICmpInst::ICMP_SLE)
+                UB.mark(TopBB);
+
+              WorkList.pop_front();
+              continue;
+            }
+
+            if (VR.isRelatedBy(n1, n2, Top, LV) ||
+                IG.isRelatedBy(n1, n2, Top, LV)) {
+              WorkList.pop_front();
+              continue;
+            }
+
+            VR.addInequality(n1, n2, Top, LV, this);
+            if ((!isa<ConstantInt>(O.RHS) && !isa<ConstantInt>(O.LHS)) ||
+                LV == NE)
+              IG.addInequality(n1, n2, Top, LV);
+
+            if (Instruction *I1 = dyn_cast<Instruction>(O.LHS)) {
+              if (aboveOrBelow(I1))
+                defToOps(I1);
+            }
+            if (isa<Instruction>(O.LHS) || isa<Argument>(O.LHS)) {
+              for (Value::use_iterator UI = O.LHS->use_begin(),
+                   UE = O.LHS->use_end(); UI != UE;) {
+                Use &TheUse = UI.getUse();
+                ++UI;
+                if (Instruction *I = dyn_cast<Instruction>(TheUse.getUser())) {
+                  if (aboveOrBelow(I))
+                    opsToDef(I);
+                }
+              }
+            }
+            if (Instruction *I2 = dyn_cast<Instruction>(O.RHS)) {
+              if (aboveOrBelow(I2))
+              defToOps(I2);
+            }
+            if (isa<Instruction>(O.RHS) || isa<Argument>(O.RHS)) {
+              for (Value::use_iterator UI = O.RHS->use_begin(),
+                   UE = O.RHS->use_end(); UI != UE;) {
+                Use &TheUse = UI.getUse();
+                ++UI;
+                if (Instruction *I = dyn_cast<Instruction>(TheUse.getUser())) {
+                  if (aboveOrBelow(I))
+                    opsToDef(I);
+                }
+              }
+            }
+          }
+        }
+        WorkList.pop_front();
+      }
+    }
+  };
+
+  void ValueRanges::addToWorklist(Value *V, Constant *C,
+                                  ICmpInst::Predicate Pred, VRPSolver *VRP) {
+    VRP->add(V, C, Pred, VRP->TopInst);
+  }
+
+  void ValueRanges::markBlock(VRPSolver *VRP) {
+    VRP->UB.mark(VRP->TopBB);
+  }
+
+  /// PredicateSimplifier - This class is a simplifier that replaces
+  /// one equivalent variable with another. It also tracks what
+  /// can't be equal and will solve setcc instructions when possible.
+  /// @brief Root of the predicate simplifier optimization.
+  class VISIBILITY_HIDDEN PredicateSimplifier : public FunctionPass {
+    DomTreeDFS *DTDFS;
+    bool modified;
+    ValueNumbering *VN;
+    InequalityGraph *IG;
+    UnreachableBlocks UB;
+    ValueRanges *VR;
+
+    std::vector<DomTreeDFS::Node *> WorkList;
+
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    PredicateSimplifier() : FunctionPass(&ID) {}
+
+    bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequiredID(BreakCriticalEdgesID);
+      AU.addRequired<DominatorTree>();
+      AU.addRequired<TargetData>();
+      AU.addPreserved<TargetData>();
+    }
+
+  private:
+    /// Forwards - Adds new properties to VRPSolver and uses them to
+    /// simplify instructions. Because new properties sometimes apply to
+    /// a transition from one BasicBlock to another, this will use the
+    /// PredicateSimplifier::proceedToSuccessor(s) interface to enter the
+    /// basic block.
+    /// @brief Performs abstract execution of the program.
+    class VISIBILITY_HIDDEN Forwards : public InstVisitor<Forwards> {
+      friend class InstVisitor<Forwards>;
+      PredicateSimplifier *PS;
+      DomTreeDFS::Node *DTNode;
+
+    public:
+      ValueNumbering &VN;
+      InequalityGraph &IG;
+      UnreachableBlocks &UB;
+      ValueRanges &VR;
+
+      Forwards(PredicateSimplifier *PS, DomTreeDFS::Node *DTNode)
+        : PS(PS), DTNode(DTNode), VN(*PS->VN), IG(*PS->IG), UB(PS->UB),
+          VR(*PS->VR) {}
+
+      void visitTerminatorInst(TerminatorInst &TI);
+      void visitBranchInst(BranchInst &BI);
+      void visitSwitchInst(SwitchInst &SI);
+
+      void visitAllocaInst(AllocaInst &AI);
+      void visitLoadInst(LoadInst &LI);
+      void visitStoreInst(StoreInst &SI);
+
+      void visitSExtInst(SExtInst &SI);
+      void visitZExtInst(ZExtInst &ZI);
+
+      void visitBinaryOperator(BinaryOperator &BO);
+      void visitICmpInst(ICmpInst &IC);
+    };
+  
+    // Used by terminator instructions to proceed from the current basic
+    // block to the next. Verifies that "current" dominates "next",
+    // then calls visitBasicBlock.
+    void proceedToSuccessors(DomTreeDFS::Node *Current) {
+      for (DomTreeDFS::Node::iterator I = Current->begin(),
+           E = Current->end(); I != E; ++I) {
+        WorkList.push_back(*I);
+      }
+    }
+
+    void proceedToSuccessor(DomTreeDFS::Node *Next) {
+      WorkList.push_back(Next);
+    }
+
+    // Visits each instruction in the basic block.
+    void visitBasicBlock(DomTreeDFS::Node *Node) {
+      BasicBlock *BB = Node->getBlock();
+      DOUT << "Entering Basic Block: " << BB->getName()
+           << " (" << Node->getDFSNumIn() << ")\n";
+      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+        visitInstruction(I++, Node);
+      }
+    }
+
+    // Tries to simplify each Instruction and add new properties.
+    void visitInstruction(Instruction *I, DomTreeDFS::Node *DT) {
+      DOUT << "Considering instruction " << *I << "\n";
+      DEBUG(VN->dump());
+      DEBUG(IG->dump());
+      DEBUG(VR->dump());
+
+      // Sometimes instructions are killed in earlier analysis.
+      if (isInstructionTriviallyDead(I)) {
+        ++NumSimple;
+        modified = true;
+        if (unsigned n = VN->valueNumber(I, DTDFS->getRootNode()))
+          if (VN->value(n) == I) IG->remove(n);
+        VN->remove(I);
+        I->eraseFromParent();
+        return;
+      }
+
+#ifndef NDEBUG
+      // Try to replace the whole instruction.
+      Value *V = VN->canonicalize(I, DT);
+      assert(V == I && "Late instruction canonicalization.");
+      if (V != I) {
+        modified = true;
+        ++NumInstruction;
+        DOUT << "Removing " << *I << ", replacing with " << *V << "\n";
+        if (unsigned n = VN->valueNumber(I, DTDFS->getRootNode()))
+          if (VN->value(n) == I) IG->remove(n);
+        VN->remove(I);
+        I->replaceAllUsesWith(V);
+        I->eraseFromParent();
+        return;
+      }
+
+      // Try to substitute operands.
+      for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+        Value *Oper = I->getOperand(i);
+        Value *V = VN->canonicalize(Oper, DT);
+        assert(V == Oper && "Late operand canonicalization.");
+        if (V != Oper) {
+          modified = true;
+          ++NumVarsReplaced;
+          DOUT << "Resolving " << *I;
+          I->setOperand(i, V);
+          DOUT << " into " << *I;
+        }
+      }
+#endif
+
+      std::string name = I->getParent()->getName();
+      DOUT << "push (%" << name << ")\n";
+      Forwards visit(this, DT);
+      visit.visit(*I);
+      DOUT << "pop (%" << name << ")\n";
+    }
+  };
+
+  bool PredicateSimplifier::runOnFunction(Function &F) {
+    DominatorTree *DT = &getAnalysis<DominatorTree>();
+    DTDFS = new DomTreeDFS(DT);
+    TargetData *TD = &getAnalysis<TargetData>();
+
+    DOUT << "Entering Function: " << F.getName() << "\n";
+
+    modified = false;
+    DomTreeDFS::Node *Root = DTDFS->getRootNode();
+    VN = new ValueNumbering(DTDFS);
+    IG = new InequalityGraph(*VN, Root);
+    VR = new ValueRanges(*VN, TD);
+    WorkList.push_back(Root);
+
+    do {
+      DomTreeDFS::Node *DTNode = WorkList.back();
+      WorkList.pop_back();
+      if (!UB.isDead(DTNode->getBlock())) visitBasicBlock(DTNode);
+    } while (!WorkList.empty());
+
+    delete DTDFS;
+    delete VR;
+    delete IG;
+    delete VN;
+
+    modified |= UB.kill();
+
+    return modified;
+  }
+
+  void PredicateSimplifier::Forwards::visitTerminatorInst(TerminatorInst &TI) {
+    PS->proceedToSuccessors(DTNode);
+  }
+
+  void PredicateSimplifier::Forwards::visitBranchInst(BranchInst &BI) {
+    if (BI.isUnconditional()) {
+      PS->proceedToSuccessors(DTNode);
+      return;
+    }
+
+    Value *Condition = BI.getCondition();
+    BasicBlock *TrueDest  = BI.getSuccessor(0);
+    BasicBlock *FalseDest = BI.getSuccessor(1);
+
+    if (isa<Constant>(Condition) || TrueDest == FalseDest) {
+      PS->proceedToSuccessors(DTNode);
+      return;
+    }
+
+    for (DomTreeDFS::Node::iterator I = DTNode->begin(), E = DTNode->end();
+         I != E; ++I) {
+      BasicBlock *Dest = (*I)->getBlock();
+      DOUT << "Branch thinking about %" << Dest->getName()
+           << "(" << PS->DTDFS->getNodeForBlock(Dest)->getDFSNumIn() << ")\n";
+
+      if (Dest == TrueDest) {
+        DOUT << "(" << DTNode->getBlock()->getName() << ") true set:\n";
+        VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, Dest);
+        VRP.add(ConstantInt::getTrue(), Condition, ICmpInst::ICMP_EQ);
+        VRP.solve();
+        DEBUG(VN.dump());
+        DEBUG(IG.dump());
+        DEBUG(VR.dump());
+      } else if (Dest == FalseDest) {
+        DOUT << "(" << DTNode->getBlock()->getName() << ") false set:\n";
+        VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, Dest);
+        VRP.add(ConstantInt::getFalse(), Condition, ICmpInst::ICMP_EQ);
+        VRP.solve();
+        DEBUG(VN.dump());
+        DEBUG(IG.dump());
+        DEBUG(VR.dump());
+      }
+
+      PS->proceedToSuccessor(*I);
+    }
+  }
+
+  void PredicateSimplifier::Forwards::visitSwitchInst(SwitchInst &SI) {
+    Value *Condition = SI.getCondition();
+
+    // Set the EQProperty in each of the cases BBs, and the NEProperties
+    // in the default BB.
+
+    for (DomTreeDFS::Node::iterator I = DTNode->begin(), E = DTNode->end();
+         I != E; ++I) {
+      BasicBlock *BB = (*I)->getBlock();
+      DOUT << "Switch thinking about BB %" << BB->getName()
+           << "(" << PS->DTDFS->getNodeForBlock(BB)->getDFSNumIn() << ")\n";
+
+      VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, BB);
+      if (BB == SI.getDefaultDest()) {
+        for (unsigned i = 1, e = SI.getNumCases(); i < e; ++i)
+          if (SI.getSuccessor(i) != BB)
+            VRP.add(Condition, SI.getCaseValue(i), ICmpInst::ICMP_NE);
+        VRP.solve();
+      } else if (ConstantInt *CI = SI.findCaseDest(BB)) {
+        VRP.add(Condition, CI, ICmpInst::ICMP_EQ);
+        VRP.solve();
+      }
+      PS->proceedToSuccessor(*I);
+    }
+  }
+
+  void PredicateSimplifier::Forwards::visitAllocaInst(AllocaInst &AI) {
+    VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &AI);
+    VRP.add(Constant::getNullValue(AI.getType()), &AI, ICmpInst::ICMP_NE);
+    VRP.solve();
+  }
+
+  void PredicateSimplifier::Forwards::visitLoadInst(LoadInst &LI) {
+    Value *Ptr = LI.getPointerOperand();
+    // avoid "load i8* null" -> null NE null.
+    if (isa<Constant>(Ptr)) return;
+
+    VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &LI);
+    VRP.add(Constant::getNullValue(Ptr->getType()), Ptr, ICmpInst::ICMP_NE);
+    VRP.solve();
+  }
+
+  void PredicateSimplifier::Forwards::visitStoreInst(StoreInst &SI) {
+    Value *Ptr = SI.getPointerOperand();
+    if (isa<Constant>(Ptr)) return;
+
+    VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &SI);
+    VRP.add(Constant::getNullValue(Ptr->getType()), Ptr, ICmpInst::ICMP_NE);
+    VRP.solve();
+  }
+
+  void PredicateSimplifier::Forwards::visitSExtInst(SExtInst &SI) {
+    VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &SI);
+    uint32_t SrcBitWidth = cast<IntegerType>(SI.getSrcTy())->getBitWidth();
+    uint32_t DstBitWidth = cast<IntegerType>(SI.getDestTy())->getBitWidth();
+    APInt Min(APInt::getHighBitsSet(DstBitWidth, DstBitWidth-SrcBitWidth+1));
+    APInt Max(APInt::getLowBitsSet(DstBitWidth, SrcBitWidth-1));
+    VRP.add(ConstantInt::get(Min), &SI, ICmpInst::ICMP_SLE);
+    VRP.add(ConstantInt::get(Max), &SI, ICmpInst::ICMP_SGE);
+    VRP.solve();
+  }
+
+  void PredicateSimplifier::Forwards::visitZExtInst(ZExtInst &ZI) {
+    VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &ZI);
+    uint32_t SrcBitWidth = cast<IntegerType>(ZI.getSrcTy())->getBitWidth();
+    uint32_t DstBitWidth = cast<IntegerType>(ZI.getDestTy())->getBitWidth();
+    APInt Max(APInt::getLowBitsSet(DstBitWidth, SrcBitWidth));
+    VRP.add(ConstantInt::get(Max), &ZI, ICmpInst::ICMP_UGE);
+    VRP.solve();
+  }
+
+  void PredicateSimplifier::Forwards::visitBinaryOperator(BinaryOperator &BO) {
+    Instruction::BinaryOps ops = BO.getOpcode();
+
+    switch (ops) {
+    default: break;
+      case Instruction::URem:
+      case Instruction::SRem:
+      case Instruction::UDiv:
+      case Instruction::SDiv: {
+        Value *Divisor = BO.getOperand(1);
+        VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &BO);
+        VRP.add(Constant::getNullValue(Divisor->getType()), Divisor,
+                ICmpInst::ICMP_NE);
+        VRP.solve();
+        break;
+      }
+    }
+
+    switch (ops) {
+      default: break;
+      case Instruction::Shl: {
+        VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &BO);
+        VRP.add(&BO, BO.getOperand(0), ICmpInst::ICMP_UGE);
+        VRP.solve();
+      } break;
+      case Instruction::AShr: {
+        VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &BO);
+        VRP.add(&BO, BO.getOperand(0), ICmpInst::ICMP_SLE);
+        VRP.solve();
+      } break;
+      case Instruction::LShr:
+      case Instruction::UDiv: {
+        VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &BO);
+        VRP.add(&BO, BO.getOperand(0), ICmpInst::ICMP_ULE);
+        VRP.solve();
+      } break;
+      case Instruction::URem: {
+        VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &BO);
+        VRP.add(&BO, BO.getOperand(1), ICmpInst::ICMP_ULE);
+        VRP.solve();
+      } break;
+      case Instruction::And: {
+        VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &BO);
+        VRP.add(&BO, BO.getOperand(0), ICmpInst::ICMP_ULE);
+        VRP.add(&BO, BO.getOperand(1), ICmpInst::ICMP_ULE);
+        VRP.solve();
+      } break;
+      case Instruction::Or: {
+        VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &BO);
+        VRP.add(&BO, BO.getOperand(0), ICmpInst::ICMP_UGE);
+        VRP.add(&BO, BO.getOperand(1), ICmpInst::ICMP_UGE);
+        VRP.solve();
+      } break;
+    }
+  }
+
+  void PredicateSimplifier::Forwards::visitICmpInst(ICmpInst &IC) {
+    // If possible, squeeze the ICmp predicate into something simpler.
+    // Eg., if x = [0, 4) and we're being asked icmp uge %x, 3 then change
+    // the predicate to eq.
+
+    // XXX: once we do full PHI handling, modifying the instruction in the
+    // Forwards visitor will cause missed optimizations.
+
+    ICmpInst::Predicate Pred = IC.getPredicate();
+
+    switch (Pred) {
+      default: break;
+      case ICmpInst::ICMP_ULE: Pred = ICmpInst::ICMP_ULT; break;
+      case ICmpInst::ICMP_UGE: Pred = ICmpInst::ICMP_UGT; break;
+      case ICmpInst::ICMP_SLE: Pred = ICmpInst::ICMP_SLT; break;
+      case ICmpInst::ICMP_SGE: Pred = ICmpInst::ICMP_SGT; break;
+    }
+    if (Pred != IC.getPredicate()) {
+      VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &IC);
+      if (VRP.isRelatedBy(IC.getOperand(1), IC.getOperand(0),
+                          ICmpInst::ICMP_NE)) {
+        ++NumSnuggle;
+        PS->modified = true;
+        IC.setPredicate(Pred);
+      }
+    }
+
+    Pred = IC.getPredicate();
+
+    if (ConstantInt *Op1 = dyn_cast<ConstantInt>(IC.getOperand(1))) {
+      ConstantInt *NextVal = 0;
+      switch (Pred) {
+        default: break;
+        case ICmpInst::ICMP_SLT:
+        case ICmpInst::ICMP_ULT:
+          if (Op1->getValue() != 0)
+            NextVal = ConstantInt::get(Op1->getValue()-1);
+         break;
+        case ICmpInst::ICMP_SGT:
+        case ICmpInst::ICMP_UGT:
+          if (!Op1->getValue().isAllOnesValue())
+            NextVal = ConstantInt::get(Op1->getValue()+1);
+         break;
+      }
+
+      if (NextVal) {
+        VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &IC);
+        if (VRP.isRelatedBy(IC.getOperand(0), NextVal,
+                            ICmpInst::getInversePredicate(Pred))) {
+          ICmpInst *NewIC = new ICmpInst(ICmpInst::ICMP_EQ, IC.getOperand(0),
+                                         NextVal, "", &IC);
+          NewIC->takeName(&IC);
+          IC.replaceAllUsesWith(NewIC);
+
+          // XXX: prove this isn't necessary
+          if (unsigned n = VN.valueNumber(&IC, PS->DTDFS->getRootNode()))
+            if (VN.value(n) == &IC) IG.remove(n);
+          VN.remove(&IC);
+
+          IC.eraseFromParent();
+          ++NumSnuggle;
+          PS->modified = true;
+        }
+      }
+    }
+  }
+}
+
+char PredicateSimplifier::ID = 0;
+static RegisterPass<PredicateSimplifier>
+X("predsimplify", "Predicate Simplifier");
+
+FunctionPass *llvm::createPredicateSimplifierPass() {
+  return new PredicateSimplifier();
+}
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
new file mode 100644
index 0000000..293cf92
--- /dev/null
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -0,0 +1,896 @@
+//===- Reassociate.cpp - Reassociate binary expressions -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass reassociates commutative expressions in an order that is designed
+// to promote better constant propagation, GCSE, LICM, PRE...
+//
+// For example: 4 + (x + 5) -> x + (4 + 5)
+//
+// In the implementation of this algorithm, constants are assigned rank = 0,
+// function arguments are rank = 1, and other values are assigned ranks
+// corresponding to the reverse post order traversal of current function
+// (starting at 2), which effectively gives values in deep loops higher rank
+// than values not in loops.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "reassociate"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include <algorithm>
+#include <map>
+using namespace llvm;
+
+STATISTIC(NumLinear , "Number of insts linearized");
+STATISTIC(NumChanged, "Number of insts reassociated");
+STATISTIC(NumAnnihil, "Number of expr tree annihilated");
+STATISTIC(NumFactor , "Number of multiplies factored");
+
+namespace {
+  struct VISIBILITY_HIDDEN ValueEntry {
+    unsigned Rank;
+    Value *Op;
+    ValueEntry(unsigned R, Value *O) : Rank(R), Op(O) {}
+  };
+  inline bool operator<(const ValueEntry &LHS, const ValueEntry &RHS) {
+    return LHS.Rank > RHS.Rank;   // Sort so that highest rank goes to start.
+  }
+}
+
+#ifndef NDEBUG
+/// PrintOps - Print out the expression identified in the Ops list.
+///
+static void PrintOps(Instruction *I, const std::vector<ValueEntry> &Ops) {
+  Module *M = I->getParent()->getParent()->getParent();
+  cerr << Instruction::getOpcodeName(I->getOpcode()) << " "
+       << *Ops[0].Op->getType();
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+    WriteAsOperand(*cerr.stream() << " ", Ops[i].Op, false, M);
+    cerr << "," << Ops[i].Rank;
+  }
+}
+#endif
+  
+namespace {
+  class VISIBILITY_HIDDEN Reassociate : public FunctionPass {
+    std::map<BasicBlock*, unsigned> RankMap;
+    std::map<AssertingVH<>, unsigned> ValueRankMap;
+    bool MadeChange;
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    Reassociate() : FunctionPass(&ID) {}
+
+    bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+    }
+  private:
+    void BuildRankMap(Function &F);
+    unsigned getRank(Value *V);
+    void ReassociateExpression(BinaryOperator *I);
+    void RewriteExprTree(BinaryOperator *I, std::vector<ValueEntry> &Ops,
+                         unsigned Idx = 0);
+    Value *OptimizeExpression(BinaryOperator *I, std::vector<ValueEntry> &Ops);
+    void LinearizeExprTree(BinaryOperator *I, std::vector<ValueEntry> &Ops);
+    void LinearizeExpr(BinaryOperator *I);
+    Value *RemoveFactorFromExpression(Value *V, Value *Factor);
+    void ReassociateBB(BasicBlock *BB);
+    
+    void RemoveDeadBinaryOp(Value *V);
+  };
+}
+
+char Reassociate::ID = 0;
+static RegisterPass<Reassociate> X("reassociate", "Reassociate expressions");
+
+// Public interface to the Reassociate pass
+FunctionPass *llvm::createReassociatePass() { return new Reassociate(); }
+
+void Reassociate::RemoveDeadBinaryOp(Value *V) {
+  Instruction *Op = dyn_cast<Instruction>(V);
+  if (!Op || !isa<BinaryOperator>(Op) || !isa<CmpInst>(Op) || !Op->use_empty())
+    return;
+  
+  Value *LHS = Op->getOperand(0), *RHS = Op->getOperand(1);
+  RemoveDeadBinaryOp(LHS);
+  RemoveDeadBinaryOp(RHS);
+}
+
+
+static bool isUnmovableInstruction(Instruction *I) {
+  if (I->getOpcode() == Instruction::PHI ||
+      I->getOpcode() == Instruction::Alloca ||
+      I->getOpcode() == Instruction::Load ||
+      I->getOpcode() == Instruction::Malloc ||
+      I->getOpcode() == Instruction::Invoke ||
+      (I->getOpcode() == Instruction::Call &&
+       !isa<DbgInfoIntrinsic>(I)) ||
+      I->getOpcode() == Instruction::UDiv || 
+      I->getOpcode() == Instruction::SDiv ||
+      I->getOpcode() == Instruction::FDiv ||
+      I->getOpcode() == Instruction::URem ||
+      I->getOpcode() == Instruction::SRem ||
+      I->getOpcode() == Instruction::FRem)
+    return true;
+  return false;
+}
+
+void Reassociate::BuildRankMap(Function &F) {
+  unsigned i = 2;
+
+  // Assign distinct ranks to function arguments
+  for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I)
+    ValueRankMap[&*I] = ++i;
+
+  ReversePostOrderTraversal<Function*> RPOT(&F);
+  for (ReversePostOrderTraversal<Function*>::rpo_iterator I = RPOT.begin(),
+         E = RPOT.end(); I != E; ++I) {
+    BasicBlock *BB = *I;
+    unsigned BBRank = RankMap[BB] = ++i << 16;
+
+    // Walk the basic block, adding precomputed ranks for any instructions that
+    // we cannot move.  This ensures that the ranks for these instructions are
+    // all different in the block.
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+      if (isUnmovableInstruction(I))
+        ValueRankMap[&*I] = ++BBRank;
+  }
+}
+
+unsigned Reassociate::getRank(Value *V) {
+  if (isa<Argument>(V)) return ValueRankMap[V];   // Function argument...
+
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (I == 0) return 0;  // Otherwise it's a global or constant, rank 0.
+
+  unsigned &CachedRank = ValueRankMap[I];
+  if (CachedRank) return CachedRank;    // Rank already known?
+
+  // If this is an expression, return the 1+MAX(rank(LHS), rank(RHS)) so that
+  // we can reassociate expressions for code motion!  Since we do not recurse
+  // for PHI nodes, we cannot have infinite recursion here, because there
+  // cannot be loops in the value graph that do not go through PHI nodes.
+  unsigned Rank = 0, MaxRank = RankMap[I->getParent()];
+  for (unsigned i = 0, e = I->getNumOperands();
+       i != e && Rank != MaxRank; ++i)
+    Rank = std::max(Rank, getRank(I->getOperand(i)));
+
+  // If this is a not or neg instruction, do not count it for rank.  This
+  // assures us that X and ~X will have the same rank.
+  if (!I->getType()->isInteger() ||
+      (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I)))
+    ++Rank;
+
+  //DOUT << "Calculated Rank[" << V->getName() << "] = "
+  //     << Rank << "\n";
+
+  return CachedRank = Rank;
+}
+
+/// isReassociableOp - Return true if V is an instruction of the specified
+/// opcode and if it only has one use.
+static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) {
+  if ((V->hasOneUse() || V->use_empty()) && isa<Instruction>(V) &&
+      cast<Instruction>(V)->getOpcode() == Opcode)
+    return cast<BinaryOperator>(V);
+  return 0;
+}
+
+/// LowerNegateToMultiply - Replace 0-X with X*-1.
+///
+static Instruction *LowerNegateToMultiply(Instruction *Neg,
+                              std::map<AssertingVH<>, unsigned> &ValueRankMap) {
+  Constant *Cst = ConstantInt::getAllOnesValue(Neg->getType());
+
+  Instruction *Res = BinaryOperator::CreateMul(Neg->getOperand(1), Cst, "",Neg);
+  ValueRankMap.erase(Neg);
+  Res->takeName(Neg);
+  Neg->replaceAllUsesWith(Res);
+  Neg->eraseFromParent();
+  return Res;
+}
+
+// Given an expression of the form '(A+B)+(D+C)', turn it into '(((A+B)+C)+D)'.
+// Note that if D is also part of the expression tree that we recurse to
+// linearize it as well.  Besides that case, this does not recurse into A,B, or
+// C.
+void Reassociate::LinearizeExpr(BinaryOperator *I) {
+  BinaryOperator *LHS = cast<BinaryOperator>(I->getOperand(0));
+  BinaryOperator *RHS = cast<BinaryOperator>(I->getOperand(1));
+  assert(isReassociableOp(LHS, I->getOpcode()) &&
+         isReassociableOp(RHS, I->getOpcode()) &&
+         "Not an expression that needs linearization?");
+
+  DOUT << "Linear" << *LHS << *RHS << *I;
+
+  // Move the RHS instruction to live immediately before I, avoiding breaking
+  // dominator properties.
+  RHS->moveBefore(I);
+
+  // Move operands around to do the linearization.
+  I->setOperand(1, RHS->getOperand(0));
+  RHS->setOperand(0, LHS);
+  I->setOperand(0, RHS);
+
+  ++NumLinear;
+  MadeChange = true;
+  DOUT << "Linearized: " << *I;
+
+  // If D is part of this expression tree, tail recurse.
+  if (isReassociableOp(I->getOperand(1), I->getOpcode()))
+    LinearizeExpr(I);
+}
+
+
+/// LinearizeExprTree - Given an associative binary expression tree, traverse
+/// all of the uses putting it into canonical form.  This forces a left-linear
+/// form of the the expression (((a+b)+c)+d), and collects information about the
+/// rank of the non-tree operands.
+///
+/// NOTE: These intentionally destroys the expression tree operands (turning
+/// them into undef values) to reduce #uses of the values.  This means that the
+/// caller MUST use something like RewriteExprTree to put the values back in.
+///
+void Reassociate::LinearizeExprTree(BinaryOperator *I,
+                                    std::vector<ValueEntry> &Ops) {
+  Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+  unsigned Opcode = I->getOpcode();
+
+  // First step, linearize the expression if it is in ((A+B)+(C+D)) form.
+  BinaryOperator *LHSBO = isReassociableOp(LHS, Opcode);
+  BinaryOperator *RHSBO = isReassociableOp(RHS, Opcode);
+
+  // If this is a multiply expression tree and it contains internal negations,
+  // transform them into multiplies by -1 so they can be reassociated.
+  if (I->getOpcode() == Instruction::Mul) {
+    if (!LHSBO && LHS->hasOneUse() && BinaryOperator::isNeg(LHS)) {
+      LHS = LowerNegateToMultiply(cast<Instruction>(LHS), ValueRankMap);
+      LHSBO = isReassociableOp(LHS, Opcode);
+    }
+    if (!RHSBO && RHS->hasOneUse() && BinaryOperator::isNeg(RHS)) {
+      RHS = LowerNegateToMultiply(cast<Instruction>(RHS), ValueRankMap);
+      RHSBO = isReassociableOp(RHS, Opcode);
+    }
+  }
+
+  if (!LHSBO) {
+    if (!RHSBO) {
+      // Neither the LHS or RHS as part of the tree, thus this is a leaf.  As
+      // such, just remember these operands and their rank.
+      Ops.push_back(ValueEntry(getRank(LHS), LHS));
+      Ops.push_back(ValueEntry(getRank(RHS), RHS));
+      
+      // Clear the leaves out.
+      I->setOperand(0, UndefValue::get(I->getType()));
+      I->setOperand(1, UndefValue::get(I->getType()));
+      return;
+    } else {
+      // Turn X+(Y+Z) -> (Y+Z)+X
+      std::swap(LHSBO, RHSBO);
+      std::swap(LHS, RHS);
+      bool Success = !I->swapOperands();
+      assert(Success && "swapOperands failed");
+      Success = false;
+      MadeChange = true;
+    }
+  } else if (RHSBO) {
+    // Turn (A+B)+(C+D) -> (((A+B)+C)+D).  This guarantees the the RHS is not
+    // part of the expression tree.
+    LinearizeExpr(I);
+    LHS = LHSBO = cast<BinaryOperator>(I->getOperand(0));
+    RHS = I->getOperand(1);
+    RHSBO = 0;
+  }
+
+  // Okay, now we know that the LHS is a nested expression and that the RHS is
+  // not.  Perform reassociation.
+  assert(!isReassociableOp(RHS, Opcode) && "LinearizeExpr failed!");
+
+  // Move LHS right before I to make sure that the tree expression dominates all
+  // values.
+  LHSBO->moveBefore(I);
+
+  // Linearize the expression tree on the LHS.
+  LinearizeExprTree(LHSBO, Ops);
+
+  // Remember the RHS operand and its rank.
+  Ops.push_back(ValueEntry(getRank(RHS), RHS));
+  
+  // Clear the RHS leaf out.
+  I->setOperand(1, UndefValue::get(I->getType()));
+}
+
+// RewriteExprTree - Now that the operands for this expression tree are
+// linearized and optimized, emit them in-order.  This function is written to be
+// tail recursive.
+void Reassociate::RewriteExprTree(BinaryOperator *I,
+                                  std::vector<ValueEntry> &Ops,
+                                  unsigned i) {
+  if (i+2 == Ops.size()) {
+    if (I->getOperand(0) != Ops[i].Op ||
+        I->getOperand(1) != Ops[i+1].Op) {
+      Value *OldLHS = I->getOperand(0);
+      DOUT << "RA: " << *I;
+      I->setOperand(0, Ops[i].Op);
+      I->setOperand(1, Ops[i+1].Op);
+      DOUT << "TO: " << *I;
+      MadeChange = true;
+      ++NumChanged;
+      
+      // If we reassociated a tree to fewer operands (e.g. (1+a+2) -> (a+3)
+      // delete the extra, now dead, nodes.
+      RemoveDeadBinaryOp(OldLHS);
+    }
+    return;
+  }
+  assert(i+2 < Ops.size() && "Ops index out of range!");
+
+  if (I->getOperand(1) != Ops[i].Op) {
+    DOUT << "RA: " << *I;
+    I->setOperand(1, Ops[i].Op);
+    DOUT << "TO: " << *I;
+    MadeChange = true;
+    ++NumChanged;
+  }
+  
+  BinaryOperator *LHS = cast<BinaryOperator>(I->getOperand(0));
+  assert(LHS->getOpcode() == I->getOpcode() &&
+         "Improper expression tree!");
+  
+  // Compactify the tree instructions together with each other to guarantee
+  // that the expression tree is dominated by all of Ops.
+  LHS->moveBefore(I);
+  RewriteExprTree(LHS, Ops, i+1);
+}
+
+
+
+// NegateValue - Insert instructions before the instruction pointed to by BI,
+// that computes the negative version of the value specified.  The negative
+// version of the value is returned, and BI is left pointing at the instruction
+// that should be processed next by the reassociation pass.
+//
+static Value *NegateValue(Value *V, Instruction *BI) {
+  // We are trying to expose opportunity for reassociation.  One of the things
+  // that we want to do to achieve this is to push a negation as deep into an
+  // expression chain as possible, to expose the add instructions.  In practice,
+  // this means that we turn this:
+  //   X = -(A+12+C+D)   into    X = -A + -12 + -C + -D = -12 + -A + -C + -D
+  // so that later, a: Y = 12+X could get reassociated with the -12 to eliminate
+  // the constants.  We assume that instcombine will clean up the mess later if
+  // we introduce tons of unnecessary negation instructions...
+  //
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    if (I->getOpcode() == Instruction::Add && I->hasOneUse()) {
+      // Push the negates through the add.
+      I->setOperand(0, NegateValue(I->getOperand(0), BI));
+      I->setOperand(1, NegateValue(I->getOperand(1), BI));
+
+      // We must move the add instruction here, because the neg instructions do
+      // not dominate the old add instruction in general.  By moving it, we are
+      // assured that the neg instructions we just inserted dominate the 
+      // instruction we are about to insert after them.
+      //
+      I->moveBefore(BI);
+      I->setName(I->getName()+".neg");
+      return I;
+    }
+
+  // Insert a 'neg' instruction that subtracts the value from zero to get the
+  // negation.
+  //
+  return BinaryOperator::CreateNeg(V, V->getName() + ".neg", BI);
+}
+
+/// ShouldBreakUpSubtract - Return true if we should break up this subtract of
+/// X-Y into (X + -Y).
+static bool ShouldBreakUpSubtract(Instruction *Sub) {
+  // If this is a negation, we can't split it up!
+  if (BinaryOperator::isNeg(Sub))
+    return false;
+  
+  // Don't bother to break this up unless either the LHS is an associable add or
+  // subtract or if this is only used by one.
+  if (isReassociableOp(Sub->getOperand(0), Instruction::Add) ||
+      isReassociableOp(Sub->getOperand(0), Instruction::Sub))
+    return true;
+  if (isReassociableOp(Sub->getOperand(1), Instruction::Add) ||
+      isReassociableOp(Sub->getOperand(1), Instruction::Sub))
+    return true;
+  if (Sub->hasOneUse() && 
+      (isReassociableOp(Sub->use_back(), Instruction::Add) ||
+       isReassociableOp(Sub->use_back(), Instruction::Sub)))
+    return true;
+    
+  return false;
+}
+
+/// BreakUpSubtract - If we have (X-Y), and if either X is an add, or if this is
+/// only used by an add, transform this into (X+(0-Y)) to promote better
+/// reassociation.
+static Instruction *BreakUpSubtract(Instruction *Sub,
+                              std::map<AssertingVH<>, unsigned> &ValueRankMap) {
+  // Convert a subtract into an add and a neg instruction... so that sub
+  // instructions can be commuted with other add instructions...
+  //
+  // Calculate the negative value of Operand 1 of the sub instruction...
+  // and set it as the RHS of the add instruction we just made...
+  //
+  Value *NegVal = NegateValue(Sub->getOperand(1), Sub);
+  Instruction *New =
+    BinaryOperator::CreateAdd(Sub->getOperand(0), NegVal, "", Sub);
+  New->takeName(Sub);
+
+  // Everyone now refers to the add instruction.
+  ValueRankMap.erase(Sub);
+  Sub->replaceAllUsesWith(New);
+  Sub->eraseFromParent();
+
+  DOUT << "Negated: " << *New;
+  return New;
+}
+
+/// ConvertShiftToMul - If this is a shift of a reassociable multiply or is used
+/// by one, change this into a multiply by a constant to assist with further
+/// reassociation.
+static Instruction *ConvertShiftToMul(Instruction *Shl, 
+                              std::map<AssertingVH<>, unsigned> &ValueRankMap) {
+  // If an operand of this shift is a reassociable multiply, or if the shift
+  // is used by a reassociable multiply or add, turn into a multiply.
+  if (isReassociableOp(Shl->getOperand(0), Instruction::Mul) ||
+      (Shl->hasOneUse() && 
+       (isReassociableOp(Shl->use_back(), Instruction::Mul) ||
+        isReassociableOp(Shl->use_back(), Instruction::Add)))) {
+    Constant *MulCst = ConstantInt::get(Shl->getType(), 1);
+    MulCst = ConstantExpr::getShl(MulCst, cast<Constant>(Shl->getOperand(1)));
+    
+    Instruction *Mul = BinaryOperator::CreateMul(Shl->getOperand(0), MulCst,
+                                                 "", Shl);
+    ValueRankMap.erase(Shl);
+    Mul->takeName(Shl);
+    Shl->replaceAllUsesWith(Mul);
+    Shl->eraseFromParent();
+    return Mul;
+  }
+  return 0;
+}
+
+// Scan backwards and forwards among values with the same rank as element i to
+// see if X exists.  If X does not exist, return i.
+static unsigned FindInOperandList(std::vector<ValueEntry> &Ops, unsigned i,
+                                  Value *X) {
+  unsigned XRank = Ops[i].Rank;
+  unsigned e = Ops.size();
+  for (unsigned j = i+1; j != e && Ops[j].Rank == XRank; ++j)
+    if (Ops[j].Op == X)
+      return j;
+  // Scan backwards
+  for (unsigned j = i-1; j != ~0U && Ops[j].Rank == XRank; --j)
+    if (Ops[j].Op == X)
+      return j;
+  return i;
+}
+
+/// EmitAddTreeOfValues - Emit a tree of add instructions, summing Ops together
+/// and returning the result.  Insert the tree before I.
+static Value *EmitAddTreeOfValues(Instruction *I, std::vector<Value*> &Ops) {
+  if (Ops.size() == 1) return Ops.back();
+  
+  Value *V1 = Ops.back();
+  Ops.pop_back();
+  Value *V2 = EmitAddTreeOfValues(I, Ops);
+  return BinaryOperator::CreateAdd(V2, V1, "tmp", I);
+}
+
+/// RemoveFactorFromExpression - If V is an expression tree that is a 
+/// multiplication sequence, and if this sequence contains a multiply by Factor,
+/// remove Factor from the tree and return the new tree.
+Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) {
+  BinaryOperator *BO = isReassociableOp(V, Instruction::Mul);
+  if (!BO) return 0;
+  
+  std::vector<ValueEntry> Factors;
+  LinearizeExprTree(BO, Factors);
+
+  bool FoundFactor = false;
+  for (unsigned i = 0, e = Factors.size(); i != e; ++i)
+    if (Factors[i].Op == Factor) {
+      FoundFactor = true;
+      Factors.erase(Factors.begin()+i);
+      break;
+    }
+  if (!FoundFactor) {
+    // Make sure to restore the operands to the expression tree.
+    RewriteExprTree(BO, Factors);
+    return 0;
+  }
+  
+  if (Factors.size() == 1) return Factors[0].Op;
+  
+  RewriteExprTree(BO, Factors);
+  return BO;
+}
+
+/// FindSingleUseMultiplyFactors - If V is a single-use multiply, recursively
+/// add its operands as factors, otherwise add V to the list of factors.
+static void FindSingleUseMultiplyFactors(Value *V,
+                                         std::vector<Value*> &Factors) {
+  BinaryOperator *BO;
+  if ((!V->hasOneUse() && !V->use_empty()) ||
+      !(BO = dyn_cast<BinaryOperator>(V)) ||
+      BO->getOpcode() != Instruction::Mul) {
+    Factors.push_back(V);
+    return;
+  }
+  
+  // Otherwise, add the LHS and RHS to the list of factors.
+  FindSingleUseMultiplyFactors(BO->getOperand(1), Factors);
+  FindSingleUseMultiplyFactors(BO->getOperand(0), Factors);
+}
+
+
+
+Value *Reassociate::OptimizeExpression(BinaryOperator *I,
+                                       std::vector<ValueEntry> &Ops) {
+  // Now that we have the linearized expression tree, try to optimize it.
+  // Start by folding any constants that we found.
+  bool IterateOptimization = false;
+  if (Ops.size() == 1) return Ops[0].Op;
+
+  unsigned Opcode = I->getOpcode();
+  
+  if (Constant *V1 = dyn_cast<Constant>(Ops[Ops.size()-2].Op))
+    if (Constant *V2 = dyn_cast<Constant>(Ops.back().Op)) {
+      Ops.pop_back();
+      Ops.back().Op = ConstantExpr::get(Opcode, V1, V2);
+      return OptimizeExpression(I, Ops);
+    }
+
+  // Check for destructive annihilation due to a constant being used.
+  if (ConstantInt *CstVal = dyn_cast<ConstantInt>(Ops.back().Op))
+    switch (Opcode) {
+    default: break;
+    case Instruction::And:
+      if (CstVal->isZero()) {                // ... & 0 -> 0
+        ++NumAnnihil;
+        return CstVal;
+      } else if (CstVal->isAllOnesValue()) { // ... & -1 -> ...
+        Ops.pop_back();
+      }
+      break;
+    case Instruction::Mul:
+      if (CstVal->isZero()) {                // ... * 0 -> 0
+        ++NumAnnihil;
+        return CstVal;
+      } else if (cast<ConstantInt>(CstVal)->isOne()) {
+        Ops.pop_back();                      // ... * 1 -> ...
+      }
+      break;
+    case Instruction::Or:
+      if (CstVal->isAllOnesValue()) {        // ... | -1 -> -1
+        ++NumAnnihil;
+        return CstVal;
+      }
+      // FALLTHROUGH!
+    case Instruction::Add:
+    case Instruction::Xor:
+      if (CstVal->isZero())                  // ... [|^+] 0 -> ...
+        Ops.pop_back();
+      break;
+    }
+  if (Ops.size() == 1) return Ops[0].Op;
+
+  // Handle destructive annihilation do to identities between elements in the
+  // argument list here.
+  switch (Opcode) {
+  default: break;
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    // Scan the operand lists looking for X and ~X pairs, along with X,X pairs.
+    // If we find any, we can simplify the expression. X&~X == 0, X|~X == -1.
+    for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+      // First, check for X and ~X in the operand list.
+      assert(i < Ops.size());
+      if (BinaryOperator::isNot(Ops[i].Op)) {    // Cannot occur for ^.
+        Value *X = BinaryOperator::getNotArgument(Ops[i].Op);
+        unsigned FoundX = FindInOperandList(Ops, i, X);
+        if (FoundX != i) {
+          if (Opcode == Instruction::And) {   // ...&X&~X = 0
+            ++NumAnnihil;
+            return Constant::getNullValue(X->getType());
+          } else if (Opcode == Instruction::Or) {   // ...|X|~X = -1
+            ++NumAnnihil;
+            return ConstantInt::getAllOnesValue(X->getType());
+          }
+        }
+      }
+
+      // Next, check for duplicate pairs of values, which we assume are next to
+      // each other, due to our sorting criteria.
+      assert(i < Ops.size());
+      if (i+1 != Ops.size() && Ops[i+1].Op == Ops[i].Op) {
+        if (Opcode == Instruction::And || Opcode == Instruction::Or) {
+          // Drop duplicate values.
+          Ops.erase(Ops.begin()+i);
+          --i; --e;
+          IterateOptimization = true;
+          ++NumAnnihil;
+        } else {
+          assert(Opcode == Instruction::Xor);
+          if (e == 2) {
+            ++NumAnnihil;
+            return Constant::getNullValue(Ops[0].Op->getType());
+          }
+          // ... X^X -> ...
+          Ops.erase(Ops.begin()+i, Ops.begin()+i+2);
+          i -= 1; e -= 2;
+          IterateOptimization = true;
+          ++NumAnnihil;
+        }
+      }
+    }
+    break;
+
+  case Instruction::Add:
+    // Scan the operand lists looking for X and -X pairs.  If we find any, we
+    // can simplify the expression. X+-X == 0.
+    for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+      assert(i < Ops.size());
+      // Check for X and -X in the operand list.
+      if (BinaryOperator::isNeg(Ops[i].Op)) {
+        Value *X = BinaryOperator::getNegArgument(Ops[i].Op);
+        unsigned FoundX = FindInOperandList(Ops, i, X);
+        if (FoundX != i) {
+          // Remove X and -X from the operand list.
+          if (Ops.size() == 2) {
+            ++NumAnnihil;
+            return Constant::getNullValue(X->getType());
+          } else {
+            Ops.erase(Ops.begin()+i);
+            if (i < FoundX)
+              --FoundX;
+            else
+              --i;   // Need to back up an extra one.
+            Ops.erase(Ops.begin()+FoundX);
+            IterateOptimization = true;
+            ++NumAnnihil;
+            --i;     // Revisit element.
+            e -= 2;  // Removed two elements.
+          }
+        }
+      }
+    }
+    
+
+    // Scan the operand list, checking to see if there are any common factors
+    // between operands.  Consider something like A*A+A*B*C+D.  We would like to
+    // reassociate this to A*(A+B*C)+D, which reduces the number of multiplies.
+    // To efficiently find this, we count the number of times a factor occurs
+    // for any ADD operands that are MULs.
+    std::map<Value*, unsigned> FactorOccurrences;
+    unsigned MaxOcc = 0;
+    Value *MaxOccVal = 0;
+    for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+      if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(Ops[i].Op)) {
+        if (BOp->getOpcode() == Instruction::Mul && BOp->use_empty()) {
+          // Compute all of the factors of this added value.
+          std::vector<Value*> Factors;
+          FindSingleUseMultiplyFactors(BOp, Factors);
+          assert(Factors.size() > 1 && "Bad linearize!");
+
+          // Add one to FactorOccurrences for each unique factor in this op.
+          if (Factors.size() == 2) {
+            unsigned Occ = ++FactorOccurrences[Factors[0]];
+            if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factors[0]; }
+            if (Factors[0] != Factors[1]) {   // Don't double count A*A.
+              Occ = ++FactorOccurrences[Factors[1]];
+              if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factors[1]; }
+            }
+          } else {
+            std::set<Value*> Duplicates;
+            for (unsigned i = 0, e = Factors.size(); i != e; ++i) {
+              if (Duplicates.insert(Factors[i]).second) {
+                unsigned Occ = ++FactorOccurrences[Factors[i]];
+                if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factors[i]; }
+              }
+            }
+          }
+        }
+      }
+    }
+
+    // If any factor occurred more than one time, we can pull it out.
+    if (MaxOcc > 1) {
+      DOUT << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << "\n";
+      
+      // Create a new instruction that uses the MaxOccVal twice.  If we don't do
+      // this, we could otherwise run into situations where removing a factor
+      // from an expression will drop a use of maxocc, and this can cause 
+      // RemoveFactorFromExpression on successive values to behave differently.
+      Instruction *DummyInst = BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal);
+      std::vector<Value*> NewMulOps;
+      for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+        if (Value *V = RemoveFactorFromExpression(Ops[i].Op, MaxOccVal)) {
+          NewMulOps.push_back(V);
+          Ops.erase(Ops.begin()+i);
+          --i; --e;
+        }
+      }
+      
+      // No need for extra uses anymore.
+      delete DummyInst;
+
+      unsigned NumAddedValues = NewMulOps.size();
+      Value *V = EmitAddTreeOfValues(I, NewMulOps);
+      Value *V2 = BinaryOperator::CreateMul(V, MaxOccVal, "tmp", I);
+
+      // Now that we have inserted V and its sole use, optimize it. This allows
+      // us to handle cases that require multiple factoring steps, such as this:
+      // A*A*B + A*A*C   -->   A*(A*B+A*C)   -->   A*(A*(B+C))
+      if (NumAddedValues > 1)
+        ReassociateExpression(cast<BinaryOperator>(V));
+      
+      ++NumFactor;
+      
+      if (Ops.empty())
+        return V2;
+
+      // Add the new value to the list of things being added.
+      Ops.insert(Ops.begin(), ValueEntry(getRank(V2), V2));
+      
+      // Rewrite the tree so that there is now a use of V.
+      RewriteExprTree(I, Ops);
+      return OptimizeExpression(I, Ops);
+    }
+    break;
+  //case Instruction::Mul:
+  }
+
+  if (IterateOptimization)
+    return OptimizeExpression(I, Ops);
+  return 0;
+}
+
+
+/// ReassociateBB - Inspect all of the instructions in this basic block,
+/// reassociating them as we go.
+void Reassociate::ReassociateBB(BasicBlock *BB) {
+  for (BasicBlock::iterator BBI = BB->begin(); BBI != BB->end(); ) {
+    Instruction *BI = BBI++;
+    if (BI->getOpcode() == Instruction::Shl &&
+        isa<ConstantInt>(BI->getOperand(1)))
+      if (Instruction *NI = ConvertShiftToMul(BI, ValueRankMap)) {
+        MadeChange = true;
+        BI = NI;
+      }
+
+    // Reject cases where it is pointless to do this.
+    if (!isa<BinaryOperator>(BI) || BI->getType()->isFloatingPoint() || 
+        isa<VectorType>(BI->getType()))
+      continue;  // Floating point ops are not associative.
+
+    // If this is a subtract instruction which is not already in negate form,
+    // see if we can convert it to X+-Y.
+    if (BI->getOpcode() == Instruction::Sub) {
+      if (ShouldBreakUpSubtract(BI)) {
+        BI = BreakUpSubtract(BI, ValueRankMap);
+        MadeChange = true;
+      } else if (BinaryOperator::isNeg(BI)) {
+        // Otherwise, this is a negation.  See if the operand is a multiply tree
+        // and if this is not an inner node of a multiply tree.
+        if (isReassociableOp(BI->getOperand(1), Instruction::Mul) &&
+            (!BI->hasOneUse() ||
+             !isReassociableOp(BI->use_back(), Instruction::Mul))) {
+          BI = LowerNegateToMultiply(BI, ValueRankMap);
+          MadeChange = true;
+        }
+      }
+    }
+
+    // If this instruction is a commutative binary operator, process it.
+    if (!BI->isAssociative()) continue;
+    BinaryOperator *I = cast<BinaryOperator>(BI);
+
+    // If this is an interior node of a reassociable tree, ignore it until we
+    // get to the root of the tree, to avoid N^2 analysis.
+    if (I->hasOneUse() && isReassociableOp(I->use_back(), I->getOpcode()))
+      continue;
+
+    // If this is an add tree that is used by a sub instruction, ignore it 
+    // until we process the subtract.
+    if (I->hasOneUse() && I->getOpcode() == Instruction::Add &&
+        cast<Instruction>(I->use_back())->getOpcode() == Instruction::Sub)
+      continue;
+
+    ReassociateExpression(I);
+  }
+}
+
+void Reassociate::ReassociateExpression(BinaryOperator *I) {
+  
+  // First, walk the expression tree, linearizing the tree, collecting
+  std::vector<ValueEntry> Ops;
+  LinearizeExprTree(I, Ops);
+  
+  DOUT << "RAIn:\t"; DEBUG(PrintOps(I, Ops)); DOUT << "\n";
+  
+  // Now that we have linearized the tree to a list and have gathered all of
+  // the operands and their ranks, sort the operands by their rank.  Use a
+  // stable_sort so that values with equal ranks will have their relative
+  // positions maintained (and so the compiler is deterministic).  Note that
+  // this sorts so that the highest ranking values end up at the beginning of
+  // the vector.
+  std::stable_sort(Ops.begin(), Ops.end());
+  
+  // OptimizeExpression - Now that we have the expression tree in a convenient
+  // sorted form, optimize it globally if possible.
+  if (Value *V = OptimizeExpression(I, Ops)) {
+    // This expression tree simplified to something that isn't a tree,
+    // eliminate it.
+    DOUT << "Reassoc to scalar: " << *V << "\n";
+    I->replaceAllUsesWith(V);
+    RemoveDeadBinaryOp(I);
+    return;
+  }
+  
+  // We want to sink immediates as deeply as possible except in the case where
+  // this is a multiply tree used only by an add, and the immediate is a -1.
+  // In this case we reassociate to put the negation on the outside so that we
+  // can fold the negation into the add: (-X)*Y + Z -> Z-X*Y
+  if (I->getOpcode() == Instruction::Mul && I->hasOneUse() &&
+      cast<Instruction>(I->use_back())->getOpcode() == Instruction::Add &&
+      isa<ConstantInt>(Ops.back().Op) &&
+      cast<ConstantInt>(Ops.back().Op)->isAllOnesValue()) {
+    Ops.insert(Ops.begin(), Ops.back());
+    Ops.pop_back();
+  }
+  
+  DOUT << "RAOut:\t"; DEBUG(PrintOps(I, Ops)); DOUT << "\n";
+  
+  if (Ops.size() == 1) {
+    // This expression tree simplified to something that isn't a tree,
+    // eliminate it.
+    I->replaceAllUsesWith(Ops[0].Op);
+    RemoveDeadBinaryOp(I);
+  } else {
+    // Now that we ordered and optimized the expressions, splat them back into
+    // the expression tree, removing any unneeded nodes.
+    RewriteExprTree(I, Ops);
+  }
+}
+
+
+bool Reassociate::runOnFunction(Function &F) {
+  // Recalculate the rank map for F
+  BuildRankMap(F);
+
+  MadeChange = false;
+  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI)
+    ReassociateBB(FI);
+
+  // We are done with the rank map...
+  RankMap.clear();
+  ValueRankMap.clear();
+  return MadeChange;
+}
+
diff --git a/lib/Transforms/Scalar/Reg2Mem.cpp b/lib/Transforms/Scalar/Reg2Mem.cpp
new file mode 100644
index 0000000..46b2952
--- /dev/null
+++ b/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -0,0 +1,125 @@
+//===- Reg2Mem.cpp - Convert registers to allocas -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file demotes all registers to memory references.  It is intented to be
+// the inverse of PromoteMemoryToRegister.  By converting to loads, the only
+// values live accross basic blocks are allocas and loads before phi nodes.
+// It is intended that this should make CFG hacking much easier.
+// To make later hacking easier, the entry block is split into two, such that
+// all introduced allocas and nothing else are in the entry block.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "reg2mem"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Pass.h"
+#include "llvm/Function.h"
+#include "llvm/Module.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/Instructions.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/CFG.h"
+#include <list>
+using namespace llvm;
+
+STATISTIC(NumRegsDemoted, "Number of registers demoted");
+STATISTIC(NumPhisDemoted, "Number of phi-nodes demoted");
+
+namespace {
+  struct VISIBILITY_HIDDEN RegToMem : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    RegToMem() : FunctionPass(&ID) {}
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequiredID(BreakCriticalEdgesID);
+      AU.addPreservedID(BreakCriticalEdgesID);
+    }
+
+   bool valueEscapes(Instruction* i) {
+      BasicBlock* bb = i->getParent();
+      for (Value::use_iterator ii = i->use_begin(), ie = i->use_end();
+           ii != ie; ++ii)
+        if (cast<Instruction>(*ii)->getParent() != bb ||
+            isa<PHINode>(*ii))
+          return true;
+      return false;
+    }
+
+    virtual bool runOnFunction(Function &F) {
+      if (!F.isDeclaration()) {
+        // Insert all new allocas into entry block.
+        BasicBlock* BBEntry = &F.getEntryBlock();
+        assert(pred_begin(BBEntry) == pred_end(BBEntry) &&
+               "Entry block to function must not have predecessors!");
+
+        // Find first non-alloca instruction and create insertion point. This is
+        // safe if block is well-formed: it always have terminator, otherwise
+        // we'll get and assertion.
+        BasicBlock::iterator I = BBEntry->begin();
+        while (isa<AllocaInst>(I)) ++I;
+
+        CastInst *AllocaInsertionPoint =
+          CastInst::Create(Instruction::BitCast,
+                           Constant::getNullValue(Type::Int32Ty), Type::Int32Ty,
+                           "reg2mem alloca point", I);
+
+        // Find the escaped instructions. But don't create stack slots for
+        // allocas in entry block.
+        std::list<Instruction*> worklist;
+        for (Function::iterator ibb = F.begin(), ibe = F.end();
+             ibb != ibe; ++ibb)
+          for (BasicBlock::iterator iib = ibb->begin(), iie = ibb->end();
+               iib != iie; ++iib) {
+            if (!(isa<AllocaInst>(iib) && iib->getParent() == BBEntry) &&
+                valueEscapes(iib)) {
+              worklist.push_front(&*iib);
+            }
+          }
+
+        // Demote escaped instructions
+        NumRegsDemoted += worklist.size();
+        for (std::list<Instruction*>::iterator ilb = worklist.begin(), 
+               ile = worklist.end(); ilb != ile; ++ilb)
+          DemoteRegToStack(**ilb, false, AllocaInsertionPoint);
+
+        worklist.clear();
+
+        // Find all phi's
+        for (Function::iterator ibb = F.begin(), ibe = F.end();
+             ibb != ibe; ++ibb)
+          for (BasicBlock::iterator iib = ibb->begin(), iie = ibb->end();
+               iib != iie; ++iib)
+            if (isa<PHINode>(iib))
+              worklist.push_front(&*iib);
+
+        // Demote phi nodes
+        NumPhisDemoted += worklist.size();
+        for (std::list<Instruction*>::iterator ilb = worklist.begin(), 
+               ile = worklist.end(); ilb != ile; ++ilb)
+          DemotePHIToStack(cast<PHINode>(*ilb), AllocaInsertionPoint);
+
+        return true;
+      }
+      return false;
+    }
+  };
+}
+  
+char RegToMem::ID = 0;
+static RegisterPass<RegToMem>
+X("reg2mem", "Demote all values to stack slots");
+
+// createDemoteRegisterToMemory - Provide an entry point to create this pass.
+//
+const PassInfo *const llvm::DemoteRegisterToMemoryID = &X;
+FunctionPass *llvm::createDemoteRegisterToMemoryPass() {
+  return new RegToMem();
+}
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
new file mode 100644
index 0000000..d73519c
--- /dev/null
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -0,0 +1,1855 @@
+//===- SCCP.cpp - Sparse Conditional Constant Propagation -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements sparse conditional constant propagation and merging:
+//
+// Specifically, this:
+//   * Assumes values are constant unless proven otherwise
+//   * Assumes BasicBlocks are dead unless proven otherwise
+//   * Proves values to be constant, and replaces them with constants
+//   * Proves conditional branches to be unconditional
+//
+// Notice that:
+//   * This pass has a habit of making definitions be dead.  It is a good idea
+//     to to run a DCE pass sometime after running this pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "sccp"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/InstVisitor.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include <algorithm>
+#include <map>
+using namespace llvm;
+
+STATISTIC(NumInstRemoved, "Number of instructions removed");
+STATISTIC(NumDeadBlocks , "Number of basic blocks unreachable");
+
+STATISTIC(IPNumInstRemoved, "Number of instructions removed by IPSCCP");
+STATISTIC(IPNumDeadBlocks , "Number of basic blocks unreachable by IPSCCP");
+STATISTIC(IPNumArgsElimed ,"Number of arguments constant propagated by IPSCCP");
+STATISTIC(IPNumGlobalConst, "Number of globals found to be constant by IPSCCP");
+
+namespace {
+/// LatticeVal class - This class represents the different lattice values that
+/// an LLVM value may occupy.  It is a simple class with value semantics.
+///
+class VISIBILITY_HIDDEN LatticeVal {
+  enum {
+    /// undefined - This LLVM Value has no known value yet.
+    undefined,
+    
+    /// constant - This LLVM Value has a specific constant value.
+    constant,
+
+    /// forcedconstant - This LLVM Value was thought to be undef until
+    /// ResolvedUndefsIn.  This is treated just like 'constant', but if merged
+    /// with another (different) constant, it goes to overdefined, instead of
+    /// asserting.
+    forcedconstant,
+    
+    /// overdefined - This instruction is not known to be constant, and we know
+    /// it has a value.
+    overdefined
+  } LatticeValue;    // The current lattice position
+  
+  Constant *ConstantVal; // If Constant value, the current value
+public:
+  inline LatticeVal() : LatticeValue(undefined), ConstantVal(0) {}
+  
+  // markOverdefined - Return true if this is a new status to be in...
+  inline bool markOverdefined() {
+    if (LatticeValue != overdefined) {
+      LatticeValue = overdefined;
+      return true;
+    }
+    return false;
+  }
+
+  // markConstant - Return true if this is a new status for us.
+  inline bool markConstant(Constant *V) {
+    if (LatticeValue != constant) {
+      if (LatticeValue == undefined) {
+        LatticeValue = constant;
+        assert(V && "Marking constant with NULL");
+        ConstantVal = V;
+      } else {
+        assert(LatticeValue == forcedconstant && 
+               "Cannot move from overdefined to constant!");
+        // Stay at forcedconstant if the constant is the same.
+        if (V == ConstantVal) return false;
+        
+        // Otherwise, we go to overdefined.  Assumptions made based on the
+        // forced value are possibly wrong.  Assuming this is another constant
+        // could expose a contradiction.
+        LatticeValue = overdefined;
+      }
+      return true;
+    } else {
+      assert(ConstantVal == V && "Marking constant with different value");
+    }
+    return false;
+  }
+
+  inline void markForcedConstant(Constant *V) {
+    assert(LatticeValue == undefined && "Can't force a defined value!");
+    LatticeValue = forcedconstant;
+    ConstantVal = V;
+  }
+  
+  inline bool isUndefined() const { return LatticeValue == undefined; }
+  inline bool isConstant() const {
+    return LatticeValue == constant || LatticeValue == forcedconstant;
+  }
+  inline bool isOverdefined() const { return LatticeValue == overdefined; }
+
+  inline Constant *getConstant() const {
+    assert(isConstant() && "Cannot get the constant of a non-constant!");
+    return ConstantVal;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+//
+/// SCCPSolver - This class is a general purpose solver for Sparse Conditional
+/// Constant Propagation.
+///
+class SCCPSolver : public InstVisitor<SCCPSolver> {
+  DenseSet<BasicBlock*> BBExecutable;// The basic blocks that are executable
+  std::map<Value*, LatticeVal> ValueState;  // The state each value is in.
+
+  /// GlobalValue - If we are tracking any values for the contents of a global
+  /// variable, we keep a mapping from the constant accessor to the element of
+  /// the global, to the currently known value.  If the value becomes
+  /// overdefined, it's entry is simply removed from this map.
+  DenseMap<GlobalVariable*, LatticeVal> TrackedGlobals;
+
+  /// TrackedRetVals - If we are tracking arguments into and the return
+  /// value out of a function, it will have an entry in this map, indicating
+  /// what the known return value for the function is.
+  DenseMap<Function*, LatticeVal> TrackedRetVals;
+
+  /// TrackedMultipleRetVals - Same as TrackedRetVals, but used for functions
+  /// that return multiple values.
+  DenseMap<std::pair<Function*, unsigned>, LatticeVal> TrackedMultipleRetVals;
+
+  // The reason for two worklists is that overdefined is the lowest state
+  // on the lattice, and moving things to overdefined as fast as possible
+  // makes SCCP converge much faster.
+  // By having a separate worklist, we accomplish this because everything
+  // possibly overdefined will become overdefined at the soonest possible
+  // point.
+  SmallVector<Value*, 64> OverdefinedInstWorkList;
+  SmallVector<Value*, 64> InstWorkList;
+
+
+  SmallVector<BasicBlock*, 64>  BBWorkList;  // The BasicBlock work list
+
+  /// UsersOfOverdefinedPHIs - Keep track of any users of PHI nodes that are not
+  /// overdefined, despite the fact that the PHI node is overdefined.
+  std::multimap<PHINode*, Instruction*> UsersOfOverdefinedPHIs;
+
+  /// KnownFeasibleEdges - Entries in this set are edges which have already had
+  /// PHI nodes retriggered.
+  typedef std::pair<BasicBlock*, BasicBlock*> Edge;
+  DenseSet<Edge> KnownFeasibleEdges;
+public:
+
+  /// MarkBlockExecutable - This method can be used by clients to mark all of
+  /// the blocks that are known to be intrinsically live in the processed unit.
+  void MarkBlockExecutable(BasicBlock *BB) {
+    DOUT << "Marking Block Executable: " << BB->getNameStart() << "\n";
+    BBExecutable.insert(BB);   // Basic block is executable!
+    BBWorkList.push_back(BB);  // Add the block to the work list!
+  }
+
+  /// TrackValueOfGlobalVariable - Clients can use this method to
+  /// inform the SCCPSolver that it should track loads and stores to the
+  /// specified global variable if it can.  This is only legal to call if
+  /// performing Interprocedural SCCP.
+  void TrackValueOfGlobalVariable(GlobalVariable *GV) {
+    const Type *ElTy = GV->getType()->getElementType();
+    if (ElTy->isFirstClassType()) {
+      LatticeVal &IV = TrackedGlobals[GV];
+      if (!isa<UndefValue>(GV->getInitializer()))
+        IV.markConstant(GV->getInitializer());
+    }
+  }
+
+  /// AddTrackedFunction - If the SCCP solver is supposed to track calls into
+  /// and out of the specified function (which cannot have its address taken),
+  /// this method must be called.
+  void AddTrackedFunction(Function *F) {
+    assert(F->hasLocalLinkage() && "Can only track internal functions!");
+    // Add an entry, F -> undef.
+    if (const StructType *STy = dyn_cast<StructType>(F->getReturnType())) {
+      for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+        TrackedMultipleRetVals.insert(std::make_pair(std::make_pair(F, i),
+                                                     LatticeVal()));
+    } else
+      TrackedRetVals.insert(std::make_pair(F, LatticeVal()));
+  }
+
+  /// Solve - Solve for constants and executable blocks.
+  ///
+  void Solve();
+
+  /// ResolvedUndefsIn - While solving the dataflow for a function, we assume
+  /// that branches on undef values cannot reach any of their successors.
+  /// However, this is not a safe assumption.  After we solve dataflow, this
+  /// method should be use to handle this.  If this returns true, the solver
+  /// should be rerun.
+  bool ResolvedUndefsIn(Function &F);
+
+  bool isBlockExecutable(BasicBlock *BB) const {
+    return BBExecutable.count(BB);
+  }
+
+  /// getValueMapping - Once we have solved for constants, return the mapping of
+  /// LLVM values to LatticeVals.
+  std::map<Value*, LatticeVal> &getValueMapping() {
+    return ValueState;
+  }
+
+  /// getTrackedRetVals - Get the inferred return value map.
+  ///
+  const DenseMap<Function*, LatticeVal> &getTrackedRetVals() {
+    return TrackedRetVals;
+  }
+
+  /// getTrackedGlobals - Get and return the set of inferred initializers for
+  /// global variables.
+  const DenseMap<GlobalVariable*, LatticeVal> &getTrackedGlobals() {
+    return TrackedGlobals;
+  }
+
+  inline void markOverdefined(Value *V) {
+    markOverdefined(ValueState[V], V);
+  }
+
+private:
+  // markConstant - Make a value be marked as "constant".  If the value
+  // is not already a constant, add it to the instruction work list so that
+  // the users of the instruction are updated later.
+  //
+  inline void markConstant(LatticeVal &IV, Value *V, Constant *C) {
+    if (IV.markConstant(C)) {
+      DOUT << "markConstant: " << *C << ": " << *V;
+      InstWorkList.push_back(V);
+    }
+  }
+  
+  inline void markForcedConstant(LatticeVal &IV, Value *V, Constant *C) {
+    IV.markForcedConstant(C);
+    DOUT << "markForcedConstant: " << *C << ": " << *V;
+    InstWorkList.push_back(V);
+  }
+  
+  inline void markConstant(Value *V, Constant *C) {
+    markConstant(ValueState[V], V, C);
+  }
+
+  // markOverdefined - Make a value be marked as "overdefined". If the
+  // value is not already overdefined, add it to the overdefined instruction
+  // work list so that the users of the instruction are updated later.
+  inline void markOverdefined(LatticeVal &IV, Value *V) {
+    if (IV.markOverdefined()) {
+      DEBUG(DOUT << "markOverdefined: ";
+            if (Function *F = dyn_cast<Function>(V))
+              DOUT << "Function '" << F->getName() << "'\n";
+            else
+              DOUT << *V);
+      // Only instructions go on the work list
+      OverdefinedInstWorkList.push_back(V);
+    }
+  }
+
+  inline void mergeInValue(LatticeVal &IV, Value *V, LatticeVal &MergeWithV) {
+    if (IV.isOverdefined() || MergeWithV.isUndefined())
+      return;  // Noop.
+    if (MergeWithV.isOverdefined())
+      markOverdefined(IV, V);
+    else if (IV.isUndefined())
+      markConstant(IV, V, MergeWithV.getConstant());
+    else if (IV.getConstant() != MergeWithV.getConstant())
+      markOverdefined(IV, V);
+  }
+  
+  inline void mergeInValue(Value *V, LatticeVal &MergeWithV) {
+    return mergeInValue(ValueState[V], V, MergeWithV);
+  }
+
+
+  // getValueState - Return the LatticeVal object that corresponds to the value.
+  // This function is necessary because not all values should start out in the
+  // underdefined state... Argument's should be overdefined, and
+  // constants should be marked as constants.  If a value is not known to be an
+  // Instruction object, then use this accessor to get its value from the map.
+  //
+  inline LatticeVal &getValueState(Value *V) {
+    std::map<Value*, LatticeVal>::iterator I = ValueState.find(V);
+    if (I != ValueState.end()) return I->second;  // Common case, in the map
+
+    if (Constant *C = dyn_cast<Constant>(V)) {
+      if (isa<UndefValue>(V)) {
+        // Nothing to do, remain undefined.
+      } else {
+        LatticeVal &LV = ValueState[C];
+        LV.markConstant(C);          // Constants are constant
+        return LV;
+      }
+    }
+    // All others are underdefined by default...
+    return ValueState[V];
+  }
+
+  // markEdgeExecutable - Mark a basic block as executable, adding it to the BB
+  // work list if it is not already executable...
+  //
+  void markEdgeExecutable(BasicBlock *Source, BasicBlock *Dest) {
+    if (!KnownFeasibleEdges.insert(Edge(Source, Dest)).second)
+      return;  // This edge is already known to be executable!
+
+    if (BBExecutable.count(Dest)) {
+      DOUT << "Marking Edge Executable: " << Source->getNameStart()
+           << " -> " << Dest->getNameStart() << "\n";
+
+      // The destination is already executable, but we just made an edge
+      // feasible that wasn't before.  Revisit the PHI nodes in the block
+      // because they have potentially new operands.
+      for (BasicBlock::iterator I = Dest->begin(); isa<PHINode>(I); ++I)
+        visitPHINode(*cast<PHINode>(I));
+
+    } else {
+      MarkBlockExecutable(Dest);
+    }
+  }
+
+  // getFeasibleSuccessors - Return a vector of booleans to indicate which
+  // successors are reachable from a given terminator instruction.
+  //
+  void getFeasibleSuccessors(TerminatorInst &TI, SmallVector<bool, 16> &Succs);
+
+  // isEdgeFeasible - Return true if the control flow edge from the 'From' basic
+  // block to the 'To' basic block is currently feasible...
+  //
+  bool isEdgeFeasible(BasicBlock *From, BasicBlock *To);
+
+  // OperandChangedState - This method is invoked on all of the users of an
+  // instruction that was just changed state somehow....  Based on this
+  // information, we need to update the specified user of this instruction.
+  //
+  void OperandChangedState(User *U) {
+    // Only instructions use other variable values!
+    Instruction &I = cast<Instruction>(*U);
+    if (BBExecutable.count(I.getParent()))   // Inst is executable?
+      visit(I);
+  }
+
+private:
+  friend class InstVisitor<SCCPSolver>;
+
+  // visit implementations - Something changed in this instruction... Either an
+  // operand made a transition, or the instruction is newly executable.  Change
+  // the value type of I to reflect these changes if appropriate.
+  //
+  void visitPHINode(PHINode &I);
+
+  // Terminators
+  void visitReturnInst(ReturnInst &I);
+  void visitTerminatorInst(TerminatorInst &TI);
+
+  void visitCastInst(CastInst &I);
+  void visitSelectInst(SelectInst &I);
+  void visitBinaryOperator(Instruction &I);
+  void visitCmpInst(CmpInst &I);
+  void visitExtractElementInst(ExtractElementInst &I);
+  void visitInsertElementInst(InsertElementInst &I);
+  void visitShuffleVectorInst(ShuffleVectorInst &I);
+  void visitExtractValueInst(ExtractValueInst &EVI);
+  void visitInsertValueInst(InsertValueInst &IVI);
+
+  // Instructions that cannot be folded away...
+  void visitStoreInst     (Instruction &I);
+  void visitLoadInst      (LoadInst &I);
+  void visitGetElementPtrInst(GetElementPtrInst &I);
+  void visitCallInst      (CallInst &I) { visitCallSite(CallSite::get(&I)); }
+  void visitInvokeInst    (InvokeInst &II) {
+    visitCallSite(CallSite::get(&II));
+    visitTerminatorInst(II);
+  }
+  void visitCallSite      (CallSite CS);
+  void visitUnwindInst    (TerminatorInst &I) { /*returns void*/ }
+  void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ }
+  void visitAllocationInst(Instruction &I) { markOverdefined(&I); }
+  void visitVANextInst    (Instruction &I) { markOverdefined(&I); }
+  void visitVAArgInst     (Instruction &I) { markOverdefined(&I); }
+  void visitFreeInst      (Instruction &I) { /*returns void*/ }
+
+  void visitInstruction(Instruction &I) {
+    // If a new instruction is added to LLVM that we don't handle...
+    cerr << "SCCP: Don't know how to handle: " << I;
+    markOverdefined(&I);   // Just in case
+  }
+};
+
+} // end anonymous namespace
+
+
+// getFeasibleSuccessors - Return a vector of booleans to indicate which
+// successors are reachable from a given terminator instruction.
+//
+void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI,
+                                       SmallVector<bool, 16> &Succs) {
+  Succs.resize(TI.getNumSuccessors());
+  if (BranchInst *BI = dyn_cast<BranchInst>(&TI)) {
+    if (BI->isUnconditional()) {
+      Succs[0] = true;
+    } else {
+      LatticeVal &BCValue = getValueState(BI->getCondition());
+      if (BCValue.isOverdefined() ||
+          (BCValue.isConstant() && !isa<ConstantInt>(BCValue.getConstant()))) {
+        // Overdefined condition variables, and branches on unfoldable constant
+        // conditions, mean the branch could go either way.
+        Succs[0] = Succs[1] = true;
+      } else if (BCValue.isConstant()) {
+        // Constant condition variables mean the branch can only go a single way
+        Succs[BCValue.getConstant() == ConstantInt::getFalse()] = true;
+      }
+    }
+  } else if (isa<InvokeInst>(&TI)) {
+    // Invoke instructions successors are always executable.
+    Succs[0] = Succs[1] = true;
+  } else if (SwitchInst *SI = dyn_cast<SwitchInst>(&TI)) {
+    LatticeVal &SCValue = getValueState(SI->getCondition());
+    if (SCValue.isOverdefined() ||   // Overdefined condition?
+        (SCValue.isConstant() && !isa<ConstantInt>(SCValue.getConstant()))) {
+      // All destinations are executable!
+      Succs.assign(TI.getNumSuccessors(), true);
+    } else if (SCValue.isConstant())
+      Succs[SI->findCaseValue(cast<ConstantInt>(SCValue.getConstant()))] = true;
+  } else {
+    assert(0 && "SCCP: Don't know how to handle this terminator!");
+  }
+}
+
+
+// isEdgeFeasible - Return true if the control flow edge from the 'From' basic
+// block to the 'To' basic block is currently feasible...
+//
+bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {
+  assert(BBExecutable.count(To) && "Dest should always be alive!");
+
+  // Make sure the source basic block is executable!!
+  if (!BBExecutable.count(From)) return false;
+
+  // Check to make sure this edge itself is actually feasible now...
+  TerminatorInst *TI = From->getTerminator();
+  if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+    if (BI->isUnconditional())
+      return true;
+    else {
+      LatticeVal &BCValue = getValueState(BI->getCondition());
+      if (BCValue.isOverdefined()) {
+        // Overdefined condition variables mean the branch could go either way.
+        return true;
+      } else if (BCValue.isConstant()) {
+        // Not branching on an evaluatable constant?
+        if (!isa<ConstantInt>(BCValue.getConstant())) return true;
+
+        // Constant condition variables mean the branch can only go a single way
+        return BI->getSuccessor(BCValue.getConstant() ==
+                                       ConstantInt::getFalse()) == To;
+      }
+      return false;
+    }
+  } else if (isa<InvokeInst>(TI)) {
+    // Invoke instructions successors are always executable.
+    return true;
+  } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+    LatticeVal &SCValue = getValueState(SI->getCondition());
+    if (SCValue.isOverdefined()) {  // Overdefined condition?
+      // All destinations are executable!
+      return true;
+    } else if (SCValue.isConstant()) {
+      Constant *CPV = SCValue.getConstant();
+      if (!isa<ConstantInt>(CPV))
+        return true;  // not a foldable constant?
+
+      // Make sure to skip the "default value" which isn't a value
+      for (unsigned i = 1, E = SI->getNumSuccessors(); i != E; ++i)
+        if (SI->getSuccessorValue(i) == CPV) // Found the taken branch...
+          return SI->getSuccessor(i) == To;
+
+      // Constant value not equal to any of the branches... must execute
+      // default branch then...
+      return SI->getDefaultDest() == To;
+    }
+    return false;
+  } else {
+    cerr << "Unknown terminator instruction: " << *TI;
+    abort();
+  }
+}
+
+// visit Implementations - Something changed in this instruction... Either an
+// operand made a transition, or the instruction is newly executable.  Change
+// the value type of I to reflect these changes if appropriate.  This method
+// makes sure to do the following actions:
+//
+// 1. If a phi node merges two constants in, and has conflicting value coming
+//    from different branches, or if the PHI node merges in an overdefined
+//    value, then the PHI node becomes overdefined.
+// 2. If a phi node merges only constants in, and they all agree on value, the
+//    PHI node becomes a constant value equal to that.
+// 3. If V <- x (op) y && isConstant(x) && isConstant(y) V = Constant
+// 4. If V <- x (op) y && (isOverdefined(x) || isOverdefined(y)) V = Overdefined
+// 5. If V <- MEM or V <- CALL or V <- (unknown) then V = Overdefined
+// 6. If a conditional branch has a value that is constant, make the selected
+//    destination executable
+// 7. If a conditional branch has a value that is overdefined, make all
+//    successors executable.
+//
+void SCCPSolver::visitPHINode(PHINode &PN) {
+  LatticeVal &PNIV = getValueState(&PN);
+  if (PNIV.isOverdefined()) {
+    // There may be instructions using this PHI node that are not overdefined
+    // themselves.  If so, make sure that they know that the PHI node operand
+    // changed.
+    std::multimap<PHINode*, Instruction*>::iterator I, E;
+    tie(I, E) = UsersOfOverdefinedPHIs.equal_range(&PN);
+    if (I != E) {
+      SmallVector<Instruction*, 16> Users;
+      for (; I != E; ++I) Users.push_back(I->second);
+      while (!Users.empty()) {
+        visit(Users.back());
+        Users.pop_back();
+      }
+    }
+    return;  // Quick exit
+  }
+
+  // Super-extra-high-degree PHI nodes are unlikely to ever be marked constant,
+  // and slow us down a lot.  Just mark them overdefined.
+  if (PN.getNumIncomingValues() > 64) {
+    markOverdefined(PNIV, &PN);
+    return;
+  }
+
+  // Look at all of the executable operands of the PHI node.  If any of them
+  // are overdefined, the PHI becomes overdefined as well.  If they are all
+  // constant, and they agree with each other, the PHI becomes the identical
+  // constant.  If they are constant and don't agree, the PHI is overdefined.
+  // If there are no executable operands, the PHI remains undefined.
+  //
+  Constant *OperandVal = 0;
+  for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
+    LatticeVal &IV = getValueState(PN.getIncomingValue(i));
+    if (IV.isUndefined()) continue;  // Doesn't influence PHI node.
+
+    if (isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent())) {
+      if (IV.isOverdefined()) {   // PHI node becomes overdefined!
+        markOverdefined(&PN);
+        return;
+      }
+
+      if (OperandVal == 0) {   // Grab the first value...
+        OperandVal = IV.getConstant();
+      } else {                // Another value is being merged in!
+        // There is already a reachable operand.  If we conflict with it,
+        // then the PHI node becomes overdefined.  If we agree with it, we
+        // can continue on.
+
+        // Check to see if there are two different constants merging...
+        if (IV.getConstant() != OperandVal) {
+          // Yes there is.  This means the PHI node is not constant.
+          // You must be overdefined poor PHI.
+          //
+          markOverdefined(&PN);    // The PHI node now becomes overdefined
+          return;    // I'm done analyzing you
+        }
+      }
+    }
+  }
+
+  // If we exited the loop, this means that the PHI node only has constant
+  // arguments that agree with each other(and OperandVal is the constant) or
+  // OperandVal is null because there are no defined incoming arguments.  If
+  // this is the case, the PHI remains undefined.
+  //
+  if (OperandVal)
+    markConstant(&PN, OperandVal);      // Acquire operand value
+}
+
+void SCCPSolver::visitReturnInst(ReturnInst &I) {
+  if (I.getNumOperands() == 0) return;  // Ret void
+
+  Function *F = I.getParent()->getParent();
+  // If we are tracking the return value of this function, merge it in.
+  if (!F->hasLocalLinkage())
+    return;
+
+  if (!TrackedRetVals.empty() && I.getNumOperands() == 1) {
+    DenseMap<Function*, LatticeVal>::iterator TFRVI =
+      TrackedRetVals.find(F);
+    if (TFRVI != TrackedRetVals.end() &&
+        !TFRVI->second.isOverdefined()) {
+      LatticeVal &IV = getValueState(I.getOperand(0));
+      mergeInValue(TFRVI->second, F, IV);
+      return;
+    }
+  }
+  
+  // Handle functions that return multiple values.
+  if (!TrackedMultipleRetVals.empty() && I.getNumOperands() > 1) {
+    for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) {
+      DenseMap<std::pair<Function*, unsigned>, LatticeVal>::iterator
+        It = TrackedMultipleRetVals.find(std::make_pair(F, i));
+      if (It == TrackedMultipleRetVals.end()) break;
+      mergeInValue(It->second, F, getValueState(I.getOperand(i)));
+    }
+  } else if (!TrackedMultipleRetVals.empty() &&
+             I.getNumOperands() == 1 &&
+             isa<StructType>(I.getOperand(0)->getType())) {
+    for (unsigned i = 0, e = I.getOperand(0)->getType()->getNumContainedTypes();
+         i != e; ++i) {
+      DenseMap<std::pair<Function*, unsigned>, LatticeVal>::iterator
+        It = TrackedMultipleRetVals.find(std::make_pair(F, i));
+      if (It == TrackedMultipleRetVals.end()) break;
+      Value *Val = FindInsertedValue(I.getOperand(0), i);
+      mergeInValue(It->second, F, getValueState(Val));
+    }
+  }
+}
+
+void SCCPSolver::visitTerminatorInst(TerminatorInst &TI) {
+  SmallVector<bool, 16> SuccFeasible;
+  getFeasibleSuccessors(TI, SuccFeasible);
+
+  BasicBlock *BB = TI.getParent();
+
+  // Mark all feasible successors executable...
+  for (unsigned i = 0, e = SuccFeasible.size(); i != e; ++i)
+    if (SuccFeasible[i])
+      markEdgeExecutable(BB, TI.getSuccessor(i));
+}
+
+void SCCPSolver::visitCastInst(CastInst &I) {
+  Value *V = I.getOperand(0);
+  LatticeVal &VState = getValueState(V);
+  if (VState.isOverdefined())          // Inherit overdefinedness of operand
+    markOverdefined(&I);
+  else if (VState.isConstant())        // Propagate constant value
+    markConstant(&I, ConstantExpr::getCast(I.getOpcode(), 
+                                           VState.getConstant(), I.getType()));
+}
+
+void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
+  Value *Aggr = EVI.getAggregateOperand();
+
+  // If the operand to the extractvalue is an undef, the result is undef.
+  if (isa<UndefValue>(Aggr))
+    return;
+
+  // Currently only handle single-index extractvalues.
+  if (EVI.getNumIndices() != 1) {
+    markOverdefined(&EVI);
+    return;
+  }
+  
+  Function *F = 0;
+  if (CallInst *CI = dyn_cast<CallInst>(Aggr))
+    F = CI->getCalledFunction();
+  else if (InvokeInst *II = dyn_cast<InvokeInst>(Aggr))
+    F = II->getCalledFunction();
+
+  // TODO: If IPSCCP resolves the callee of this function, we could propagate a
+  // result back!
+  if (F == 0 || TrackedMultipleRetVals.empty()) {
+    markOverdefined(&EVI);
+    return;
+  }
+  
+  // See if we are tracking the result of the callee.  If not tracking this
+  // function (for example, it is a declaration) just move to overdefined.
+  if (!TrackedMultipleRetVals.count(std::make_pair(F, *EVI.idx_begin()))) {
+    markOverdefined(&EVI);
+    return;
+  }
+  
+  // Otherwise, the value will be merged in here as a result of CallSite
+  // handling.
+}
+
+void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) {
+  Value *Aggr = IVI.getAggregateOperand();
+  Value *Val = IVI.getInsertedValueOperand();
+
+  // If the operands to the insertvalue are undef, the result is undef.
+  if (isa<UndefValue>(Aggr) && isa<UndefValue>(Val))
+    return;
+
+  // Currently only handle single-index insertvalues.
+  if (IVI.getNumIndices() != 1) {
+    markOverdefined(&IVI);
+    return;
+  }
+
+  // Currently only handle insertvalue instructions that are in a single-use
+  // chain that builds up a return value.
+  for (const InsertValueInst *TmpIVI = &IVI; ; ) {
+    if (!TmpIVI->hasOneUse()) {
+      markOverdefined(&IVI);
+      return;
+    }
+    const Value *V = *TmpIVI->use_begin();
+    if (isa<ReturnInst>(V))
+      break;
+    TmpIVI = dyn_cast<InsertValueInst>(V);
+    if (!TmpIVI) {
+      markOverdefined(&IVI);
+      return;
+    }
+  }
+  
+  // See if we are tracking the result of the callee.
+  Function *F = IVI.getParent()->getParent();
+  DenseMap<std::pair<Function*, unsigned>, LatticeVal>::iterator
+    It = TrackedMultipleRetVals.find(std::make_pair(F, *IVI.idx_begin()));
+
+  // Merge in the inserted member value.
+  if (It != TrackedMultipleRetVals.end())
+    mergeInValue(It->second, F, getValueState(Val));
+
+  // Mark the aggregate result of the IVI overdefined; any tracking that we do
+  // will be done on the individual member values.
+  markOverdefined(&IVI);
+}
+
+void SCCPSolver::visitSelectInst(SelectInst &I) {
+  LatticeVal &CondValue = getValueState(I.getCondition());
+  if (CondValue.isUndefined())
+    return;
+  if (CondValue.isConstant()) {
+    if (ConstantInt *CondCB = dyn_cast<ConstantInt>(CondValue.getConstant())){
+      mergeInValue(&I, getValueState(CondCB->getZExtValue() ? I.getTrueValue()
+                                                          : I.getFalseValue()));
+      return;
+    }
+  }
+  
+  // Otherwise, the condition is overdefined or a constant we can't evaluate.
+  // See if we can produce something better than overdefined based on the T/F
+  // value.
+  LatticeVal &TVal = getValueState(I.getTrueValue());
+  LatticeVal &FVal = getValueState(I.getFalseValue());
+  
+  // select ?, C, C -> C.
+  if (TVal.isConstant() && FVal.isConstant() && 
+      TVal.getConstant() == FVal.getConstant()) {
+    markConstant(&I, FVal.getConstant());
+    return;
+  }
+
+  if (TVal.isUndefined()) {  // select ?, undef, X -> X.
+    mergeInValue(&I, FVal);
+  } else if (FVal.isUndefined()) {  // select ?, X, undef -> X.
+    mergeInValue(&I, TVal);
+  } else {
+    markOverdefined(&I);
+  }
+}
+
+// Handle BinaryOperators and Shift Instructions...
+void SCCPSolver::visitBinaryOperator(Instruction &I) {
+  LatticeVal &IV = ValueState[&I];
+  if (IV.isOverdefined()) return;
+
+  LatticeVal &V1State = getValueState(I.getOperand(0));
+  LatticeVal &V2State = getValueState(I.getOperand(1));
+
+  if (V1State.isOverdefined() || V2State.isOverdefined()) {
+    // If this is an AND or OR with 0 or -1, it doesn't matter that the other
+    // operand is overdefined.
+    if (I.getOpcode() == Instruction::And || I.getOpcode() == Instruction::Or) {
+      LatticeVal *NonOverdefVal = 0;
+      if (!V1State.isOverdefined()) {
+        NonOverdefVal = &V1State;
+      } else if (!V2State.isOverdefined()) {
+        NonOverdefVal = &V2State;
+      }
+
+      if (NonOverdefVal) {
+        if (NonOverdefVal->isUndefined()) {
+          // Could annihilate value.
+          if (I.getOpcode() == Instruction::And)
+            markConstant(IV, &I, Constant::getNullValue(I.getType()));
+          else if (const VectorType *PT = dyn_cast<VectorType>(I.getType()))
+            markConstant(IV, &I, ConstantVector::getAllOnesValue(PT));
+          else
+            markConstant(IV, &I, ConstantInt::getAllOnesValue(I.getType()));
+          return;
+        } else {
+          if (I.getOpcode() == Instruction::And) {
+            if (NonOverdefVal->getConstant()->isNullValue()) {
+              markConstant(IV, &I, NonOverdefVal->getConstant());
+              return;      // X and 0 = 0
+            }
+          } else {
+            if (ConstantInt *CI =
+                     dyn_cast<ConstantInt>(NonOverdefVal->getConstant()))
+              if (CI->isAllOnesValue()) {
+                markConstant(IV, &I, NonOverdefVal->getConstant());
+                return;    // X or -1 = -1
+              }
+          }
+        }
+      }
+    }
+
+
+    // If both operands are PHI nodes, it is possible that this instruction has
+    // a constant value, despite the fact that the PHI node doesn't.  Check for
+    // this condition now.
+    if (PHINode *PN1 = dyn_cast<PHINode>(I.getOperand(0)))
+      if (PHINode *PN2 = dyn_cast<PHINode>(I.getOperand(1)))
+        if (PN1->getParent() == PN2->getParent()) {
+          // Since the two PHI nodes are in the same basic block, they must have
+          // entries for the same predecessors.  Walk the predecessor list, and
+          // if all of the incoming values are constants, and the result of
+          // evaluating this expression with all incoming value pairs is the
+          // same, then this expression is a constant even though the PHI node
+          // is not a constant!
+          LatticeVal Result;
+          for (unsigned i = 0, e = PN1->getNumIncomingValues(); i != e; ++i) {
+            LatticeVal &In1 = getValueState(PN1->getIncomingValue(i));
+            BasicBlock *InBlock = PN1->getIncomingBlock(i);
+            LatticeVal &In2 =
+              getValueState(PN2->getIncomingValueForBlock(InBlock));
+
+            if (In1.isOverdefined() || In2.isOverdefined()) {
+              Result.markOverdefined();
+              break;  // Cannot fold this operation over the PHI nodes!
+            } else if (In1.isConstant() && In2.isConstant()) {
+              Constant *V = ConstantExpr::get(I.getOpcode(), In1.getConstant(),
+                                              In2.getConstant());
+              if (Result.isUndefined())
+                Result.markConstant(V);
+              else if (Result.isConstant() && Result.getConstant() != V) {
+                Result.markOverdefined();
+                break;
+              }
+            }
+          }
+
+          // If we found a constant value here, then we know the instruction is
+          // constant despite the fact that the PHI nodes are overdefined.
+          if (Result.isConstant()) {
+            markConstant(IV, &I, Result.getConstant());
+            // Remember that this instruction is virtually using the PHI node
+            // operands.
+            UsersOfOverdefinedPHIs.insert(std::make_pair(PN1, &I));
+            UsersOfOverdefinedPHIs.insert(std::make_pair(PN2, &I));
+            return;
+          } else if (Result.isUndefined()) {
+            return;
+          }
+
+          // Okay, this really is overdefined now.  Since we might have
+          // speculatively thought that this was not overdefined before, and
+          // added ourselves to the UsersOfOverdefinedPHIs list for the PHIs,
+          // make sure to clean out any entries that we put there, for
+          // efficiency.
+          std::multimap<PHINode*, Instruction*>::iterator It, E;
+          tie(It, E) = UsersOfOverdefinedPHIs.equal_range(PN1);
+          while (It != E) {
+            if (It->second == &I) {
+              UsersOfOverdefinedPHIs.erase(It++);
+            } else
+              ++It;
+          }
+          tie(It, E) = UsersOfOverdefinedPHIs.equal_range(PN2);
+          while (It != E) {
+            if (It->second == &I) {
+              UsersOfOverdefinedPHIs.erase(It++);
+            } else
+              ++It;
+          }
+        }
+
+    markOverdefined(IV, &I);
+  } else if (V1State.isConstant() && V2State.isConstant()) {
+    markConstant(IV, &I, ConstantExpr::get(I.getOpcode(), V1State.getConstant(),
+                                           V2State.getConstant()));
+  }
+}
+
+// Handle ICmpInst instruction...
+void SCCPSolver::visitCmpInst(CmpInst &I) {
+  LatticeVal &IV = ValueState[&I];
+  if (IV.isOverdefined()) return;
+
+  LatticeVal &V1State = getValueState(I.getOperand(0));
+  LatticeVal &V2State = getValueState(I.getOperand(1));
+
+  if (V1State.isOverdefined() || V2State.isOverdefined()) {
+    // If both operands are PHI nodes, it is possible that this instruction has
+    // a constant value, despite the fact that the PHI node doesn't.  Check for
+    // this condition now.
+    if (PHINode *PN1 = dyn_cast<PHINode>(I.getOperand(0)))
+      if (PHINode *PN2 = dyn_cast<PHINode>(I.getOperand(1)))
+        if (PN1->getParent() == PN2->getParent()) {
+          // Since the two PHI nodes are in the same basic block, they must have
+          // entries for the same predecessors.  Walk the predecessor list, and
+          // if all of the incoming values are constants, and the result of
+          // evaluating this expression with all incoming value pairs is the
+          // same, then this expression is a constant even though the PHI node
+          // is not a constant!
+          LatticeVal Result;
+          for (unsigned i = 0, e = PN1->getNumIncomingValues(); i != e; ++i) {
+            LatticeVal &In1 = getValueState(PN1->getIncomingValue(i));
+            BasicBlock *InBlock = PN1->getIncomingBlock(i);
+            LatticeVal &In2 =
+              getValueState(PN2->getIncomingValueForBlock(InBlock));
+
+            if (In1.isOverdefined() || In2.isOverdefined()) {
+              Result.markOverdefined();
+              break;  // Cannot fold this operation over the PHI nodes!
+            } else if (In1.isConstant() && In2.isConstant()) {
+              Constant *V = ConstantExpr::getCompare(I.getPredicate(), 
+                                                     In1.getConstant(), 
+                                                     In2.getConstant());
+              if (Result.isUndefined())
+                Result.markConstant(V);
+              else if (Result.isConstant() && Result.getConstant() != V) {
+                Result.markOverdefined();
+                break;
+              }
+            }
+          }
+
+          // If we found a constant value here, then we know the instruction is
+          // constant despite the fact that the PHI nodes are overdefined.
+          if (Result.isConstant()) {
+            markConstant(IV, &I, Result.getConstant());
+            // Remember that this instruction is virtually using the PHI node
+            // operands.
+            UsersOfOverdefinedPHIs.insert(std::make_pair(PN1, &I));
+            UsersOfOverdefinedPHIs.insert(std::make_pair(PN2, &I));
+            return;
+          } else if (Result.isUndefined()) {
+            return;
+          }
+
+          // Okay, this really is overdefined now.  Since we might have
+          // speculatively thought that this was not overdefined before, and
+          // added ourselves to the UsersOfOverdefinedPHIs list for the PHIs,
+          // make sure to clean out any entries that we put there, for
+          // efficiency.
+          std::multimap<PHINode*, Instruction*>::iterator It, E;
+          tie(It, E) = UsersOfOverdefinedPHIs.equal_range(PN1);
+          while (It != E) {
+            if (It->second == &I) {
+              UsersOfOverdefinedPHIs.erase(It++);
+            } else
+              ++It;
+          }
+          tie(It, E) = UsersOfOverdefinedPHIs.equal_range(PN2);
+          while (It != E) {
+            if (It->second == &I) {
+              UsersOfOverdefinedPHIs.erase(It++);
+            } else
+              ++It;
+          }
+        }
+
+    markOverdefined(IV, &I);
+  } else if (V1State.isConstant() && V2State.isConstant()) {
+    markConstant(IV, &I, ConstantExpr::getCompare(I.getPredicate(), 
+                                                  V1State.getConstant(), 
+                                                  V2State.getConstant()));
+  }
+}
+
+void SCCPSolver::visitExtractElementInst(ExtractElementInst &I) {
+  // FIXME : SCCP does not handle vectors properly.
+  markOverdefined(&I);
+  return;
+
+#if 0
+  LatticeVal &ValState = getValueState(I.getOperand(0));
+  LatticeVal &IdxState = getValueState(I.getOperand(1));
+
+  if (ValState.isOverdefined() || IdxState.isOverdefined())
+    markOverdefined(&I);
+  else if(ValState.isConstant() && IdxState.isConstant())
+    markConstant(&I, ConstantExpr::getExtractElement(ValState.getConstant(),
+                                                     IdxState.getConstant()));
+#endif
+}
+
+void SCCPSolver::visitInsertElementInst(InsertElementInst &I) {
+  // FIXME : SCCP does not handle vectors properly.
+  markOverdefined(&I);
+  return;
+#if 0
+  LatticeVal &ValState = getValueState(I.getOperand(0));
+  LatticeVal &EltState = getValueState(I.getOperand(1));
+  LatticeVal &IdxState = getValueState(I.getOperand(2));
+
+  if (ValState.isOverdefined() || EltState.isOverdefined() ||
+      IdxState.isOverdefined())
+    markOverdefined(&I);
+  else if(ValState.isConstant() && EltState.isConstant() &&
+          IdxState.isConstant())
+    markConstant(&I, ConstantExpr::getInsertElement(ValState.getConstant(),
+                                                    EltState.getConstant(),
+                                                    IdxState.getConstant()));
+  else if (ValState.isUndefined() && EltState.isConstant() &&
+           IdxState.isConstant()) 
+    markConstant(&I,ConstantExpr::getInsertElement(UndefValue::get(I.getType()),
+                                                   EltState.getConstant(),
+                                                   IdxState.getConstant()));
+#endif
+}
+
+void SCCPSolver::visitShuffleVectorInst(ShuffleVectorInst &I) {
+  // FIXME : SCCP does not handle vectors properly.
+  markOverdefined(&I);
+  return;
+#if 0
+  LatticeVal &V1State   = getValueState(I.getOperand(0));
+  LatticeVal &V2State   = getValueState(I.getOperand(1));
+  LatticeVal &MaskState = getValueState(I.getOperand(2));
+
+  if (MaskState.isUndefined() ||
+      (V1State.isUndefined() && V2State.isUndefined()))
+    return;  // Undefined output if mask or both inputs undefined.
+  
+  if (V1State.isOverdefined() || V2State.isOverdefined() ||
+      MaskState.isOverdefined()) {
+    markOverdefined(&I);
+  } else {
+    // A mix of constant/undef inputs.
+    Constant *V1 = V1State.isConstant() ? 
+        V1State.getConstant() : UndefValue::get(I.getType());
+    Constant *V2 = V2State.isConstant() ? 
+        V2State.getConstant() : UndefValue::get(I.getType());
+    Constant *Mask = MaskState.isConstant() ? 
+      MaskState.getConstant() : UndefValue::get(I.getOperand(2)->getType());
+    markConstant(&I, ConstantExpr::getShuffleVector(V1, V2, Mask));
+  }
+#endif
+}
+
+// Handle getelementptr instructions... if all operands are constants then we
+// can turn this into a getelementptr ConstantExpr.
+//
+void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) {
+  LatticeVal &IV = ValueState[&I];
+  if (IV.isOverdefined()) return;
+
+  SmallVector<Constant*, 8> Operands;
+  Operands.reserve(I.getNumOperands());
+
+  for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) {
+    LatticeVal &State = getValueState(I.getOperand(i));
+    if (State.isUndefined())
+      return;  // Operands are not resolved yet...
+    else if (State.isOverdefined()) {
+      markOverdefined(IV, &I);
+      return;
+    }
+    assert(State.isConstant() && "Unknown state!");
+    Operands.push_back(State.getConstant());
+  }
+
+  Constant *Ptr = Operands[0];
+  Operands.erase(Operands.begin());  // Erase the pointer from idx list...
+
+  markConstant(IV, &I, ConstantExpr::getGetElementPtr(Ptr, &Operands[0],
+                                                      Operands.size()));
+}
+
+void SCCPSolver::visitStoreInst(Instruction &SI) {
+  if (TrackedGlobals.empty() || !isa<GlobalVariable>(SI.getOperand(1)))
+    return;
+  GlobalVariable *GV = cast<GlobalVariable>(SI.getOperand(1));
+  DenseMap<GlobalVariable*, LatticeVal>::iterator I = TrackedGlobals.find(GV);
+  if (I == TrackedGlobals.end() || I->second.isOverdefined()) return;
+
+  // Get the value we are storing into the global.
+  LatticeVal &PtrVal = getValueState(SI.getOperand(0));
+
+  mergeInValue(I->second, GV, PtrVal);
+  if (I->second.isOverdefined())
+    TrackedGlobals.erase(I);      // No need to keep tracking this!
+}
+
+
+// Handle load instructions.  If the operand is a constant pointer to a constant
+// global, we can replace the load with the loaded constant value!
+void SCCPSolver::visitLoadInst(LoadInst &I) {
+  LatticeVal &IV = ValueState[&I];
+  if (IV.isOverdefined()) return;
+
+  LatticeVal &PtrVal = getValueState(I.getOperand(0));
+  if (PtrVal.isUndefined()) return;   // The pointer is not resolved yet!
+  if (PtrVal.isConstant() && !I.isVolatile()) {
+    Value *Ptr = PtrVal.getConstant();
+    // TODO: Consider a target hook for valid address spaces for this xform.
+    if (isa<ConstantPointerNull>(Ptr) && 
+        cast<PointerType>(Ptr->getType())->getAddressSpace() == 0) {
+      // load null -> null
+      markConstant(IV, &I, Constant::getNullValue(I.getType()));
+      return;
+    }
+
+    // Transform load (constant global) into the value loaded.
+    if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) {
+      if (GV->isConstant()) {
+        if (GV->hasDefinitiveInitializer()) {
+          markConstant(IV, &I, GV->getInitializer());
+          return;
+        }
+      } else if (!TrackedGlobals.empty()) {
+        // If we are tracking this global, merge in the known value for it.
+        DenseMap<GlobalVariable*, LatticeVal>::iterator It =
+          TrackedGlobals.find(GV);
+        if (It != TrackedGlobals.end()) {
+          mergeInValue(IV, &I, It->second);
+          return;
+        }
+      }
+    }
+
+    // Transform load (constantexpr_GEP global, 0, ...) into the value loaded.
+    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
+      if (CE->getOpcode() == Instruction::GetElementPtr)
+    if (GlobalVariable *GV = dyn_cast<GlobalVariable>(CE->getOperand(0)))
+      if (GV->isConstant() && GV->hasDefinitiveInitializer())
+        if (Constant *V =
+             ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE)) {
+          markConstant(IV, &I, V);
+          return;
+        }
+  }
+
+  // Otherwise we cannot say for certain what value this load will produce.
+  // Bail out.
+  markOverdefined(IV, &I);
+}
+
+void SCCPSolver::visitCallSite(CallSite CS) {
+  Function *F = CS.getCalledFunction();
+  Instruction *I = CS.getInstruction();
+  
+  // The common case is that we aren't tracking the callee, either because we
+  // are not doing interprocedural analysis or the callee is indirect, or is
+  // external.  Handle these cases first.
+  if (F == 0 || !F->hasLocalLinkage()) {
+CallOverdefined:
+    // Void return and not tracking callee, just bail.
+    if (I->getType() == Type::VoidTy) return;
+    
+    // Otherwise, if we have a single return value case, and if the function is
+    // a declaration, maybe we can constant fold it.
+    if (!isa<StructType>(I->getType()) && F && F->isDeclaration() && 
+        canConstantFoldCallTo(F)) {
+      
+      SmallVector<Constant*, 8> Operands;
+      for (CallSite::arg_iterator AI = CS.arg_begin(), E = CS.arg_end();
+           AI != E; ++AI) {
+        LatticeVal &State = getValueState(*AI);
+        if (State.isUndefined())
+          return;  // Operands are not resolved yet.
+        else if (State.isOverdefined()) {
+          markOverdefined(I);
+          return;
+        }
+        assert(State.isConstant() && "Unknown state!");
+        Operands.push_back(State.getConstant());
+      }
+     
+      // If we can constant fold this, mark the result of the call as a
+      // constant.
+      if (Constant *C = ConstantFoldCall(F, Operands.data(), Operands.size())) {
+        markConstant(I, C);
+        return;
+      }
+    }
+
+    // Otherwise, we don't know anything about this call, mark it overdefined.
+    markOverdefined(I);
+    return;
+  }
+
+  // If this is a single/zero retval case, see if we're tracking the function.
+  DenseMap<Function*, LatticeVal>::iterator TFRVI = TrackedRetVals.find(F);
+  if (TFRVI != TrackedRetVals.end()) {
+    // If so, propagate the return value of the callee into this call result.
+    mergeInValue(I, TFRVI->second);
+  } else if (isa<StructType>(I->getType())) {
+    // Check to see if we're tracking this callee, if not, handle it in the
+    // common path above.
+    DenseMap<std::pair<Function*, unsigned>, LatticeVal>::iterator
+    TMRVI = TrackedMultipleRetVals.find(std::make_pair(F, 0));
+    if (TMRVI == TrackedMultipleRetVals.end())
+      goto CallOverdefined;
+    
+    // If we are tracking this callee, propagate the return values of the call
+    // into this call site.  We do this by walking all the uses. Single-index
+    // ExtractValueInst uses can be tracked; anything more complicated is
+    // currently handled conservatively.
+    for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+         UI != E; ++UI) {
+      if (ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(*UI)) {
+        if (EVI->getNumIndices() == 1) {
+          mergeInValue(EVI, 
+                  TrackedMultipleRetVals[std::make_pair(F, *EVI->idx_begin())]);
+          continue;
+        }
+      }
+      // The aggregate value is used in a way not handled here. Assume nothing.
+      markOverdefined(*UI);
+    }
+  } else {
+    // Otherwise we're not tracking this callee, so handle it in the
+    // common path above.
+    goto CallOverdefined;
+  }
+   
+  // Finally, if this is the first call to the function hit, mark its entry
+  // block executable.
+  if (!BBExecutable.count(F->begin()))
+    MarkBlockExecutable(F->begin());
+  
+  // Propagate information from this call site into the callee.
+  CallSite::arg_iterator CAI = CS.arg_begin();
+  for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
+       AI != E; ++AI, ++CAI) {
+    LatticeVal &IV = ValueState[AI];
+    if (!IV.isOverdefined())
+      mergeInValue(IV, AI, getValueState(*CAI));
+  }
+}
+
+
+void SCCPSolver::Solve() {
+  // Process the work lists until they are empty!
+  while (!BBWorkList.empty() || !InstWorkList.empty() ||
+         !OverdefinedInstWorkList.empty()) {
+    // Process the instruction work list...
+    while (!OverdefinedInstWorkList.empty()) {
+      Value *I = OverdefinedInstWorkList.back();
+      OverdefinedInstWorkList.pop_back();
+
+      DOUT << "\nPopped off OI-WL: " << *I;
+
+      // "I" got into the work list because it either made the transition from
+      // bottom to constant
+      //
+      // Anything on this worklist that is overdefined need not be visited
+      // since all of its users will have already been marked as overdefined
+      // Update all of the users of this instruction's value...
+      //
+      for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+           UI != E; ++UI)
+        OperandChangedState(*UI);
+    }
+    // Process the instruction work list...
+    while (!InstWorkList.empty()) {
+      Value *I = InstWorkList.back();
+      InstWorkList.pop_back();
+
+      DOUT << "\nPopped off I-WL: " << *I;
+
+      // "I" got into the work list because it either made the transition from
+      // bottom to constant
+      //
+      // Anything on this worklist that is overdefined need not be visited
+      // since all of its users will have already been marked as overdefined.
+      // Update all of the users of this instruction's value...
+      //
+      if (!getValueState(I).isOverdefined())
+        for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+             UI != E; ++UI)
+          OperandChangedState(*UI);
+    }
+
+    // Process the basic block work list...
+    while (!BBWorkList.empty()) {
+      BasicBlock *BB = BBWorkList.back();
+      BBWorkList.pop_back();
+
+      DOUT << "\nPopped off BBWL: " << *BB;
+
+      // Notify all instructions in this basic block that they are newly
+      // executable.
+      visit(BB);
+    }
+  }
+}
+
+/// ResolvedUndefsIn - While solving the dataflow for a function, we assume
+/// that branches on undef values cannot reach any of their successors.
+/// However, this is not a safe assumption.  After we solve dataflow, this
+/// method should be use to handle this.  If this returns true, the solver
+/// should be rerun.
+///
+/// This method handles this by finding an unresolved branch and marking it one
+/// of the edges from the block as being feasible, even though the condition
+/// doesn't say it would otherwise be.  This allows SCCP to find the rest of the
+/// CFG and only slightly pessimizes the analysis results (by marking one,
+/// potentially infeasible, edge feasible).  This cannot usefully modify the
+/// constraints on the condition of the branch, as that would impact other users
+/// of the value.
+///
+/// This scan also checks for values that use undefs, whose results are actually
+/// defined.  For example, 'zext i8 undef to i32' should produce all zeros
+/// conservatively, as "(zext i8 X -> i32) & 0xFF00" must always return zero,
+/// even if X isn't defined.
+bool SCCPSolver::ResolvedUndefsIn(Function &F) {
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+    if (!BBExecutable.count(BB))
+      continue;
+    
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+      // Look for instructions which produce undef values.
+      if (I->getType() == Type::VoidTy) continue;
+      
+      LatticeVal &LV = getValueState(I);
+      if (!LV.isUndefined()) continue;
+
+      // Get the lattice values of the first two operands for use below.
+      LatticeVal &Op0LV = getValueState(I->getOperand(0));
+      LatticeVal Op1LV;
+      if (I->getNumOperands() == 2) {
+        // If this is a two-operand instruction, and if both operands are
+        // undefs, the result stays undef.
+        Op1LV = getValueState(I->getOperand(1));
+        if (Op0LV.isUndefined() && Op1LV.isUndefined())
+          continue;
+      }
+      
+      // If this is an instructions whose result is defined even if the input is
+      // not fully defined, propagate the information.
+      const Type *ITy = I->getType();
+      switch (I->getOpcode()) {
+      default: break;          // Leave the instruction as an undef.
+      case Instruction::ZExt:
+        // After a zero extend, we know the top part is zero.  SExt doesn't have
+        // to be handled here, because we don't know whether the top part is 1's
+        // or 0's.
+        assert(Op0LV.isUndefined());
+        markForcedConstant(LV, I, Constant::getNullValue(ITy));
+        return true;
+      case Instruction::Mul:
+      case Instruction::And:
+        // undef * X -> 0.   X could be zero.
+        // undef & X -> 0.   X could be zero.
+        markForcedConstant(LV, I, Constant::getNullValue(ITy));
+        return true;
+
+      case Instruction::Or:
+        // undef | X -> -1.   X could be -1.
+        if (const VectorType *PTy = dyn_cast<VectorType>(ITy))
+          markForcedConstant(LV, I, ConstantVector::getAllOnesValue(PTy));
+        else          
+          markForcedConstant(LV, I, ConstantInt::getAllOnesValue(ITy));
+        return true;
+
+      case Instruction::SDiv:
+      case Instruction::UDiv:
+      case Instruction::SRem:
+      case Instruction::URem:
+        // X / undef -> undef.  No change.
+        // X % undef -> undef.  No change.
+        if (Op1LV.isUndefined()) break;
+        
+        // undef / X -> 0.   X could be maxint.
+        // undef % X -> 0.   X could be 1.
+        markForcedConstant(LV, I, Constant::getNullValue(ITy));
+        return true;
+        
+      case Instruction::AShr:
+        // undef >>s X -> undef.  No change.
+        if (Op0LV.isUndefined()) break;
+        
+        // X >>s undef -> X.  X could be 0, X could have the high-bit known set.
+        if (Op0LV.isConstant())
+          markForcedConstant(LV, I, Op0LV.getConstant());
+        else
+          markOverdefined(LV, I);
+        return true;
+      case Instruction::LShr:
+      case Instruction::Shl:
+        // undef >> X -> undef.  No change.
+        // undef << X -> undef.  No change.
+        if (Op0LV.isUndefined()) break;
+        
+        // X >> undef -> 0.  X could be 0.
+        // X << undef -> 0.  X could be 0.
+        markForcedConstant(LV, I, Constant::getNullValue(ITy));
+        return true;
+      case Instruction::Select:
+        // undef ? X : Y  -> X or Y.  There could be commonality between X/Y.
+        if (Op0LV.isUndefined()) {
+          if (!Op1LV.isConstant())  // Pick the constant one if there is any.
+            Op1LV = getValueState(I->getOperand(2));
+        } else if (Op1LV.isUndefined()) {
+          // c ? undef : undef -> undef.  No change.
+          Op1LV = getValueState(I->getOperand(2));
+          if (Op1LV.isUndefined())
+            break;
+          // Otherwise, c ? undef : x -> x.
+        } else {
+          // Leave Op1LV as Operand(1)'s LatticeValue.
+        }
+        
+        if (Op1LV.isConstant())
+          markForcedConstant(LV, I, Op1LV.getConstant());
+        else
+          markOverdefined(LV, I);
+        return true;
+      case Instruction::Call:
+        // If a call has an undef result, it is because it is constant foldable
+        // but one of the inputs was undef.  Just force the result to
+        // overdefined.
+        markOverdefined(LV, I);
+        return true;
+      }
+    }
+  
+    TerminatorInst *TI = BB->getTerminator();
+    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+      if (!BI->isConditional()) continue;
+      if (!getValueState(BI->getCondition()).isUndefined())
+        continue;
+    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+      if (SI->getNumSuccessors()<2)   // no cases
+        continue;
+      if (!getValueState(SI->getCondition()).isUndefined())
+        continue;
+    } else {
+      continue;
+    }
+    
+    // If the edge to the second successor isn't thought to be feasible yet,
+    // mark it so now.  We pick the second one so that this goes to some
+    // enumerated value in a switch instead of going to the default destination.
+    if (KnownFeasibleEdges.count(Edge(BB, TI->getSuccessor(1))))
+      continue;
+    
+    // Otherwise, it isn't already thought to be feasible.  Mark it as such now
+    // and return.  This will make other blocks reachable, which will allow new
+    // values to be discovered and existing ones to be moved in the lattice.
+    markEdgeExecutable(BB, TI->getSuccessor(1));
+    
+    // This must be a conditional branch of switch on undef.  At this point,
+    // force the old terminator to branch to the first successor.  This is
+    // required because we are now influencing the dataflow of the function with
+    // the assumption that this edge is taken.  If we leave the branch condition
+    // as undef, then further analysis could think the undef went another way
+    // leading to an inconsistent set of conclusions.
+    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+      BI->setCondition(ConstantInt::getFalse());
+    } else {
+      SwitchInst *SI = cast<SwitchInst>(TI);
+      SI->setCondition(SI->getCaseValue(1));
+    }
+    
+    return true;
+  }
+
+  return false;
+}
+
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  //
+  /// SCCP Class - This class uses the SCCPSolver to implement a per-function
+  /// Sparse Conditional Constant Propagator.
+  ///
+  struct VISIBILITY_HIDDEN SCCP : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    SCCP() : FunctionPass(&ID) {}
+
+    // runOnFunction - Run the Sparse Conditional Constant Propagation
+    // algorithm, and return true if the function was modified.
+    //
+    bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+    }
+  };
+} // end anonymous namespace
+
+char SCCP::ID = 0;
+static RegisterPass<SCCP>
+X("sccp", "Sparse Conditional Constant Propagation");
+
+// createSCCPPass - This is the public interface to this file...
+FunctionPass *llvm::createSCCPPass() {
+  return new SCCP();
+}
+
+
+// runOnFunction() - Run the Sparse Conditional Constant Propagation algorithm,
+// and return true if the function was modified.
+//
+bool SCCP::runOnFunction(Function &F) {
+  DOUT << "SCCP on function '" << F.getNameStart() << "'\n";
+  SCCPSolver Solver;
+
+  // Mark the first block of the function as being executable.
+  Solver.MarkBlockExecutable(F.begin());
+
+  // Mark all arguments to the function as being overdefined.
+  for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E;++AI)
+    Solver.markOverdefined(AI);
+
+  // Solve for constants.
+  bool ResolvedUndefs = true;
+  while (ResolvedUndefs) {
+    Solver.Solve();
+    DOUT << "RESOLVING UNDEFs\n";
+    ResolvedUndefs = Solver.ResolvedUndefsIn(F);
+  }
+
+  bool MadeChanges = false;
+
+  // If we decided that there are basic blocks that are dead in this function,
+  // delete their contents now.  Note that we cannot actually delete the blocks,
+  // as we cannot modify the CFG of the function.
+  //
+  SmallVector<Instruction*, 512> Insts;
+  std::map<Value*, LatticeVal> &Values = Solver.getValueMapping();
+
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+    if (!Solver.isBlockExecutable(BB)) {
+      DOUT << "  BasicBlock Dead:" << *BB;
+      ++NumDeadBlocks;
+
+      // Delete the instructions backwards, as it has a reduced likelihood of
+      // having to update as many def-use and use-def chains.
+      for (BasicBlock::iterator I = BB->begin(), E = BB->getTerminator();
+           I != E; ++I)
+        Insts.push_back(I);
+      while (!Insts.empty()) {
+        Instruction *I = Insts.back();
+        Insts.pop_back();
+        if (!I->use_empty())
+          I->replaceAllUsesWith(UndefValue::get(I->getType()));
+        BB->getInstList().erase(I);
+        MadeChanges = true;
+        ++NumInstRemoved;
+      }
+    } else {
+      // Iterate over all of the instructions in a function, replacing them with
+      // constants if we have found them to be of constant values.
+      //
+      for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) {
+        Instruction *Inst = BI++;
+        if (Inst->getType() == Type::VoidTy ||
+            isa<TerminatorInst>(Inst))
+          continue;
+        
+        LatticeVal &IV = Values[Inst];
+        if (!IV.isConstant() && !IV.isUndefined())
+          continue;
+        
+        Constant *Const = IV.isConstant()
+          ? IV.getConstant() : UndefValue::get(Inst->getType());
+        DOUT << "  Constant: " << *Const << " = " << *Inst;
+
+        // Replaces all of the uses of a variable with uses of the constant.
+        Inst->replaceAllUsesWith(Const);
+        
+        // Delete the instruction.
+        Inst->eraseFromParent();
+        
+        // Hey, we just changed something!
+        MadeChanges = true;
+        ++NumInstRemoved;
+      }
+    }
+
+  return MadeChanges;
+}
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  //
+  /// IPSCCP Class - This class implements interprocedural Sparse Conditional
+  /// Constant Propagation.
+  ///
+  struct VISIBILITY_HIDDEN IPSCCP : public ModulePass {
+    static char ID;
+    IPSCCP() : ModulePass(&ID) {}
+    bool runOnModule(Module &M);
+  };
+} // end anonymous namespace
+
+char IPSCCP::ID = 0;
+static RegisterPass<IPSCCP>
+Y("ipsccp", "Interprocedural Sparse Conditional Constant Propagation");
+
+// createIPSCCPPass - This is the public interface to this file...
+ModulePass *llvm::createIPSCCPPass() {
+  return new IPSCCP();
+}
+
+
+static bool AddressIsTaken(GlobalValue *GV) {
+  // Delete any dead constantexpr klingons.
+  GV->removeDeadConstantUsers();
+
+  for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end();
+       UI != E; ++UI)
+    if (StoreInst *SI = dyn_cast<StoreInst>(*UI)) {
+      if (SI->getOperand(0) == GV || SI->isVolatile())
+        return true;  // Storing addr of GV.
+    } else if (isa<InvokeInst>(*UI) || isa<CallInst>(*UI)) {
+      // Make sure we are calling the function, not passing the address.
+      CallSite CS = CallSite::get(cast<Instruction>(*UI));
+      if (CS.hasArgument(GV))
+        return true;
+    } else if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) {
+      if (LI->isVolatile())
+        return true;
+    } else {
+      return true;
+    }
+  return false;
+}
+
+bool IPSCCP::runOnModule(Module &M) {
+  SCCPSolver Solver;
+
+  // Loop over all functions, marking arguments to those with their addresses
+  // taken or that are external as overdefined.
+  //
+  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F)
+    if (!F->hasLocalLinkage() || AddressIsTaken(F)) {
+      if (!F->isDeclaration())
+        Solver.MarkBlockExecutable(F->begin());
+      for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
+           AI != E; ++AI)
+        Solver.markOverdefined(AI);
+    } else {
+      Solver.AddTrackedFunction(F);
+    }
+
+  // Loop over global variables.  We inform the solver about any internal global
+  // variables that do not have their 'addresses taken'.  If they don't have
+  // their addresses taken, we can propagate constants through them.
+  for (Module::global_iterator G = M.global_begin(), E = M.global_end();
+       G != E; ++G)
+    if (!G->isConstant() && G->hasLocalLinkage() && !AddressIsTaken(G))
+      Solver.TrackValueOfGlobalVariable(G);
+
+  // Solve for constants.
+  bool ResolvedUndefs = true;
+  while (ResolvedUndefs) {
+    Solver.Solve();
+
+    DOUT << "RESOLVING UNDEFS\n";
+    ResolvedUndefs = false;
+    for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F)
+      ResolvedUndefs |= Solver.ResolvedUndefsIn(*F);
+  }
+
+  bool MadeChanges = false;
+
+  // Iterate over all of the instructions in the module, replacing them with
+  // constants if we have found them to be of constant values.
+  //
+  SmallVector<Instruction*, 512> Insts;
+  SmallVector<BasicBlock*, 512> BlocksToErase;
+  std::map<Value*, LatticeVal> &Values = Solver.getValueMapping();
+
+  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+    for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
+         AI != E; ++AI)
+      if (!AI->use_empty()) {
+        LatticeVal &IV = Values[AI];
+        if (IV.isConstant() || IV.isUndefined()) {
+          Constant *CST = IV.isConstant() ?
+            IV.getConstant() : UndefValue::get(AI->getType());
+          DOUT << "***  Arg " << *AI << " = " << *CST <<"\n";
+
+          // Replaces all of the uses of a variable with uses of the
+          // constant.
+          AI->replaceAllUsesWith(CST);
+          ++IPNumArgsElimed;
+        }
+      }
+
+    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+      if (!Solver.isBlockExecutable(BB)) {
+        DOUT << "  BasicBlock Dead:" << *BB;
+        ++IPNumDeadBlocks;
+
+        // Delete the instructions backwards, as it has a reduced likelihood of
+        // having to update as many def-use and use-def chains.
+        TerminatorInst *TI = BB->getTerminator();
+        for (BasicBlock::iterator I = BB->begin(), E = TI; I != E; ++I)
+          Insts.push_back(I);
+
+        while (!Insts.empty()) {
+          Instruction *I = Insts.back();
+          Insts.pop_back();
+          if (!I->use_empty())
+            I->replaceAllUsesWith(UndefValue::get(I->getType()));
+          BB->getInstList().erase(I);
+          MadeChanges = true;
+          ++IPNumInstRemoved;
+        }
+
+        for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) {
+          BasicBlock *Succ = TI->getSuccessor(i);
+          if (!Succ->empty() && isa<PHINode>(Succ->begin()))
+            TI->getSuccessor(i)->removePredecessor(BB);
+        }
+        if (!TI->use_empty())
+          TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
+        BB->getInstList().erase(TI);
+
+        if (&*BB != &F->front())
+          BlocksToErase.push_back(BB);
+        else
+          new UnreachableInst(BB);
+
+      } else {
+        for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) {
+          Instruction *Inst = BI++;
+          if (Inst->getType() == Type::VoidTy)
+            continue;
+          
+          LatticeVal &IV = Values[Inst];
+          if (!IV.isConstant() && !IV.isUndefined())
+            continue;
+          
+          Constant *Const = IV.isConstant()
+            ? IV.getConstant() : UndefValue::get(Inst->getType());
+          DOUT << "  Constant: " << *Const << " = " << *Inst;
+
+          // Replaces all of the uses of a variable with uses of the
+          // constant.
+          Inst->replaceAllUsesWith(Const);
+          
+          // Delete the instruction.
+          if (!isa<CallInst>(Inst) && !isa<TerminatorInst>(Inst))
+            Inst->eraseFromParent();
+
+          // Hey, we just changed something!
+          MadeChanges = true;
+          ++IPNumInstRemoved;
+        }
+      }
+
+    // Now that all instructions in the function are constant folded, erase dead
+    // blocks, because we can now use ConstantFoldTerminator to get rid of
+    // in-edges.
+    for (unsigned i = 0, e = BlocksToErase.size(); i != e; ++i) {
+      // If there are any PHI nodes in this successor, drop entries for BB now.
+      BasicBlock *DeadBB = BlocksToErase[i];
+      while (!DeadBB->use_empty()) {
+        Instruction *I = cast<Instruction>(DeadBB->use_back());
+        bool Folded = ConstantFoldTerminator(I->getParent());
+        if (!Folded) {
+          // The constant folder may not have been able to fold the terminator
+          // if this is a branch or switch on undef.  Fold it manually as a
+          // branch to the first successor.
+#ifndef NDEBUG
+          if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
+            assert(BI->isConditional() && isa<UndefValue>(BI->getCondition()) &&
+                   "Branch should be foldable!");
+          } else if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+            assert(isa<UndefValue>(SI->getCondition()) && "Switch should fold");
+          } else {
+            assert(0 && "Didn't fold away reference to block!");
+          }
+#endif
+          
+          // Make this an uncond branch to the first successor.
+          TerminatorInst *TI = I->getParent()->getTerminator();
+          BranchInst::Create(TI->getSuccessor(0), TI);
+          
+          // Remove entries in successor phi nodes to remove edges.
+          for (unsigned i = 1, e = TI->getNumSuccessors(); i != e; ++i)
+            TI->getSuccessor(i)->removePredecessor(TI->getParent());
+          
+          // Remove the old terminator.
+          TI->eraseFromParent();
+        }
+      }
+
+      // Finally, delete the basic block.
+      F->getBasicBlockList().erase(DeadBB);
+    }
+    BlocksToErase.clear();
+  }
+
+  // If we inferred constant or undef return values for a function, we replaced
+  // all call uses with the inferred value.  This means we don't need to bother
+  // actually returning anything from the function.  Replace all return
+  // instructions with return undef.
+  // TODO: Process multiple value ret instructions also.
+  const DenseMap<Function*, LatticeVal> &RV = Solver.getTrackedRetVals();
+  for (DenseMap<Function*, LatticeVal>::const_iterator I = RV.begin(),
+         E = RV.end(); I != E; ++I)
+    if (!I->second.isOverdefined() &&
+        I->first->getReturnType() != Type::VoidTy) {
+      Function *F = I->first;
+      for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+        if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator()))
+          if (!isa<UndefValue>(RI->getOperand(0)))
+            RI->setOperand(0, UndefValue::get(F->getReturnType()));
+    }
+
+  // If we infered constant or undef values for globals variables, we can delete
+  // the global and any stores that remain to it.
+  const DenseMap<GlobalVariable*, LatticeVal> &TG = Solver.getTrackedGlobals();
+  for (DenseMap<GlobalVariable*, LatticeVal>::const_iterator I = TG.begin(),
+         E = TG.end(); I != E; ++I) {
+    GlobalVariable *GV = I->first;
+    assert(!I->second.isOverdefined() &&
+           "Overdefined values should have been taken out of the map!");
+    DOUT << "Found that GV '" << GV->getNameStart() << "' is constant!\n";
+    while (!GV->use_empty()) {
+      StoreInst *SI = cast<StoreInst>(GV->use_back());
+      SI->eraseFromParent();
+    }
+    M.getGlobalList().erase(GV);
+    ++IPNumGlobalConst;
+  }
+
+  return MadeChanges;
+}
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
new file mode 100644
index 0000000..5669da0
--- /dev/null
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -0,0 +1,111 @@
+//===-- Scalar.cpp --------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the C bindings for libLLVMScalarOpts.a, which implements
+// several scalar transformations over the LLVM intermediate representation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/Transforms/Scalar.h"
+#include "llvm/PassManager.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createAggressiveDCEPass());
+}
+
+void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createCFGSimplificationPass());
+}
+
+void LLVMAddCondPropagationPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createCondPropagationPass());
+}
+
+void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createDeadStoreEliminationPass());
+}
+
+void LLVMAddGVNPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createGVNPass());
+}
+
+void LLVMAddIndVarSimplifyPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createIndVarSimplifyPass());
+}
+
+void LLVMAddInstructionCombiningPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createInstructionCombiningPass());
+}
+
+void LLVMAddJumpThreadingPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createJumpThreadingPass());
+}
+
+void LLVMAddLICMPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLICMPass());
+}
+
+void LLVMAddLoopDeletionPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLoopDeletionPass());
+}
+
+void LLVMAddLoopIndexSplitPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLoopIndexSplitPass());
+}
+
+void LLVMAddLoopRotatePass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLoopRotatePass());
+}
+
+void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLoopUnrollPass());
+}
+
+void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLoopUnswitchPass());
+}
+
+void LLVMAddMemCpyOptPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createMemCpyOptPass());
+}
+
+void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createPromoteMemoryToRegisterPass());
+}
+
+void LLVMAddReassociatePass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createReassociatePass());
+}
+
+void LLVMAddSCCPPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createSCCPPass());
+}
+
+void LLVMAddScalarReplAggregatesPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createScalarReplAggregatesPass());
+}
+
+void LLVMAddSimplifyLibCallsPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createSimplifyLibCallsPass());
+}
+
+void LLVMAddTailCallEliminationPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createTailCallEliminationPass());
+}
+
+void LLVMAddConstantPropagationPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createConstantPropagationPass());
+}
+
+void LLVMAddDemoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createDemoteRegisterToMemoryPass());
+}
diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
new file mode 100644
index 0000000..9935f12
--- /dev/null
+++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -0,0 +1,1820 @@
+//===- ScalarReplAggregates.cpp - Scalar Replacement of Aggregates --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation implements the well known scalar replacement of
+// aggregates transformation.  This xform breaks up alloca instructions of
+// aggregate type (structure or array) into individual alloca instructions for
+// each member (if possible).  Then, if possible, it transforms the individual
+// alloca instructions into nice clean scalar SSA form.
+//
+// This combines a simple SRoA algorithm with the Mem2Reg algorithm because
+// often interact, especially for C++ programs.  As such, iterating between
+// SRoA, then Mem2Reg until we run out of things to promote works well.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "scalarrepl"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/IRBuilder.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+using namespace llvm;
+
+STATISTIC(NumReplaced,  "Number of allocas broken up");
+STATISTIC(NumPromoted,  "Number of allocas promoted");
+STATISTIC(NumConverted, "Number of aggregates converted to scalar");
+STATISTIC(NumGlobals,   "Number of allocas copied from constant global");
+
+namespace {
+  struct VISIBILITY_HIDDEN SROA : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    explicit SROA(signed T = -1) : FunctionPass(&ID) {
+      if (T == -1)
+        SRThreshold = 128;
+      else
+        SRThreshold = T;
+    }
+
+    bool runOnFunction(Function &F);
+
+    bool performScalarRepl(Function &F);
+    bool performPromotion(Function &F);
+
+    // getAnalysisUsage - This pass does not require any passes, but we know it
+    // will not alter the CFG, so say so.
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<DominatorTree>();
+      AU.addRequired<DominanceFrontier>();
+      AU.addRequired<TargetData>();
+      AU.setPreservesCFG();
+    }
+
+  private:
+    TargetData *TD;
+    
+    /// AllocaInfo - When analyzing uses of an alloca instruction, this captures
+    /// information about the uses.  All these fields are initialized to false
+    /// and set to true when something is learned.
+    struct AllocaInfo {
+      /// isUnsafe - This is set to true if the alloca cannot be SROA'd.
+      bool isUnsafe : 1;
+      
+      /// needsCleanup - This is set to true if there is some use of the alloca
+      /// that requires cleanup.
+      bool needsCleanup : 1;
+      
+      /// isMemCpySrc - This is true if this aggregate is memcpy'd from.
+      bool isMemCpySrc : 1;
+
+      /// isMemCpyDst - This is true if this aggregate is memcpy'd into.
+      bool isMemCpyDst : 1;
+
+      AllocaInfo()
+        : isUnsafe(false), needsCleanup(false), 
+          isMemCpySrc(false), isMemCpyDst(false) {}
+    };
+    
+    unsigned SRThreshold;
+
+    void MarkUnsafe(AllocaInfo &I) { I.isUnsafe = true; }
+
+    int isSafeAllocaToScalarRepl(AllocationInst *AI);
+
+    void isSafeUseOfAllocation(Instruction *User, AllocationInst *AI,
+                               AllocaInfo &Info);
+    void isSafeElementUse(Value *Ptr, bool isFirstElt, AllocationInst *AI,
+                         AllocaInfo &Info);
+    void isSafeMemIntrinsicOnAllocation(MemIntrinsic *MI, AllocationInst *AI,
+                                        unsigned OpNo, AllocaInfo &Info);
+    void isSafeUseOfBitCastedAllocation(BitCastInst *User, AllocationInst *AI,
+                                        AllocaInfo &Info);
+    
+    void DoScalarReplacement(AllocationInst *AI, 
+                             std::vector<AllocationInst*> &WorkList);
+    void CleanupGEP(GetElementPtrInst *GEP);
+    void CleanupAllocaUsers(AllocationInst *AI);
+    AllocaInst *AddNewAlloca(Function &F, const Type *Ty, AllocationInst *Base);
+    
+    void RewriteBitCastUserOfAlloca(Instruction *BCInst, AllocationInst *AI,
+                                    SmallVector<AllocaInst*, 32> &NewElts);
+    
+    void RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *BCInst,
+                                      AllocationInst *AI,
+                                      SmallVector<AllocaInst*, 32> &NewElts);
+    void RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocationInst *AI,
+                                       SmallVector<AllocaInst*, 32> &NewElts);
+    void RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocationInst *AI,
+                                      SmallVector<AllocaInst*, 32> &NewElts);
+    
+    bool CanConvertToScalar(Value *V, bool &IsNotTrivial, const Type *&VecTy,
+                            bool &SawVec, uint64_t Offset, unsigned AllocaSize);
+    void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset);
+    Value *ConvertScalar_ExtractValue(Value *NV, const Type *ToType,
+                                     uint64_t Offset, IRBuilder<> &Builder);
+    Value *ConvertScalar_InsertValue(Value *StoredVal, Value *ExistingVal,
+                                     uint64_t Offset, IRBuilder<> &Builder);
+    static Instruction *isOnlyCopiedFromConstantGlobal(AllocationInst *AI);
+  };
+}
+
+char SROA::ID = 0;
+static RegisterPass<SROA> X("scalarrepl", "Scalar Replacement of Aggregates");
+
+// Public interface to the ScalarReplAggregates pass
+FunctionPass *llvm::createScalarReplAggregatesPass(signed int Threshold) { 
+  return new SROA(Threshold);
+}
+
+
+bool SROA::runOnFunction(Function &F) {
+  TD = &getAnalysis<TargetData>();
+  
+  bool Changed = performPromotion(F);
+  while (1) {
+    bool LocalChange = performScalarRepl(F);
+    if (!LocalChange) break;   // No need to repromote if no scalarrepl
+    Changed = true;
+    LocalChange = performPromotion(F);
+    if (!LocalChange) break;   // No need to re-scalarrepl if no promotion
+  }
+
+  return Changed;
+}
+
+
+bool SROA::performPromotion(Function &F) {
+  std::vector<AllocaInst*> Allocas;
+  DominatorTree         &DT = getAnalysis<DominatorTree>();
+  DominanceFrontier &DF = getAnalysis<DominanceFrontier>();
+
+  BasicBlock &BB = F.getEntryBlock();  // Get the entry node for the function
+
+  bool Changed = false;
+
+  while (1) {
+    Allocas.clear();
+
+    // Find allocas that are safe to promote, by looking at all instructions in
+    // the entry node
+    for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
+      if (AllocaInst *AI = dyn_cast<AllocaInst>(I))       // Is it an alloca?
+        if (isAllocaPromotable(AI))
+          Allocas.push_back(AI);
+
+    if (Allocas.empty()) break;
+
+    PromoteMemToReg(Allocas, DT, DF);
+    NumPromoted += Allocas.size();
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+/// getNumSAElements - Return the number of elements in the specific struct or
+/// array.
+static uint64_t getNumSAElements(const Type *T) {
+  if (const StructType *ST = dyn_cast<StructType>(T))
+    return ST->getNumElements();
+  return cast<ArrayType>(T)->getNumElements();
+}
+
+// performScalarRepl - This algorithm is a simple worklist driven algorithm,
+// which runs on all of the malloc/alloca instructions in the function, removing
+// them if they are only used by getelementptr instructions.
+//
+bool SROA::performScalarRepl(Function &F) {
+  std::vector<AllocationInst*> WorkList;
+
+  // Scan the entry basic block, adding any alloca's and mallocs to the worklist
+  BasicBlock &BB = F.getEntryBlock();
+  for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I)
+    if (AllocationInst *A = dyn_cast<AllocationInst>(I))
+      WorkList.push_back(A);
+
+  // Process the worklist
+  bool Changed = false;
+  while (!WorkList.empty()) {
+    AllocationInst *AI = WorkList.back();
+    WorkList.pop_back();
+    
+    // Handle dead allocas trivially.  These can be formed by SROA'ing arrays
+    // with unused elements.
+    if (AI->use_empty()) {
+      AI->eraseFromParent();
+      continue;
+    }
+
+    // If this alloca is impossible for us to promote, reject it early.
+    if (AI->isArrayAllocation() || !AI->getAllocatedType()->isSized())
+      continue;
+    
+    // Check to see if this allocation is only modified by a memcpy/memmove from
+    // a constant global.  If this is the case, we can change all users to use
+    // the constant global instead.  This is commonly produced by the CFE by
+    // constructs like "void foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A'
+    // is only subsequently read.
+    if (Instruction *TheCopy = isOnlyCopiedFromConstantGlobal(AI)) {
+      DOUT << "Found alloca equal to global: " << *AI;
+      DOUT << "  memcpy = " << *TheCopy;
+      Constant *TheSrc = cast<Constant>(TheCopy->getOperand(2));
+      AI->replaceAllUsesWith(ConstantExpr::getBitCast(TheSrc, AI->getType()));
+      TheCopy->eraseFromParent();  // Don't mutate the global.
+      AI->eraseFromParent();
+      ++NumGlobals;
+      Changed = true;
+      continue;
+    }
+    
+    // Check to see if we can perform the core SROA transformation.  We cannot
+    // transform the allocation instruction if it is an array allocation
+    // (allocations OF arrays are ok though), and an allocation of a scalar
+    // value cannot be decomposed at all.
+    uint64_t AllocaSize = TD->getTypeAllocSize(AI->getAllocatedType());
+
+    // Do not promote any struct whose size is too big.
+    if (AllocaSize > SRThreshold) continue;
+        
+    if ((isa<StructType>(AI->getAllocatedType()) ||
+         isa<ArrayType>(AI->getAllocatedType())) &&
+        // Do not promote any struct into more than "32" separate vars.
+        getNumSAElements(AI->getAllocatedType()) <= SRThreshold/4) {
+      // Check that all of the users of the allocation are capable of being
+      // transformed.
+      switch (isSafeAllocaToScalarRepl(AI)) {
+      default: assert(0 && "Unexpected value!");
+      case 0:  // Not safe to scalar replace.
+        break;
+      case 1:  // Safe, but requires cleanup/canonicalizations first
+        CleanupAllocaUsers(AI);
+        // FALL THROUGH.
+      case 3:  // Safe to scalar replace.
+        DoScalarReplacement(AI, WorkList);
+        Changed = true;
+        continue;
+      }
+    }
+
+    // If we can turn this aggregate value (potentially with casts) into a
+    // simple scalar value that can be mem2reg'd into a register value.
+    // IsNotTrivial tracks whether this is something that mem2reg could have
+    // promoted itself.  If so, we don't want to transform it needlessly.  Note
+    // that we can't just check based on the type: the alloca may be of an i32
+    // but that has pointer arithmetic to set byte 3 of it or something.
+    bool IsNotTrivial = false;
+    const Type *VectorTy = 0;
+    bool HadAVector = false;
+    if (CanConvertToScalar(AI, IsNotTrivial, VectorTy, HadAVector, 
+                           0, unsigned(AllocaSize)) && IsNotTrivial) {
+      AllocaInst *NewAI;
+      // If we were able to find a vector type that can handle this with
+      // insert/extract elements, and if there was at least one use that had
+      // a vector type, promote this to a vector.  We don't want to promote
+      // random stuff that doesn't use vectors (e.g. <9 x double>) because then
+      // we just get a lot of insert/extracts.  If at least one vector is
+      // involved, then we probably really do have a union of vector/array.
+      if (VectorTy && isa<VectorType>(VectorTy) && HadAVector) {
+        DOUT << "CONVERT TO VECTOR: " << *AI << "  TYPE = " << *VectorTy <<"\n";
+        
+        // Create and insert the vector alloca.
+        NewAI = new AllocaInst(VectorTy, 0, "", AI->getParent()->begin());
+        ConvertUsesToScalar(AI, NewAI, 0);
+      } else {
+        DOUT << "CONVERT TO SCALAR INTEGER: " << *AI << "\n";
+        
+        // Create and insert the integer alloca.
+        const Type *NewTy = IntegerType::get(AllocaSize*8);
+        NewAI = new AllocaInst(NewTy, 0, "", AI->getParent()->begin());
+        ConvertUsesToScalar(AI, NewAI, 0);
+      }
+      NewAI->takeName(AI);
+      AI->eraseFromParent();
+      ++NumConverted;
+      Changed = true;
+      continue;
+    }
+    
+    // Otherwise, couldn't process this alloca.
+  }
+
+  return Changed;
+}
+
+/// DoScalarReplacement - This alloca satisfied the isSafeAllocaToScalarRepl
+/// predicate, do SROA now.
+void SROA::DoScalarReplacement(AllocationInst *AI, 
+                               std::vector<AllocationInst*> &WorkList) {
+  DOUT << "Found inst to SROA: " << *AI;
+  SmallVector<AllocaInst*, 32> ElementAllocas;
+  if (const StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) {
+    ElementAllocas.reserve(ST->getNumContainedTypes());
+    for (unsigned i = 0, e = ST->getNumContainedTypes(); i != e; ++i) {
+      AllocaInst *NA = new AllocaInst(ST->getContainedType(i), 0, 
+                                      AI->getAlignment(),
+                                      AI->getName() + "." + utostr(i), AI);
+      ElementAllocas.push_back(NA);
+      WorkList.push_back(NA);  // Add to worklist for recursive processing
+    }
+  } else {
+    const ArrayType *AT = cast<ArrayType>(AI->getAllocatedType());
+    ElementAllocas.reserve(AT->getNumElements());
+    const Type *ElTy = AT->getElementType();
+    for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
+      AllocaInst *NA = new AllocaInst(ElTy, 0, AI->getAlignment(),
+                                      AI->getName() + "." + utostr(i), AI);
+      ElementAllocas.push_back(NA);
+      WorkList.push_back(NA);  // Add to worklist for recursive processing
+    }
+  }
+
+  // Now that we have created the alloca instructions that we want to use,
+  // expand the getelementptr instructions to use them.
+  //
+  while (!AI->use_empty()) {
+    Instruction *User = cast<Instruction>(AI->use_back());
+    if (BitCastInst *BCInst = dyn_cast<BitCastInst>(User)) {
+      RewriteBitCastUserOfAlloca(BCInst, AI, ElementAllocas);
+      BCInst->eraseFromParent();
+      continue;
+    }
+    
+    // Replace:
+    //   %res = load { i32, i32 }* %alloc
+    // with:
+    //   %load.0 = load i32* %alloc.0
+    //   %insert.0 insertvalue { i32, i32 } zeroinitializer, i32 %load.0, 0 
+    //   %load.1 = load i32* %alloc.1
+    //   %insert = insertvalue { i32, i32 } %insert.0, i32 %load.1, 1 
+    // (Also works for arrays instead of structs)
+    if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+      Value *Insert = UndefValue::get(LI->getType());
+      for (unsigned i = 0, e = ElementAllocas.size(); i != e; ++i) {
+        Value *Load = new LoadInst(ElementAllocas[i], "load", LI);
+        Insert = InsertValueInst::Create(Insert, Load, i, "insert", LI);
+      }
+      LI->replaceAllUsesWith(Insert);
+      LI->eraseFromParent();
+      continue;
+    }
+
+    // Replace:
+    //   store { i32, i32 } %val, { i32, i32 }* %alloc
+    // with:
+    //   %val.0 = extractvalue { i32, i32 } %val, 0 
+    //   store i32 %val.0, i32* %alloc.0
+    //   %val.1 = extractvalue { i32, i32 } %val, 1 
+    //   store i32 %val.1, i32* %alloc.1
+    // (Also works for arrays instead of structs)
+    if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+      Value *Val = SI->getOperand(0);
+      for (unsigned i = 0, e = ElementAllocas.size(); i != e; ++i) {
+        Value *Extract = ExtractValueInst::Create(Val, i, Val->getName(), SI);
+        new StoreInst(Extract, ElementAllocas[i], SI);
+      }
+      SI->eraseFromParent();
+      continue;
+    }
+    
+    GetElementPtrInst *GEPI = cast<GetElementPtrInst>(User);
+    // We now know that the GEP is of the form: GEP <ptr>, 0, <cst>
+    unsigned Idx =
+       (unsigned)cast<ConstantInt>(GEPI->getOperand(2))->getZExtValue();
+
+    assert(Idx < ElementAllocas.size() && "Index out of range?");
+    AllocaInst *AllocaToUse = ElementAllocas[Idx];
+
+    Value *RepValue;
+    if (GEPI->getNumOperands() == 3) {
+      // Do not insert a new getelementptr instruction with zero indices, only
+      // to have it optimized out later.
+      RepValue = AllocaToUse;
+    } else {
+      // We are indexing deeply into the structure, so we still need a
+      // getelement ptr instruction to finish the indexing.  This may be
+      // expanded itself once the worklist is rerun.
+      //
+      SmallVector<Value*, 8> NewArgs;
+      NewArgs.push_back(Constant::getNullValue(Type::Int32Ty));
+      NewArgs.append(GEPI->op_begin()+3, GEPI->op_end());
+      RepValue = GetElementPtrInst::Create(AllocaToUse, NewArgs.begin(),
+                                           NewArgs.end(), "", GEPI);
+      RepValue->takeName(GEPI);
+    }
+    
+    // If this GEP is to the start of the aggregate, check for memcpys.
+    if (Idx == 0 && GEPI->hasAllZeroIndices())
+      RewriteBitCastUserOfAlloca(GEPI, AI, ElementAllocas);
+
+    // Move all of the users over to the new GEP.
+    GEPI->replaceAllUsesWith(RepValue);
+    // Delete the old GEP
+    GEPI->eraseFromParent();
+  }
+
+  // Finally, delete the Alloca instruction
+  AI->eraseFromParent();
+  NumReplaced++;
+}
+
+
+/// isSafeElementUse - Check to see if this use is an allowed use for a
+/// getelementptr instruction of an array aggregate allocation.  isFirstElt
+/// indicates whether Ptr is known to the start of the aggregate.
+///
+void SROA::isSafeElementUse(Value *Ptr, bool isFirstElt, AllocationInst *AI,
+                            AllocaInfo &Info) {
+  for (Value::use_iterator I = Ptr->use_begin(), E = Ptr->use_end();
+       I != E; ++I) {
+    Instruction *User = cast<Instruction>(*I);
+    switch (User->getOpcode()) {
+    case Instruction::Load:  break;
+    case Instruction::Store:
+      // Store is ok if storing INTO the pointer, not storing the pointer
+      if (User->getOperand(0) == Ptr) return MarkUnsafe(Info);
+      break;
+    case Instruction::GetElementPtr: {
+      GetElementPtrInst *GEP = cast<GetElementPtrInst>(User);
+      bool AreAllZeroIndices = isFirstElt;
+      if (GEP->getNumOperands() > 1) {
+        if (!isa<ConstantInt>(GEP->getOperand(1)) ||
+            !cast<ConstantInt>(GEP->getOperand(1))->isZero())
+          // Using pointer arithmetic to navigate the array.
+          return MarkUnsafe(Info);
+       
+        if (AreAllZeroIndices)
+          AreAllZeroIndices = GEP->hasAllZeroIndices();
+      }
+      isSafeElementUse(GEP, AreAllZeroIndices, AI, Info);
+      if (Info.isUnsafe) return;
+      break;
+    }
+    case Instruction::BitCast:
+      if (isFirstElt) {
+        isSafeUseOfBitCastedAllocation(cast<BitCastInst>(User), AI, Info);
+        if (Info.isUnsafe) return;
+        break;
+      }
+      DOUT << "  Transformation preventing inst: " << *User;
+      return MarkUnsafe(Info);
+    case Instruction::Call:
+      if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) {
+        if (isFirstElt) {
+          isSafeMemIntrinsicOnAllocation(MI, AI, I.getOperandNo(), Info);
+          if (Info.isUnsafe) return;
+          break;
+        }
+      }
+      DOUT << "  Transformation preventing inst: " << *User;
+      return MarkUnsafe(Info);
+    default:
+      DOUT << "  Transformation preventing inst: " << *User;
+      return MarkUnsafe(Info);
+    }
+  }
+  return;  // All users look ok :)
+}
+
+/// AllUsersAreLoads - Return true if all users of this value are loads.
+static bool AllUsersAreLoads(Value *Ptr) {
+  for (Value::use_iterator I = Ptr->use_begin(), E = Ptr->use_end();
+       I != E; ++I)
+    if (cast<Instruction>(*I)->getOpcode() != Instruction::Load)
+      return false;
+  return true;
+}
+
+/// isSafeUseOfAllocation - Check to see if this user is an allowed use for an
+/// aggregate allocation.
+///
+void SROA::isSafeUseOfAllocation(Instruction *User, AllocationInst *AI,
+                                 AllocaInfo &Info) {
+  if (BitCastInst *C = dyn_cast<BitCastInst>(User))
+    return isSafeUseOfBitCastedAllocation(C, AI, Info);
+
+  if (LoadInst *LI = dyn_cast<LoadInst>(User))
+    if (!LI->isVolatile())
+      return;// Loads (returning a first class aggregrate) are always rewritable
+
+  if (StoreInst *SI = dyn_cast<StoreInst>(User))
+    if (!SI->isVolatile() && SI->getOperand(0) != AI)
+      return;// Store is ok if storing INTO the pointer, not storing the pointer
+ 
+  GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User);
+  if (GEPI == 0)
+    return MarkUnsafe(Info);
+
+  gep_type_iterator I = gep_type_begin(GEPI), E = gep_type_end(GEPI);
+
+  // The GEP is not safe to transform if not of the form "GEP <ptr>, 0, <cst>".
+  if (I == E ||
+      I.getOperand() != Constant::getNullValue(I.getOperand()->getType())) {
+    return MarkUnsafe(Info);
+  }
+
+  ++I;
+  if (I == E) return MarkUnsafe(Info);  // ran out of GEP indices??
+
+  bool IsAllZeroIndices = true;
+  
+  // If the first index is a non-constant index into an array, see if we can
+  // handle it as a special case.
+  if (const ArrayType *AT = dyn_cast<ArrayType>(*I)) {
+    if (!isa<ConstantInt>(I.getOperand())) {
+      IsAllZeroIndices = 0;
+      uint64_t NumElements = AT->getNumElements();
+      
+      // If this is an array index and the index is not constant, we cannot
+      // promote... that is unless the array has exactly one or two elements in
+      // it, in which case we CAN promote it, but we have to canonicalize this
+      // out if this is the only problem.
+      if ((NumElements == 1 || NumElements == 2) &&
+          AllUsersAreLoads(GEPI)) {
+        Info.needsCleanup = true;
+        return;  // Canonicalization required!
+      }
+      return MarkUnsafe(Info);
+    }
+  }
+ 
+  // Walk through the GEP type indices, checking the types that this indexes
+  // into.
+  for (; I != E; ++I) {
+    // Ignore struct elements, no extra checking needed for these.
+    if (isa<StructType>(*I))
+      continue;
+    
+    ConstantInt *IdxVal = dyn_cast<ConstantInt>(I.getOperand());
+    if (!IdxVal) return MarkUnsafe(Info);
+
+    // Are all indices still zero?
+    IsAllZeroIndices &= IdxVal->isZero();
+    
+    if (const ArrayType *AT = dyn_cast<ArrayType>(*I)) {
+      // This GEP indexes an array.  Verify that this is an in-range constant
+      // integer. Specifically, consider A[0][i]. We cannot know that the user
+      // isn't doing invalid things like allowing i to index an out-of-range
+      // subscript that accesses A[1].  Because of this, we have to reject SROA
+      // of any accesses into structs where any of the components are variables. 
+      if (IdxVal->getZExtValue() >= AT->getNumElements())
+        return MarkUnsafe(Info);
+    } else if (const VectorType *VT = dyn_cast<VectorType>(*I)) {
+      if (IdxVal->getZExtValue() >= VT->getNumElements())
+        return MarkUnsafe(Info);
+    }
+  }
+  
+  // If there are any non-simple uses of this getelementptr, make sure to reject
+  // them.
+  return isSafeElementUse(GEPI, IsAllZeroIndices, AI, Info);
+}
+
+/// isSafeMemIntrinsicOnAllocation - Return true if the specified memory
+/// intrinsic can be promoted by SROA.  At this point, we know that the operand
+/// of the memintrinsic is a pointer to the beginning of the allocation.
+void SROA::isSafeMemIntrinsicOnAllocation(MemIntrinsic *MI, AllocationInst *AI,
+                                          unsigned OpNo, AllocaInfo &Info) {
+  // If not constant length, give up.
+  ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength());
+  if (!Length) return MarkUnsafe(Info);
+  
+  // If not the whole aggregate, give up.
+  if (Length->getZExtValue() !=
+      TD->getTypeAllocSize(AI->getType()->getElementType()))
+    return MarkUnsafe(Info);
+  
+  // We only know about memcpy/memset/memmove.
+  if (!isa<MemIntrinsic>(MI))
+    return MarkUnsafe(Info);
+  
+  // Otherwise, we can transform it.  Determine whether this is a memcpy/set
+  // into or out of the aggregate.
+  if (OpNo == 1)
+    Info.isMemCpyDst = true;
+  else {
+    assert(OpNo == 2);
+    Info.isMemCpySrc = true;
+  }
+}
+
+/// isSafeUseOfBitCastedAllocation - Return true if all users of this bitcast
+/// are 
+void SROA::isSafeUseOfBitCastedAllocation(BitCastInst *BC, AllocationInst *AI,
+                                          AllocaInfo &Info) {
+  for (Value::use_iterator UI = BC->use_begin(), E = BC->use_end();
+       UI != E; ++UI) {
+    if (BitCastInst *BCU = dyn_cast<BitCastInst>(UI)) {
+      isSafeUseOfBitCastedAllocation(BCU, AI, Info);
+    } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(UI)) {
+      isSafeMemIntrinsicOnAllocation(MI, AI, UI.getOperandNo(), Info);
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(UI)) {
+      if (SI->isVolatile())
+        return MarkUnsafe(Info);
+      
+      // If storing the entire alloca in one chunk through a bitcasted pointer
+      // to integer, we can transform it.  This happens (for example) when you
+      // cast a {i32,i32}* to i64* and store through it.  This is similar to the
+      // memcpy case and occurs in various "byval" cases and emulated memcpys.
+      if (isa<IntegerType>(SI->getOperand(0)->getType()) &&
+          TD->getTypeAllocSize(SI->getOperand(0)->getType()) ==
+          TD->getTypeAllocSize(AI->getType()->getElementType())) {
+        Info.isMemCpyDst = true;
+        continue;
+      }
+      return MarkUnsafe(Info);
+    } else if (LoadInst *LI = dyn_cast<LoadInst>(UI)) {
+      if (LI->isVolatile())
+        return MarkUnsafe(Info);
+
+      // If loading the entire alloca in one chunk through a bitcasted pointer
+      // to integer, we can transform it.  This happens (for example) when you
+      // cast a {i32,i32}* to i64* and load through it.  This is similar to the
+      // memcpy case and occurs in various "byval" cases and emulated memcpys.
+      if (isa<IntegerType>(LI->getType()) &&
+          TD->getTypeAllocSize(LI->getType()) ==
+          TD->getTypeAllocSize(AI->getType()->getElementType())) {
+        Info.isMemCpySrc = true;
+        continue;
+      }
+      return MarkUnsafe(Info);
+    } else if (isa<DbgInfoIntrinsic>(UI)) {
+      // If one user is DbgInfoIntrinsic then check if all users are
+      // DbgInfoIntrinsics.
+      if (OnlyUsedByDbgInfoIntrinsics(BC)) {
+        Info.needsCleanup = true;
+        return;
+      }
+      else
+        MarkUnsafe(Info);
+    }
+    else {
+      return MarkUnsafe(Info);
+    }
+    if (Info.isUnsafe) return;
+  }
+}
+
+/// RewriteBitCastUserOfAlloca - BCInst (transitively) bitcasts AI, or indexes
+/// to its first element.  Transform users of the cast to use the new values
+/// instead.
+void SROA::RewriteBitCastUserOfAlloca(Instruction *BCInst, AllocationInst *AI,
+                                      SmallVector<AllocaInst*, 32> &NewElts) {
+  Value::use_iterator UI = BCInst->use_begin(), UE = BCInst->use_end();
+  while (UI != UE) {
+    Instruction *User = cast<Instruction>(*UI++);
+    if (BitCastInst *BCU = dyn_cast<BitCastInst>(User)) {
+      RewriteBitCastUserOfAlloca(BCU, AI, NewElts);
+      if (BCU->use_empty()) BCU->eraseFromParent();
+      continue;
+    }
+
+    if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) {
+      // This must be memcpy/memmove/memset of the entire aggregate.
+      // Split into one per element.
+      RewriteMemIntrinUserOfAlloca(MI, BCInst, AI, NewElts);
+      continue;
+    }
+      
+    if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+      // If this is a store of the entire alloca from an integer, rewrite it.
+      RewriteStoreUserOfWholeAlloca(SI, AI, NewElts);
+      continue;
+    }
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+      // If this is a load of the entire alloca to an integer, rewrite it.
+      RewriteLoadUserOfWholeAlloca(LI, AI, NewElts);
+      continue;
+    }
+    
+    // Otherwise it must be some other user of a gep of the first pointer.  Just
+    // leave these alone.
+    continue;
+  }
+}
+
+/// RewriteMemIntrinUserOfAlloca - MI is a memcpy/memset/memmove from or to AI.
+/// Rewrite it to copy or set the elements of the scalarized memory.
+void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *BCInst,
+                                        AllocationInst *AI,
+                                        SmallVector<AllocaInst*, 32> &NewElts) {
+  
+  // If this is a memcpy/memmove, construct the other pointer as the
+  // appropriate type.  The "Other" pointer is the pointer that goes to memory
+  // that doesn't have anything to do with the alloca that we are promoting. For
+  // memset, this Value* stays null.
+  Value *OtherPtr = 0;
+  unsigned MemAlignment = MI->getAlignment();
+  if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) { // memmove/memcopy
+    if (BCInst == MTI->getRawDest())
+      OtherPtr = MTI->getRawSource();
+    else {
+      assert(BCInst == MTI->getRawSource());
+      OtherPtr = MTI->getRawDest();
+    }
+  }
+  
+  // If there is an other pointer, we want to convert it to the same pointer
+  // type as AI has, so we can GEP through it safely.
+  if (OtherPtr) {
+    // It is likely that OtherPtr is a bitcast, if so, remove it.
+    if (BitCastInst *BC = dyn_cast<BitCastInst>(OtherPtr))
+      OtherPtr = BC->getOperand(0);
+    // All zero GEPs are effectively bitcasts.
+    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(OtherPtr))
+      if (GEP->hasAllZeroIndices())
+        OtherPtr = GEP->getOperand(0);
+    
+    if (ConstantExpr *BCE = dyn_cast<ConstantExpr>(OtherPtr))
+      if (BCE->getOpcode() == Instruction::BitCast)
+        OtherPtr = BCE->getOperand(0);
+    
+    // If the pointer is not the right type, insert a bitcast to the right
+    // type.
+    if (OtherPtr->getType() != AI->getType())
+      OtherPtr = new BitCastInst(OtherPtr, AI->getType(), OtherPtr->getName(),
+                                 MI);
+  }
+  
+  // Process each element of the aggregate.
+  Value *TheFn = MI->getOperand(0);
+  const Type *BytePtrTy = MI->getRawDest()->getType();
+  bool SROADest = MI->getRawDest() == BCInst;
+  
+  Constant *Zero = Constant::getNullValue(Type::Int32Ty);
+
+  for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+    // If this is a memcpy/memmove, emit a GEP of the other element address.
+    Value *OtherElt = 0;
+    unsigned OtherEltAlign = MemAlignment;
+    
+    if (OtherPtr) {
+      Value *Idx[2] = { Zero, ConstantInt::get(Type::Int32Ty, i) };
+      OtherElt = GetElementPtrInst::Create(OtherPtr, Idx, Idx + 2,
+                                           OtherPtr->getNameStr()+"."+utostr(i),
+                                           MI);
+      uint64_t EltOffset;
+      const PointerType *OtherPtrTy = cast<PointerType>(OtherPtr->getType());
+      if (const StructType *ST =
+            dyn_cast<StructType>(OtherPtrTy->getElementType())) {
+        EltOffset = TD->getStructLayout(ST)->getElementOffset(i);
+      } else {
+        const Type *EltTy =
+          cast<SequentialType>(OtherPtr->getType())->getElementType();
+        EltOffset = TD->getTypeAllocSize(EltTy)*i;
+      }
+      
+      // The alignment of the other pointer is the guaranteed alignment of the
+      // element, which is affected by both the known alignment of the whole
+      // mem intrinsic and the alignment of the element.  If the alignment of
+      // the memcpy (f.e.) is 32 but the element is at a 4-byte offset, then the
+      // known alignment is just 4 bytes.
+      OtherEltAlign = (unsigned)MinAlign(OtherEltAlign, EltOffset);
+    }
+    
+    Value *EltPtr = NewElts[i];
+    const Type *EltTy = cast<PointerType>(EltPtr->getType())->getElementType();
+    
+    // If we got down to a scalar, insert a load or store as appropriate.
+    if (EltTy->isSingleValueType()) {
+      if (isa<MemTransferInst>(MI)) {
+        if (SROADest) {
+          // From Other to Alloca.
+          Value *Elt = new LoadInst(OtherElt, "tmp", false, OtherEltAlign, MI);
+          new StoreInst(Elt, EltPtr, MI);
+        } else {
+          // From Alloca to Other.
+          Value *Elt = new LoadInst(EltPtr, "tmp", MI);
+          new StoreInst(Elt, OtherElt, false, OtherEltAlign, MI);
+        }
+        continue;
+      }
+      assert(isa<MemSetInst>(MI));
+      
+      // If the stored element is zero (common case), just store a null
+      // constant.
+      Constant *StoreVal;
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(MI->getOperand(2))) {
+        if (CI->isZero()) {
+          StoreVal = Constant::getNullValue(EltTy);  // 0.0, null, 0, <0,0>
+        } else {
+          // If EltTy is a vector type, get the element type.
+          const Type *ValTy = EltTy;
+          if (const VectorType *VTy = dyn_cast<VectorType>(ValTy))
+            ValTy = VTy->getElementType();
+          
+          // Construct an integer with the right value.
+          unsigned EltSize = TD->getTypeSizeInBits(ValTy);
+          APInt OneVal(EltSize, CI->getZExtValue());
+          APInt TotalVal(OneVal);
+          // Set each byte.
+          for (unsigned i = 0; 8*i < EltSize; ++i) {
+            TotalVal = TotalVal.shl(8);
+            TotalVal |= OneVal;
+          }
+          
+          // Convert the integer value to the appropriate type.
+          StoreVal = ConstantInt::get(TotalVal);
+          if (isa<PointerType>(ValTy))
+            StoreVal = ConstantExpr::getIntToPtr(StoreVal, ValTy);
+          else if (ValTy->isFloatingPoint())
+            StoreVal = ConstantExpr::getBitCast(StoreVal, ValTy);
+          assert(StoreVal->getType() == ValTy && "Type mismatch!");
+          
+          // If the requested value was a vector constant, create it.
+          if (EltTy != ValTy) {
+            unsigned NumElts = cast<VectorType>(ValTy)->getNumElements();
+            SmallVector<Constant*, 16> Elts(NumElts, StoreVal);
+            StoreVal = ConstantVector::get(&Elts[0], NumElts);
+          }
+        }
+        new StoreInst(StoreVal, EltPtr, MI);
+        continue;
+      }
+      // Otherwise, if we're storing a byte variable, use a memset call for
+      // this element.
+    }
+    
+    // Cast the element pointer to BytePtrTy.
+    if (EltPtr->getType() != BytePtrTy)
+      EltPtr = new BitCastInst(EltPtr, BytePtrTy, EltPtr->getNameStr(), MI);
+    
+    // Cast the other pointer (if we have one) to BytePtrTy. 
+    if (OtherElt && OtherElt->getType() != BytePtrTy)
+      OtherElt = new BitCastInst(OtherElt, BytePtrTy,OtherElt->getNameStr(),
+                                 MI);
+    
+    unsigned EltSize = TD->getTypeAllocSize(EltTy);
+    
+    // Finally, insert the meminst for this element.
+    if (isa<MemTransferInst>(MI)) {
+      Value *Ops[] = {
+        SROADest ? EltPtr : OtherElt,  // Dest ptr
+        SROADest ? OtherElt : EltPtr,  // Src ptr
+        ConstantInt::get(MI->getOperand(3)->getType(), EltSize), // Size
+        ConstantInt::get(Type::Int32Ty, OtherEltAlign)  // Align
+      };
+      CallInst::Create(TheFn, Ops, Ops + 4, "", MI);
+    } else {
+      assert(isa<MemSetInst>(MI));
+      Value *Ops[] = {
+        EltPtr, MI->getOperand(2),  // Dest, Value,
+        ConstantInt::get(MI->getOperand(3)->getType(), EltSize), // Size
+        Zero  // Align
+      };
+      CallInst::Create(TheFn, Ops, Ops + 4, "", MI);
+    }
+  }
+  MI->eraseFromParent();
+}
+
+/// RewriteStoreUserOfWholeAlloca - We found an store of an integer that
+/// overwrites the entire allocation.  Extract out the pieces of the stored
+/// integer and store them individually.
+void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI,
+                                         AllocationInst *AI,
+                                         SmallVector<AllocaInst*, 32> &NewElts){
+  // Extract each element out of the integer according to its structure offset
+  // and store the element value to the individual alloca.
+  Value *SrcVal = SI->getOperand(0);
+  const Type *AllocaEltTy = AI->getType()->getElementType();
+  uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy);
+  
+  // If this isn't a store of an integer to the whole alloca, it may be a store
+  // to the first element.  Just ignore the store in this case and normal SROA
+  // will handle it.
+  if (!isa<IntegerType>(SrcVal->getType()) ||
+      TD->getTypeAllocSizeInBits(SrcVal->getType()) != AllocaSizeBits)
+    return;
+  // Handle tail padding by extending the operand
+  if (TD->getTypeSizeInBits(SrcVal->getType()) != AllocaSizeBits)
+    SrcVal = new ZExtInst(SrcVal, IntegerType::get(AllocaSizeBits), "", SI);
+
+  DOUT << "PROMOTING STORE TO WHOLE ALLOCA: " << *AI << *SI;
+
+  // There are two forms here: AI could be an array or struct.  Both cases
+  // have different ways to compute the element offset.
+  if (const StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) {
+    const StructLayout *Layout = TD->getStructLayout(EltSTy);
+    
+    for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+      // Get the number of bits to shift SrcVal to get the value.
+      const Type *FieldTy = EltSTy->getElementType(i);
+      uint64_t Shift = Layout->getElementOffsetInBits(i);
+      
+      if (TD->isBigEndian())
+        Shift = AllocaSizeBits-Shift-TD->getTypeAllocSizeInBits(FieldTy);
+      
+      Value *EltVal = SrcVal;
+      if (Shift) {
+        Value *ShiftVal = ConstantInt::get(EltVal->getType(), Shift);
+        EltVal = BinaryOperator::CreateLShr(EltVal, ShiftVal,
+                                            "sroa.store.elt", SI);
+      }
+      
+      // Truncate down to an integer of the right size.
+      uint64_t FieldSizeBits = TD->getTypeSizeInBits(FieldTy);
+      
+      // Ignore zero sized fields like {}, they obviously contain no data.
+      if (FieldSizeBits == 0) continue;
+      
+      if (FieldSizeBits != AllocaSizeBits)
+        EltVal = new TruncInst(EltVal, IntegerType::get(FieldSizeBits), "", SI);
+      Value *DestField = NewElts[i];
+      if (EltVal->getType() == FieldTy) {
+        // Storing to an integer field of this size, just do it.
+      } else if (FieldTy->isFloatingPoint() || isa<VectorType>(FieldTy)) {
+        // Bitcast to the right element type (for fp/vector values).
+        EltVal = new BitCastInst(EltVal, FieldTy, "", SI);
+      } else {
+        // Otherwise, bitcast the dest pointer (for aggregates).
+        DestField = new BitCastInst(DestField,
+                                    PointerType::getUnqual(EltVal->getType()),
+                                    "", SI);
+      }
+      new StoreInst(EltVal, DestField, SI);
+    }
+    
+  } else {
+    const ArrayType *ATy = cast<ArrayType>(AllocaEltTy);
+    const Type *ArrayEltTy = ATy->getElementType();
+    uint64_t ElementOffset = TD->getTypeAllocSizeInBits(ArrayEltTy);
+    uint64_t ElementSizeBits = TD->getTypeSizeInBits(ArrayEltTy);
+
+    uint64_t Shift;
+    
+    if (TD->isBigEndian())
+      Shift = AllocaSizeBits-ElementOffset;
+    else 
+      Shift = 0;
+    
+    for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+      // Ignore zero sized fields like {}, they obviously contain no data.
+      if (ElementSizeBits == 0) continue;
+      
+      Value *EltVal = SrcVal;
+      if (Shift) {
+        Value *ShiftVal = ConstantInt::get(EltVal->getType(), Shift);
+        EltVal = BinaryOperator::CreateLShr(EltVal, ShiftVal,
+                                            "sroa.store.elt", SI);
+      }
+      
+      // Truncate down to an integer of the right size.
+      if (ElementSizeBits != AllocaSizeBits)
+        EltVal = new TruncInst(EltVal, IntegerType::get(ElementSizeBits),"",SI);
+      Value *DestField = NewElts[i];
+      if (EltVal->getType() == ArrayEltTy) {
+        // Storing to an integer field of this size, just do it.
+      } else if (ArrayEltTy->isFloatingPoint() || isa<VectorType>(ArrayEltTy)) {
+        // Bitcast to the right element type (for fp/vector values).
+        EltVal = new BitCastInst(EltVal, ArrayEltTy, "", SI);
+      } else {
+        // Otherwise, bitcast the dest pointer (for aggregates).
+        DestField = new BitCastInst(DestField,
+                                    PointerType::getUnqual(EltVal->getType()),
+                                    "", SI);
+      }
+      new StoreInst(EltVal, DestField, SI);
+      
+      if (TD->isBigEndian())
+        Shift -= ElementOffset;
+      else 
+        Shift += ElementOffset;
+    }
+  }
+  
+  SI->eraseFromParent();
+}
+
+/// RewriteLoadUserOfWholeAlloca - We found an load of the entire allocation to
+/// an integer.  Load the individual pieces to form the aggregate value.
+void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocationInst *AI,
+                                        SmallVector<AllocaInst*, 32> &NewElts) {
+  // Extract each element out of the NewElts according to its structure offset
+  // and form the result value.
+  const Type *AllocaEltTy = AI->getType()->getElementType();
+  uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy);
+  
+  // If this isn't a load of the whole alloca to an integer, it may be a load
+  // of the first element.  Just ignore the load in this case and normal SROA
+  // will handle it.
+  if (!isa<IntegerType>(LI->getType()) ||
+      TD->getTypeAllocSizeInBits(LI->getType()) != AllocaSizeBits)
+    return;
+  
+  DOUT << "PROMOTING LOAD OF WHOLE ALLOCA: " << *AI << *LI;
+  
+  // There are two forms here: AI could be an array or struct.  Both cases
+  // have different ways to compute the element offset.
+  const StructLayout *Layout = 0;
+  uint64_t ArrayEltBitOffset = 0;
+  if (const StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) {
+    Layout = TD->getStructLayout(EltSTy);
+  } else {
+    const Type *ArrayEltTy = cast<ArrayType>(AllocaEltTy)->getElementType();
+    ArrayEltBitOffset = TD->getTypeAllocSizeInBits(ArrayEltTy);
+  }    
+    
+  Value *ResultVal = Constant::getNullValue(IntegerType::get(AllocaSizeBits));
+  
+  for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+    // Load the value from the alloca.  If the NewElt is an aggregate, cast
+    // the pointer to an integer of the same size before doing the load.
+    Value *SrcField = NewElts[i];
+    const Type *FieldTy =
+      cast<PointerType>(SrcField->getType())->getElementType();
+    uint64_t FieldSizeBits = TD->getTypeSizeInBits(FieldTy);
+    
+    // Ignore zero sized fields like {}, they obviously contain no data.
+    if (FieldSizeBits == 0) continue;
+    
+    const IntegerType *FieldIntTy = IntegerType::get(FieldSizeBits);
+    if (!isa<IntegerType>(FieldTy) && !FieldTy->isFloatingPoint() &&
+        !isa<VectorType>(FieldTy))
+      SrcField = new BitCastInst(SrcField, PointerType::getUnqual(FieldIntTy),
+                                 "", LI);
+    SrcField = new LoadInst(SrcField, "sroa.load.elt", LI);
+
+    // If SrcField is a fp or vector of the right size but that isn't an
+    // integer type, bitcast to an integer so we can shift it.
+    if (SrcField->getType() != FieldIntTy)
+      SrcField = new BitCastInst(SrcField, FieldIntTy, "", LI);
+
+    // Zero extend the field to be the same size as the final alloca so that
+    // we can shift and insert it.
+    if (SrcField->getType() != ResultVal->getType())
+      SrcField = new ZExtInst(SrcField, ResultVal->getType(), "", LI);
+    
+    // Determine the number of bits to shift SrcField.
+    uint64_t Shift;
+    if (Layout) // Struct case.
+      Shift = Layout->getElementOffsetInBits(i);
+    else  // Array case.
+      Shift = i*ArrayEltBitOffset;
+    
+    if (TD->isBigEndian())
+      Shift = AllocaSizeBits-Shift-FieldIntTy->getBitWidth();
+    
+    if (Shift) {
+      Value *ShiftVal = ConstantInt::get(SrcField->getType(), Shift);
+      SrcField = BinaryOperator::CreateShl(SrcField, ShiftVal, "", LI);
+    }
+
+    ResultVal = BinaryOperator::CreateOr(SrcField, ResultVal, "", LI);
+  }
+
+  // Handle tail padding by truncating the result
+  if (TD->getTypeSizeInBits(LI->getType()) != AllocaSizeBits)
+    ResultVal = new TruncInst(ResultVal, LI->getType(), "", LI);
+
+  LI->replaceAllUsesWith(ResultVal);
+  LI->eraseFromParent();
+}
+
+
+/// HasPadding - Return true if the specified type has any structure or
+/// alignment padding, false otherwise.
+static bool HasPadding(const Type *Ty, const TargetData &TD) {
+  if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+    const StructLayout *SL = TD.getStructLayout(STy);
+    unsigned PrevFieldBitOffset = 0;
+    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+      unsigned FieldBitOffset = SL->getElementOffsetInBits(i);
+
+      // Padding in sub-elements?
+      if (HasPadding(STy->getElementType(i), TD))
+        return true;
+
+      // Check to see if there is any padding between this element and the
+      // previous one.
+      if (i) {
+        unsigned PrevFieldEnd =
+        PrevFieldBitOffset+TD.getTypeSizeInBits(STy->getElementType(i-1));
+        if (PrevFieldEnd < FieldBitOffset)
+          return true;
+      }
+
+      PrevFieldBitOffset = FieldBitOffset;
+    }
+
+    //  Check for tail padding.
+    if (unsigned EltCount = STy->getNumElements()) {
+      unsigned PrevFieldEnd = PrevFieldBitOffset +
+                   TD.getTypeSizeInBits(STy->getElementType(EltCount-1));
+      if (PrevFieldEnd < SL->getSizeInBits())
+        return true;
+    }
+
+  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+    return HasPadding(ATy->getElementType(), TD);
+  } else if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+    return HasPadding(VTy->getElementType(), TD);
+  }
+  return TD.getTypeSizeInBits(Ty) != TD.getTypeAllocSizeInBits(Ty);
+}
+
+/// isSafeStructAllocaToScalarRepl - Check to see if the specified allocation of
+/// an aggregate can be broken down into elements.  Return 0 if not, 3 if safe,
+/// or 1 if safe after canonicalization has been performed.
+///
+int SROA::isSafeAllocaToScalarRepl(AllocationInst *AI) {
+  // Loop over the use list of the alloca.  We can only transform it if all of
+  // the users are safe to transform.
+  AllocaInfo Info;
+  
+  for (Value::use_iterator I = AI->use_begin(), E = AI->use_end();
+       I != E; ++I) {
+    isSafeUseOfAllocation(cast<Instruction>(*I), AI, Info);
+    if (Info.isUnsafe) {
+      DOUT << "Cannot transform: " << *AI << "  due to user: " << **I;
+      return 0;
+    }
+  }
+  
+  // Okay, we know all the users are promotable.  If the aggregate is a memcpy
+  // source and destination, we have to be careful.  In particular, the memcpy
+  // could be moving around elements that live in structure padding of the LLVM
+  // types, but may actually be used.  In these cases, we refuse to promote the
+  // struct.
+  if (Info.isMemCpySrc && Info.isMemCpyDst &&
+      HasPadding(AI->getType()->getElementType(), *TD))
+    return 0;
+
+  // If we require cleanup, return 1, otherwise return 3.
+  return Info.needsCleanup ? 1 : 3;
+}
+
+/// CleanupGEP - GEP is used by an Alloca, which can be prompted after the GEP
+/// is canonicalized here.
+void SROA::CleanupGEP(GetElementPtrInst *GEPI) {
+  gep_type_iterator I = gep_type_begin(GEPI);
+  ++I;
+  
+  const ArrayType *AT = dyn_cast<ArrayType>(*I);
+  if (!AT) 
+    return;
+
+  uint64_t NumElements = AT->getNumElements();
+  
+  if (isa<ConstantInt>(I.getOperand()))
+    return;
+
+  if (NumElements == 1) {
+    GEPI->setOperand(2, Constant::getNullValue(Type::Int32Ty));
+    return;
+  } 
+    
+  assert(NumElements == 2 && "Unhandled case!");
+  // All users of the GEP must be loads.  At each use of the GEP, insert
+  // two loads of the appropriate indexed GEP and select between them.
+  Value *IsOne = new ICmpInst(ICmpInst::ICMP_NE, I.getOperand(), 
+                              Constant::getNullValue(I.getOperand()->getType()),
+                              "isone", GEPI);
+  // Insert the new GEP instructions, which are properly indexed.
+  SmallVector<Value*, 8> Indices(GEPI->op_begin()+1, GEPI->op_end());
+  Indices[1] = Constant::getNullValue(Type::Int32Ty);
+  Value *ZeroIdx = GetElementPtrInst::Create(GEPI->getOperand(0),
+                                             Indices.begin(),
+                                             Indices.end(),
+                                             GEPI->getName()+".0", GEPI);
+  Indices[1] = ConstantInt::get(Type::Int32Ty, 1);
+  Value *OneIdx = GetElementPtrInst::Create(GEPI->getOperand(0),
+                                            Indices.begin(),
+                                            Indices.end(),
+                                            GEPI->getName()+".1", GEPI);
+  // Replace all loads of the variable index GEP with loads from both
+  // indexes and a select.
+  while (!GEPI->use_empty()) {
+    LoadInst *LI = cast<LoadInst>(GEPI->use_back());
+    Value *Zero = new LoadInst(ZeroIdx, LI->getName()+".0", LI);
+    Value *One  = new LoadInst(OneIdx , LI->getName()+".1", LI);
+    Value *R = SelectInst::Create(IsOne, One, Zero, LI->getName(), LI);
+    LI->replaceAllUsesWith(R);
+    LI->eraseFromParent();
+  }
+  GEPI->eraseFromParent();
+}
+
+
+/// CleanupAllocaUsers - If SROA reported that it can promote the specified
+/// allocation, but only if cleaned up, perform the cleanups required.
+void SROA::CleanupAllocaUsers(AllocationInst *AI) {
+  // At this point, we know that the end result will be SROA'd and promoted, so
+  // we can insert ugly code if required so long as sroa+mem2reg will clean it
+  // up.
+  for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end();
+       UI != E; ) {
+    User *U = *UI++;
+    if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U))
+      CleanupGEP(GEPI);
+    else if (Instruction *I = dyn_cast<Instruction>(U)) {
+      SmallVector<DbgInfoIntrinsic *, 2> DbgInUses;
+      if (!isa<StoreInst>(I) && OnlyUsedByDbgInfoIntrinsics(I, &DbgInUses)) {
+        // Safe to remove debug info uses.
+        while (!DbgInUses.empty()) {
+          DbgInfoIntrinsic *DI = DbgInUses.back(); DbgInUses.pop_back();
+          DI->eraseFromParent();
+        }
+        I->eraseFromParent();
+      }
+    }
+  }
+}
+
+/// MergeInType - Add the 'In' type to the accumulated type (Accum) so far at
+/// the offset specified by Offset (which is specified in bytes).
+///
+/// There are two cases we handle here:
+///   1) A union of vector types of the same size and potentially its elements.
+///      Here we turn element accesses into insert/extract element operations.
+///      This promotes a <4 x float> with a store of float to the third element
+///      into a <4 x float> that uses insert element.
+///   2) A fully general blob of memory, which we turn into some (potentially
+///      large) integer type with extract and insert operations where the loads
+///      and stores would mutate the memory.
+static void MergeInType(const Type *In, uint64_t Offset, const Type *&VecTy,
+                        unsigned AllocaSize, const TargetData &TD) {
+  // If this could be contributing to a vector, analyze it.
+  if (VecTy != Type::VoidTy) { // either null or a vector type.
+
+    // If the In type is a vector that is the same size as the alloca, see if it
+    // matches the existing VecTy.
+    if (const VectorType *VInTy = dyn_cast<VectorType>(In)) {
+      if (VInTy->getBitWidth()/8 == AllocaSize && Offset == 0) {
+        // If we're storing/loading a vector of the right size, allow it as a
+        // vector.  If this the first vector we see, remember the type so that
+        // we know the element size.
+        if (VecTy == 0)
+          VecTy = VInTy;
+        return;
+      }
+    } else if (In == Type::FloatTy || In == Type::DoubleTy ||
+               (isa<IntegerType>(In) && In->getPrimitiveSizeInBits() >= 8 &&
+                isPowerOf2_32(In->getPrimitiveSizeInBits()))) {
+      // If we're accessing something that could be an element of a vector, see
+      // if the implied vector agrees with what we already have and if Offset is
+      // compatible with it.
+      unsigned EltSize = In->getPrimitiveSizeInBits()/8;
+      if (Offset % EltSize == 0 &&
+          AllocaSize % EltSize == 0 &&
+          (VecTy == 0 || 
+           cast<VectorType>(VecTy)->getElementType()
+                 ->getPrimitiveSizeInBits()/8 == EltSize)) {
+        if (VecTy == 0)
+          VecTy = VectorType::get(In, AllocaSize/EltSize);
+        return;
+      }
+    }
+  }
+  
+  // Otherwise, we have a case that we can't handle with an optimized vector
+  // form.  We can still turn this into a large integer.
+  VecTy = Type::VoidTy;
+}
+
+/// CanConvertToScalar - V is a pointer.  If we can convert the pointee and all
+/// its accesses to use a to single vector type, return true, and set VecTy to
+/// the new type.  If we could convert the alloca into a single promotable
+/// integer, return true but set VecTy to VoidTy.  Further, if the use is not a
+/// completely trivial use that mem2reg could promote, set IsNotTrivial.  Offset
+/// is the current offset from the base of the alloca being analyzed.
+///
+/// If we see at least one access to the value that is as a vector type, set the
+/// SawVec flag.
+///
+bool SROA::CanConvertToScalar(Value *V, bool &IsNotTrivial, const Type *&VecTy,
+                              bool &SawVec, uint64_t Offset,
+                              unsigned AllocaSize) {
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) {
+    Instruction *User = cast<Instruction>(*UI);
+    
+    if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+      // Don't break volatile loads.
+      if (LI->isVolatile())
+        return false;
+      MergeInType(LI->getType(), Offset, VecTy, AllocaSize, *TD);
+      SawVec |= isa<VectorType>(LI->getType());
+      continue;
+    }
+    
+    if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+      // Storing the pointer, not into the value?
+      if (SI->getOperand(0) == V || SI->isVolatile()) return 0;
+      MergeInType(SI->getOperand(0)->getType(), Offset, VecTy, AllocaSize, *TD);
+      SawVec |= isa<VectorType>(SI->getOperand(0)->getType());
+      continue;
+    }
+    
+    if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) {
+      if (!CanConvertToScalar(BCI, IsNotTrivial, VecTy, SawVec, Offset,
+                              AllocaSize))
+        return false;
+      IsNotTrivial = true;
+      continue;
+    }
+
+    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) {
+      // If this is a GEP with a variable indices, we can't handle it.
+      if (!GEP->hasAllConstantIndices())
+        return false;
+      
+      // Compute the offset that this GEP adds to the pointer.
+      SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end());
+      uint64_t GEPOffset = TD->getIndexedOffset(GEP->getOperand(0)->getType(),
+                                                &Indices[0], Indices.size());
+      // See if all uses can be converted.
+      if (!CanConvertToScalar(GEP, IsNotTrivial, VecTy, SawVec,Offset+GEPOffset,
+                              AllocaSize))
+        return false;
+      IsNotTrivial = true;
+      continue;
+    }
+
+    // If this is a constant sized memset of a constant value (e.g. 0) we can
+    // handle it.
+    if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) {
+      // Store of constant value and constant size.
+      if (isa<ConstantInt>(MSI->getValue()) &&
+          isa<ConstantInt>(MSI->getLength())) {
+        IsNotTrivial = true;
+        continue;
+      }
+    }
+
+    // If this is a memcpy or memmove into or out of the whole allocation, we
+    // can handle it like a load or store of the scalar type.
+    if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) {
+      if (ConstantInt *Len = dyn_cast<ConstantInt>(MTI->getLength()))
+        if (Len->getZExtValue() == AllocaSize && Offset == 0) {
+          IsNotTrivial = true;
+          continue;
+        }
+    }
+    
+    // Ignore dbg intrinsic.
+    if (isa<DbgInfoIntrinsic>(User))
+      continue;
+
+    // Otherwise, we cannot handle this!
+    return false;
+  }
+  
+  return true;
+}
+
+
+/// ConvertUsesToScalar - Convert all of the users of Ptr to use the new alloca
+/// directly.  This happens when we are converting an "integer union" to a
+/// single integer scalar, or when we are converting a "vector union" to a
+/// vector with insert/extractelement instructions.
+///
+/// Offset is an offset from the original alloca, in bits that need to be
+/// shifted to the right.  By the end of this, there should be no uses of Ptr.
+void SROA::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset) {
+  while (!Ptr->use_empty()) {
+    Instruction *User = cast<Instruction>(Ptr->use_back());
+
+    if (BitCastInst *CI = dyn_cast<BitCastInst>(User)) {
+      ConvertUsesToScalar(CI, NewAI, Offset);
+      CI->eraseFromParent();
+      continue;
+    }
+
+    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) {
+      // Compute the offset that this GEP adds to the pointer.
+      SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end());
+      uint64_t GEPOffset = TD->getIndexedOffset(GEP->getOperand(0)->getType(),
+                                                &Indices[0], Indices.size());
+      ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8);
+      GEP->eraseFromParent();
+      continue;
+    }
+    
+    IRBuilder<> Builder(User->getParent(), User);
+    
+    if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+      // The load is a bit extract from NewAI shifted right by Offset bits.
+      Value *LoadedVal = Builder.CreateLoad(NewAI, "tmp");
+      Value *NewLoadVal
+        = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset, Builder);
+      LI->replaceAllUsesWith(NewLoadVal);
+      LI->eraseFromParent();
+      continue;
+    }
+    
+    if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+      assert(SI->getOperand(0) != Ptr && "Consistency error!");
+      Value *Old = Builder.CreateLoad(NewAI, (NewAI->getName()+".in").c_str());
+      Value *New = ConvertScalar_InsertValue(SI->getOperand(0), Old, Offset,
+                                             Builder);
+      Builder.CreateStore(New, NewAI);
+      SI->eraseFromParent();
+      continue;
+    }
+    
+    // If this is a constant sized memset of a constant value (e.g. 0) we can
+    // transform it into a store of the expanded constant value.
+    if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) {
+      assert(MSI->getRawDest() == Ptr && "Consistency error!");
+      unsigned NumBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
+      if (NumBytes != 0) {
+        unsigned Val = cast<ConstantInt>(MSI->getValue())->getZExtValue();
+        
+        // Compute the value replicated the right number of times.
+        APInt APVal(NumBytes*8, Val);
+
+        // Splat the value if non-zero.
+        if (Val)
+          for (unsigned i = 1; i != NumBytes; ++i)
+            APVal |= APVal << 8;
+        
+        Value *Old = Builder.CreateLoad(NewAI, (NewAI->getName()+".in").c_str());
+        Value *New = ConvertScalar_InsertValue(ConstantInt::get(APVal), Old,
+                                               Offset, Builder);
+        Builder.CreateStore(New, NewAI);
+      }
+      MSI->eraseFromParent();
+      continue;
+    }
+
+    // If this is a memcpy or memmove into or out of the whole allocation, we
+    // can handle it like a load or store of the scalar type.
+    if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) {
+      assert(Offset == 0 && "must be store to start of alloca");
+      
+      // If the source and destination are both to the same alloca, then this is
+      // a noop copy-to-self, just delete it.  Otherwise, emit a load and store
+      // as appropriate.
+      AllocaInst *OrigAI = cast<AllocaInst>(Ptr->getUnderlyingObject());
+      
+      if (MTI->getSource()->getUnderlyingObject() != OrigAI) {
+        // Dest must be OrigAI, change this to be a load from the original
+        // pointer (bitcasted), then a store to our new alloca.
+        assert(MTI->getRawDest() == Ptr && "Neither use is of pointer?");
+        Value *SrcPtr = MTI->getSource();
+        SrcPtr = Builder.CreateBitCast(SrcPtr, NewAI->getType());
+        
+        LoadInst *SrcVal = Builder.CreateLoad(SrcPtr, "srcval");
+        SrcVal->setAlignment(MTI->getAlignment());
+        Builder.CreateStore(SrcVal, NewAI);
+      } else if (MTI->getDest()->getUnderlyingObject() != OrigAI) {
+        // Src must be OrigAI, change this to be a load from NewAI then a store
+        // through the original dest pointer (bitcasted).
+        assert(MTI->getRawSource() == Ptr && "Neither use is of pointer?");
+        LoadInst *SrcVal = Builder.CreateLoad(NewAI, "srcval");
+
+        Value *DstPtr = Builder.CreateBitCast(MTI->getDest(), NewAI->getType());
+        StoreInst *NewStore = Builder.CreateStore(SrcVal, DstPtr);
+        NewStore->setAlignment(MTI->getAlignment());
+      } else {
+        // Noop transfer. Src == Dst
+      }
+          
+
+      MTI->eraseFromParent();
+      continue;
+    }
+    
+    // If user is a dbg info intrinsic then it is safe to remove it.
+    if (isa<DbgInfoIntrinsic>(User)) {
+      User->eraseFromParent();
+      continue;
+    }
+
+    assert(0 && "Unsupported operation!");
+    abort();
+  }
+}
+
+/// ConvertScalar_ExtractValue - Extract a value of type ToType from an integer
+/// or vector value FromVal, extracting the bits from the offset specified by
+/// Offset.  This returns the value, which is of type ToType.
+///
+/// This happens when we are converting an "integer union" to a single
+/// integer scalar, or when we are converting a "vector union" to a vector with
+/// insert/extractelement instructions.
+///
+/// Offset is an offset from the original alloca, in bits that need to be
+/// shifted to the right.
+Value *SROA::ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType,
+                                        uint64_t Offset, IRBuilder<> &Builder) {
+  // If the load is of the whole new alloca, no conversion is needed.
+  if (FromVal->getType() == ToType && Offset == 0)
+    return FromVal;
+
+  // If the result alloca is a vector type, this is either an element
+  // access or a bitcast to another vector type of the same size.
+  if (const VectorType *VTy = dyn_cast<VectorType>(FromVal->getType())) {
+    if (isa<VectorType>(ToType))
+      return Builder.CreateBitCast(FromVal, ToType, "tmp");
+
+    // Otherwise it must be an element access.
+    unsigned Elt = 0;
+    if (Offset) {
+      unsigned EltSize = TD->getTypeAllocSizeInBits(VTy->getElementType());
+      Elt = Offset/EltSize;
+      assert(EltSize*Elt == Offset && "Invalid modulus in validity checking");
+    }
+    // Return the element extracted out of it.
+    Value *V = Builder.CreateExtractElement(FromVal,
+                                            ConstantInt::get(Type::Int32Ty,Elt),
+                                            "tmp");
+    if (V->getType() != ToType)
+      V = Builder.CreateBitCast(V, ToType, "tmp");
+    return V;
+  }
+  
+  // If ToType is a first class aggregate, extract out each of the pieces and
+  // use insertvalue's to form the FCA.
+  if (const StructType *ST = dyn_cast<StructType>(ToType)) {
+    const StructLayout &Layout = *TD->getStructLayout(ST);
+    Value *Res = UndefValue::get(ST);
+    for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
+      Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i),
+                                        Offset+Layout.getElementOffsetInBits(i),
+                                              Builder);
+      Res = Builder.CreateInsertValue(Res, Elt, i, "tmp");
+    }
+    return Res;
+  }
+  
+  if (const ArrayType *AT = dyn_cast<ArrayType>(ToType)) {
+    uint64_t EltSize = TD->getTypeAllocSizeInBits(AT->getElementType());
+    Value *Res = UndefValue::get(AT);
+    for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
+      Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(),
+                                              Offset+i*EltSize, Builder);
+      Res = Builder.CreateInsertValue(Res, Elt, i, "tmp");
+    }
+    return Res;
+  }
+
+  // Otherwise, this must be a union that was converted to an integer value.
+  const IntegerType *NTy = cast<IntegerType>(FromVal->getType());
+
+  // If this is a big-endian system and the load is narrower than the
+  // full alloca type, we need to do a shift to get the right bits.
+  int ShAmt = 0;
+  if (TD->isBigEndian()) {
+    // On big-endian machines, the lowest bit is stored at the bit offset
+    // from the pointer given by getTypeStoreSizeInBits.  This matters for
+    // integers with a bitwidth that is not a multiple of 8.
+    ShAmt = TD->getTypeStoreSizeInBits(NTy) -
+            TD->getTypeStoreSizeInBits(ToType) - Offset;
+  } else {
+    ShAmt = Offset;
+  }
+
+  // Note: we support negative bitwidths (with shl) which are not defined.
+  // We do this to support (f.e.) loads off the end of a structure where
+  // only some bits are used.
+  if (ShAmt > 0 && (unsigned)ShAmt < NTy->getBitWidth())
+    FromVal = Builder.CreateLShr(FromVal, ConstantInt::get(FromVal->getType(),
+                                                           ShAmt), "tmp");
+  else if (ShAmt < 0 && (unsigned)-ShAmt < NTy->getBitWidth())
+    FromVal = Builder.CreateShl(FromVal, ConstantInt::get(FromVal->getType(),
+                                                          -ShAmt), "tmp");
+
+  // Finally, unconditionally truncate the integer to the right width.
+  unsigned LIBitWidth = TD->getTypeSizeInBits(ToType);
+  if (LIBitWidth < NTy->getBitWidth())
+    FromVal = Builder.CreateTrunc(FromVal, IntegerType::get(LIBitWidth), "tmp");
+  else if (LIBitWidth > NTy->getBitWidth())
+    FromVal = Builder.CreateZExt(FromVal, IntegerType::get(LIBitWidth), "tmp");
+
+  // If the result is an integer, this is a trunc or bitcast.
+  if (isa<IntegerType>(ToType)) {
+    // Should be done.
+  } else if (ToType->isFloatingPoint() || isa<VectorType>(ToType)) {
+    // Just do a bitcast, we know the sizes match up.
+    FromVal = Builder.CreateBitCast(FromVal, ToType, "tmp");
+  } else {
+    // Otherwise must be a pointer.
+    FromVal = Builder.CreateIntToPtr(FromVal, ToType, "tmp");
+  }
+  assert(FromVal->getType() == ToType && "Didn't convert right?");
+  return FromVal;
+}
+
+
+/// ConvertScalar_InsertValue - Insert the value "SV" into the existing integer
+/// or vector value "Old" at the offset specified by Offset.
+///
+/// This happens when we are converting an "integer union" to a
+/// single integer scalar, or when we are converting a "vector union" to a
+/// vector with insert/extractelement instructions.
+///
+/// Offset is an offset from the original alloca, in bits that need to be
+/// shifted to the right.
+Value *SROA::ConvertScalar_InsertValue(Value *SV, Value *Old,
+                                       uint64_t Offset, IRBuilder<> &Builder) {
+
+  // Convert the stored type to the actual type, shift it left to insert
+  // then 'or' into place.
+  const Type *AllocaType = Old->getType();
+
+  if (const VectorType *VTy = dyn_cast<VectorType>(AllocaType)) {
+    uint64_t VecSize = TD->getTypeAllocSizeInBits(VTy);
+    uint64_t ValSize = TD->getTypeAllocSizeInBits(SV->getType());
+    
+    // Changing the whole vector with memset or with an access of a different
+    // vector type?
+    if (ValSize == VecSize)
+      return Builder.CreateBitCast(SV, AllocaType, "tmp");
+
+    uint64_t EltSize = TD->getTypeAllocSizeInBits(VTy->getElementType());
+
+    // Must be an element insertion.
+    unsigned Elt = Offset/EltSize;
+    
+    if (SV->getType() != VTy->getElementType())
+      SV = Builder.CreateBitCast(SV, VTy->getElementType(), "tmp");
+    
+    SV = Builder.CreateInsertElement(Old, SV, 
+                                     ConstantInt::get(Type::Int32Ty, Elt),
+                                     "tmp");
+    return SV;
+  }
+  
+  // If SV is a first-class aggregate value, insert each value recursively.
+  if (const StructType *ST = dyn_cast<StructType>(SV->getType())) {
+    const StructLayout &Layout = *TD->getStructLayout(ST);
+    for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
+      Value *Elt = Builder.CreateExtractValue(SV, i, "tmp");
+      Old = ConvertScalar_InsertValue(Elt, Old, 
+                                      Offset+Layout.getElementOffsetInBits(i),
+                                      Builder);
+    }
+    return Old;
+  }
+  
+  if (const ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) {
+    uint64_t EltSize = TD->getTypeAllocSizeInBits(AT->getElementType());
+    for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
+      Value *Elt = Builder.CreateExtractValue(SV, i, "tmp");
+      Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, Builder);
+    }
+    return Old;
+  }
+
+  // If SV is a float, convert it to the appropriate integer type.
+  // If it is a pointer, do the same.
+  unsigned SrcWidth = TD->getTypeSizeInBits(SV->getType());
+  unsigned DestWidth = TD->getTypeSizeInBits(AllocaType);
+  unsigned SrcStoreWidth = TD->getTypeStoreSizeInBits(SV->getType());
+  unsigned DestStoreWidth = TD->getTypeStoreSizeInBits(AllocaType);
+  if (SV->getType()->isFloatingPoint() || isa<VectorType>(SV->getType()))
+    SV = Builder.CreateBitCast(SV, IntegerType::get(SrcWidth), "tmp");
+  else if (isa<PointerType>(SV->getType()))
+    SV = Builder.CreatePtrToInt(SV, TD->getIntPtrType(), "tmp");
+
+  // Zero extend or truncate the value if needed.
+  if (SV->getType() != AllocaType) {
+    if (SV->getType()->getPrimitiveSizeInBits() <
+             AllocaType->getPrimitiveSizeInBits())
+      SV = Builder.CreateZExt(SV, AllocaType, "tmp");
+    else {
+      // Truncation may be needed if storing more than the alloca can hold
+      // (undefined behavior).
+      SV = Builder.CreateTrunc(SV, AllocaType, "tmp");
+      SrcWidth = DestWidth;
+      SrcStoreWidth = DestStoreWidth;
+    }
+  }
+
+  // If this is a big-endian system and the store is narrower than the
+  // full alloca type, we need to do a shift to get the right bits.
+  int ShAmt = 0;
+  if (TD->isBigEndian()) {
+    // On big-endian machines, the lowest bit is stored at the bit offset
+    // from the pointer given by getTypeStoreSizeInBits.  This matters for
+    // integers with a bitwidth that is not a multiple of 8.
+    ShAmt = DestStoreWidth - SrcStoreWidth - Offset;
+  } else {
+    ShAmt = Offset;
+  }
+
+  // Note: we support negative bitwidths (with shr) which are not defined.
+  // We do this to support (f.e.) stores off the end of a structure where
+  // only some bits in the structure are set.
+  APInt Mask(APInt::getLowBitsSet(DestWidth, SrcWidth));
+  if (ShAmt > 0 && (unsigned)ShAmt < DestWidth) {
+    SV = Builder.CreateShl(SV, ConstantInt::get(SV->getType(), ShAmt), "tmp");
+    Mask <<= ShAmt;
+  } else if (ShAmt < 0 && (unsigned)-ShAmt < DestWidth) {
+    SV = Builder.CreateLShr(SV, ConstantInt::get(SV->getType(), -ShAmt), "tmp");
+    Mask = Mask.lshr(-ShAmt);
+  }
+
+  // Mask out the bits we are about to insert from the old value, and or
+  // in the new bits.
+  if (SrcWidth != DestWidth) {
+    assert(DestWidth > SrcWidth);
+    Old = Builder.CreateAnd(Old, ConstantInt::get(~Mask), "mask");
+    SV = Builder.CreateOr(Old, SV, "ins");
+  }
+  return SV;
+}
+
+
+
+/// PointsToConstantGlobal - Return true if V (possibly indirectly) points to
+/// some part of a constant global variable.  This intentionally only accepts
+/// constant expressions because we don't can't rewrite arbitrary instructions.
+static bool PointsToConstantGlobal(Value *V) {
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+    return GV->isConstant();
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+    if (CE->getOpcode() == Instruction::BitCast || 
+        CE->getOpcode() == Instruction::GetElementPtr)
+      return PointsToConstantGlobal(CE->getOperand(0));
+  return false;
+}
+
+/// isOnlyCopiedFromConstantGlobal - Recursively walk the uses of a (derived)
+/// pointer to an alloca.  Ignore any reads of the pointer, return false if we
+/// see any stores or other unknown uses.  If we see pointer arithmetic, keep
+/// track of whether it moves the pointer (with isOffset) but otherwise traverse
+/// the uses.  If we see a memcpy/memmove that targets an unoffseted pointer to
+/// the alloca, and if the source pointer is a pointer to a constant  global, we
+/// can optimize this.
+static bool isOnlyCopiedFromConstantGlobal(Value *V, Instruction *&TheCopy,
+                                           bool isOffset) {
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) {
+    if (LoadInst *LI = dyn_cast<LoadInst>(*UI))
+      // Ignore non-volatile loads, they are always ok.
+      if (!LI->isVolatile())
+        continue;
+    
+    if (BitCastInst *BCI = dyn_cast<BitCastInst>(*UI)) {
+      // If uses of the bitcast are ok, we are ok.
+      if (!isOnlyCopiedFromConstantGlobal(BCI, TheCopy, isOffset))
+        return false;
+      continue;
+    }
+    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(*UI)) {
+      // If the GEP has all zero indices, it doesn't offset the pointer.  If it
+      // doesn't, it does.
+      if (!isOnlyCopiedFromConstantGlobal(GEP, TheCopy,
+                                         isOffset || !GEP->hasAllZeroIndices()))
+        return false;
+      continue;
+    }
+    
+    // If this is isn't our memcpy/memmove, reject it as something we can't
+    // handle.
+    if (!isa<MemTransferInst>(*UI))
+      return false;
+
+    // If we already have seen a copy, reject the second one.
+    if (TheCopy) return false;
+    
+    // If the pointer has been offset from the start of the alloca, we can't
+    // safely handle this.
+    if (isOffset) return false;
+
+    // If the memintrinsic isn't using the alloca as the dest, reject it.
+    if (UI.getOperandNo() != 1) return false;
+    
+    MemIntrinsic *MI = cast<MemIntrinsic>(*UI);
+    
+    // If the source of the memcpy/move is not a constant global, reject it.
+    if (!PointsToConstantGlobal(MI->getOperand(2)))
+      return false;
+    
+    // Otherwise, the transform is safe.  Remember the copy instruction.
+    TheCopy = MI;
+  }
+  return true;
+}
+
+/// isOnlyCopiedFromConstantGlobal - Return true if the specified alloca is only
+/// modified by a copy from a constant global.  If we can prove this, we can
+/// replace any uses of the alloca with uses of the global directly.
+Instruction *SROA::isOnlyCopiedFromConstantGlobal(AllocationInst *AI) {
+  Instruction *TheCopy = 0;
+  if (::isOnlyCopiedFromConstantGlobal(AI, TheCopy, false))
+    return TheCopy;
+  return 0;
+}
diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
new file mode 100644
index 0000000..b499279
--- /dev/null
+++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -0,0 +1,232 @@
+//===- SimplifyCFGPass.cpp - CFG Simplification Pass ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements dead code elimination and basic block merging, along
+// with a collection of other peephole control flow optimizations.  For example:
+//
+//   * Removes basic blocks with no predecessors.
+//   * Merges a basic block into its predecessor if there is only one and the
+//     predecessor only has one successor.
+//   * Eliminates PHI nodes for basic blocks with a single predecessor.
+//   * Eliminates a basic block that only contains an unconditional branch.
+//   * Changes invoke instructions to nounwind functions to be calls.
+//   * Change things like "if (x) if (y)" into "if (x&y)".
+//   * etc..
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "simplifycfg"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/Attributes.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Pass.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumSimpl, "Number of blocks simplified");
+
+namespace {
+  struct VISIBILITY_HIDDEN CFGSimplifyPass : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    CFGSimplifyPass() : FunctionPass(&ID) {}
+
+    virtual bool runOnFunction(Function &F);
+  };
+}
+
+char CFGSimplifyPass::ID = 0;
+static RegisterPass<CFGSimplifyPass> X("simplifycfg", "Simplify the CFG");
+
+// Public interface to the CFGSimplification pass
+FunctionPass *llvm::createCFGSimplificationPass() {
+  return new CFGSimplifyPass();
+}
+
+/// ChangeToUnreachable - Insert an unreachable instruction before the specified
+/// instruction, making it and the rest of the code in the block dead.
+static void ChangeToUnreachable(Instruction *I) {
+  BasicBlock *BB = I->getParent();
+  // Loop over all of the successors, removing BB's entry from any PHI
+  // nodes.
+  for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
+    (*SI)->removePredecessor(BB);
+  
+  new UnreachableInst(I);
+  
+  // All instructions after this are dead.
+  BasicBlock::iterator BBI = I, BBE = BB->end();
+  while (BBI != BBE) {
+    if (!BBI->use_empty())
+      BBI->replaceAllUsesWith(UndefValue::get(BBI->getType()));
+    BB->getInstList().erase(BBI++);
+  }
+}
+
+/// ChangeToCall - Convert the specified invoke into a normal call.
+static void ChangeToCall(InvokeInst *II) {
+  BasicBlock *BB = II->getParent();
+  SmallVector<Value*, 8> Args(II->op_begin()+3, II->op_end());
+  CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args.begin(),
+                                       Args.end(), "", II);
+  NewCall->takeName(II);
+  NewCall->setCallingConv(II->getCallingConv());
+  NewCall->setAttributes(II->getAttributes());
+  II->replaceAllUsesWith(NewCall);
+
+  // Follow the call by a branch to the normal destination.
+  BranchInst::Create(II->getNormalDest(), II);
+
+  // Update PHI nodes in the unwind destination
+  II->getUnwindDest()->removePredecessor(BB);
+  BB->getInstList().erase(II);
+}
+
+static bool MarkAliveBlocks(BasicBlock *BB,
+                            SmallPtrSet<BasicBlock*, 128> &Reachable) {
+  
+  SmallVector<BasicBlock*, 128> Worklist;
+  Worklist.push_back(BB);
+  bool Changed = false;
+  while (!Worklist.empty()) {
+    BB = Worklist.back();
+    Worklist.pop_back();
+    
+    if (!Reachable.insert(BB))
+      continue;
+
+    // Do a quick scan of the basic block, turning any obviously unreachable
+    // instructions into LLVM unreachable insts.  The instruction combining pass
+    // canonicalizes unreachable insts into stores to null or undef.
+    for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;++BBI){
+      if (CallInst *CI = dyn_cast<CallInst>(BBI)) {
+        if (CI->doesNotReturn()) {
+          // If we found a call to a no-return function, insert an unreachable
+          // instruction after it.  Make sure there isn't *already* one there
+          // though.
+          ++BBI;
+          if (!isa<UnreachableInst>(BBI)) {
+            ChangeToUnreachable(BBI);
+            Changed = true;
+          }
+          break;
+        }
+      }
+      
+      if (StoreInst *SI = dyn_cast<StoreInst>(BBI))
+        if (isa<ConstantPointerNull>(SI->getOperand(1)) ||
+            isa<UndefValue>(SI->getOperand(1))) {
+          ChangeToUnreachable(SI);
+          Changed = true;
+          break;
+        }
+    }
+
+    // Turn invokes that call 'nounwind' functions into ordinary calls.
+    if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator()))
+      if (II->doesNotThrow()) {
+        ChangeToCall(II);
+        Changed = true;
+      }
+
+    Changed |= ConstantFoldTerminator(BB);
+    for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
+      Worklist.push_back(*SI);
+  }
+  return Changed;
+}
+
+/// RemoveUnreachableBlocksFromFn - Remove blocks that are not reachable, even 
+/// if they are in a dead cycle.  Return true if a change was made, false 
+/// otherwise.
+static bool RemoveUnreachableBlocksFromFn(Function &F) {
+  SmallPtrSet<BasicBlock*, 128> Reachable;
+  bool Changed = MarkAliveBlocks(F.begin(), Reachable);
+  
+  // If there are unreachable blocks in the CFG...
+  if (Reachable.size() == F.size())
+    return Changed;
+  
+  assert(Reachable.size() < F.size());
+  NumSimpl += F.size()-Reachable.size();
+  
+  // Loop over all of the basic blocks that are not reachable, dropping all of
+  // their internal references...
+  for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) {
+    if (Reachable.count(BB))
+      continue;
+    
+    for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
+      if (Reachable.count(*SI))
+        (*SI)->removePredecessor(BB);
+    BB->dropAllReferences();
+  }
+  
+  for (Function::iterator I = ++F.begin(); I != F.end();)
+    if (!Reachable.count(I))
+      I = F.getBasicBlockList().erase(I);
+    else
+      ++I;
+  
+  return true;
+}
+
+/// IterativeSimplifyCFG - Call SimplifyCFG on all the blocks in the function,
+/// iterating until no more changes are made.
+static bool IterativeSimplifyCFG(Function &F) {
+  bool Changed = false;
+  bool LocalChange = true;
+  while (LocalChange) {
+    LocalChange = false;
+    
+    // Loop over all of the basic blocks (except the first one) and remove them
+    // if they are unneeded...
+    //
+    for (Function::iterator BBIt = ++F.begin(); BBIt != F.end(); ) {
+      if (SimplifyCFG(BBIt++)) {
+        LocalChange = true;
+        ++NumSimpl;
+      }
+    }
+    Changed |= LocalChange;
+  }
+  return Changed;
+}
+
+// It is possible that we may require multiple passes over the code to fully
+// simplify the CFG.
+//
+bool CFGSimplifyPass::runOnFunction(Function &F) {
+  bool EverChanged = RemoveUnreachableBlocksFromFn(F);
+  EverChanged |= IterativeSimplifyCFG(F);
+  
+  // If neither pass changed anything, we're done.
+  if (!EverChanged) return false;
+
+  // IterativeSimplifyCFG can (rarely) make some loops dead.  If this happens,
+  // RemoveUnreachableBlocksFromFn is needed to nuke them, which means we should
+  // iterate between the two optimizations.  We structure the code like this to
+  // avoid reruning IterativeSimplifyCFG if the second pass of 
+  // RemoveUnreachableBlocksFromFn doesn't do anything.
+  if (!RemoveUnreachableBlocksFromFn(F))
+    return true;
+  
+  do {
+    EverChanged = IterativeSimplifyCFG(F);
+    EverChanged |= RemoveUnreachableBlocksFromFn(F);
+  } while (EverChanged);
+  
+  return true;
+}
diff --git a/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp b/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp
new file mode 100644
index 0000000..4aad17d
--- /dev/null
+++ b/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp
@@ -0,0 +1,159 @@
+//===- SimplifyHalfPowrLibCalls.cpp - Optimize specific half_powr calls ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simple pass that applies an experimental
+// transformation on calls to specific functions.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "simplify-libcalls-halfpowr"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Config/config.h"
+using namespace llvm;
+
+namespace {
+  /// This pass optimizes well half_powr function calls.
+  ///
+  class VISIBILITY_HIDDEN SimplifyHalfPowrLibCalls : public FunctionPass {
+    const TargetData *TD;
+  public:
+    static char ID; // Pass identification
+    SimplifyHalfPowrLibCalls() : FunctionPass(&ID) {}
+
+    bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<TargetData>();
+    }
+
+    Instruction *
+    InlineHalfPowrs(const std::vector<Instruction *> &HalfPowrs,
+                    Instruction *InsertPt);
+  };
+  char SimplifyHalfPowrLibCalls::ID = 0;
+} // end anonymous namespace.
+
+static RegisterPass<SimplifyHalfPowrLibCalls>
+X("simplify-libcalls-halfpowr", "Simplify half_powr library calls");
+
+// Public interface to the Simplify HalfPowr LibCalls pass.
+FunctionPass *llvm::createSimplifyHalfPowrLibCallsPass() {
+  return new SimplifyHalfPowrLibCalls(); 
+}
+
+/// InlineHalfPowrs - Inline a sequence of adjacent half_powr calls, rearranging
+/// their control flow to better facilitate subsequent optimization.
+Instruction *
+SimplifyHalfPowrLibCalls::InlineHalfPowrs(const std::vector<Instruction *> &HalfPowrs,
+                                        Instruction *InsertPt) {
+  std::vector<BasicBlock *> Bodies;
+  BasicBlock *NewBlock = 0;
+
+  for (unsigned i = 0, e = HalfPowrs.size(); i != e; ++i) {
+    CallInst *Call = cast<CallInst>(HalfPowrs[i]);
+    Function *Callee = Call->getCalledFunction();
+
+    // Minimally sanity-check the CFG of half_powr to ensure that it contains
+    // the the kind of code we expect.  If we're running this pass, we have
+    // reason to believe it will be what we expect.
+    Function::iterator I = Callee->begin();
+    BasicBlock *Prologue = I++;
+    if (I == Callee->end()) break;
+    BasicBlock *SubnormalHandling = I++;
+    if (I == Callee->end()) break;
+    BasicBlock *Body = I++;
+    if (I != Callee->end()) break;
+    if (SubnormalHandling->getSinglePredecessor() != Prologue)
+      break;
+    BranchInst *PBI = dyn_cast<BranchInst>(Prologue->getTerminator());
+    if (!PBI || !PBI->isConditional())
+      break;
+    BranchInst *SNBI = dyn_cast<BranchInst>(SubnormalHandling->getTerminator());
+    if (!SNBI || SNBI->isConditional())
+      break;
+    if (!isa<ReturnInst>(Body->getTerminator()))
+      break;
+
+    Instruction *NextInst = next(BasicBlock::iterator(Call));
+
+    // Inline the call, taking care of what code ends up where.
+    NewBlock = SplitBlock(NextInst->getParent(), NextInst, this);
+
+    bool B = InlineFunction(Call, 0, TD);
+    assert(B && "half_powr didn't inline?"); B=B;
+
+    BasicBlock *NewBody = NewBlock->getSinglePredecessor();
+    assert(NewBody);
+    Bodies.push_back(NewBody);
+  }
+
+  if (!NewBlock)
+    return InsertPt;
+
+  // Put the code for all the bodies into one block, to facilitate
+  // subsequent optimization.
+  (void)SplitEdge(NewBlock->getSinglePredecessor(), NewBlock, this);
+  for (unsigned i = 0, e = Bodies.size(); i != e; ++i) {
+    BasicBlock *Body = Bodies[i];
+    Instruction *FNP = Body->getFirstNonPHI();
+    // Splice the insts from body into NewBlock.
+    NewBlock->getInstList().splice(NewBlock->begin(), Body->getInstList(),
+                                   FNP, Body->getTerminator());
+  }
+
+  return NewBlock->begin();
+}
+
+/// runOnFunction - Top level algorithm.
+///
+bool SimplifyHalfPowrLibCalls::runOnFunction(Function &F) {
+  TD = &getAnalysis<TargetData>();
+  
+  bool Changed = false;
+  std::vector<Instruction *> HalfPowrs;
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+      // Look for calls.
+      bool IsHalfPowr = false;
+      if (CallInst *CI = dyn_cast<CallInst>(I)) {
+        // Look for direct calls and calls to non-external functions.
+        Function *Callee = CI->getCalledFunction();
+        if (Callee && Callee->hasExternalLinkage()) {
+          // Look for calls with well-known names.
+          const char *CalleeName = Callee->getNameStart();
+          if (strcmp(CalleeName, "__half_powrf4") == 0)
+            IsHalfPowr = true;
+        }
+      }
+      if (IsHalfPowr)
+        HalfPowrs.push_back(I);
+      // We're looking for sequences of up to three such calls, which we'll
+      // simplify as a group.
+      if ((!IsHalfPowr && !HalfPowrs.empty()) || HalfPowrs.size() == 3) {
+        I = InlineHalfPowrs(HalfPowrs, I);
+        E = I->getParent()->end();
+        HalfPowrs.clear();
+        Changed = true;
+      }
+    }
+    assert(HalfPowrs.empty() && "Block had no terminator!");
+  }
+
+  return Changed;
+}
diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
new file mode 100644
index 0000000..4b00640
--- /dev/null
+++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
@@ -0,0 +1,2429 @@
+//===- SimplifyLibCalls.cpp - Optimize specific well-known library calls --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simple pass that applies a variety of small
+// optimizations for calls to specific well-known function calls (e.g. runtime
+// library functions). For example, a call to the function "exit(3)" that
+// occurs within the main() function can be transformed into a simple "return 3"
+// instruction. Any optimization that takes this form (replace call to library
+// function with simpler code that provides the same result) belongs in this
+// file.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "simplify-libcalls"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/IRBuilder.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Config/config.h"
+using namespace llvm;
+
+STATISTIC(NumSimplified, "Number of library calls simplified");
+STATISTIC(NumAnnotated, "Number of attributes added to library functions");
+
+//===----------------------------------------------------------------------===//
+// Optimizer Base Class
+//===----------------------------------------------------------------------===//
+
+/// This class is the abstract base class for the set of optimizations that
+/// corresponds to one library call.
+namespace {
+class VISIBILITY_HIDDEN LibCallOptimization {
+protected:
+  Function *Caller;
+  const TargetData *TD;
+public:
+  LibCallOptimization() { }
+  virtual ~LibCallOptimization() {}
+
+  /// CallOptimizer - This pure virtual method is implemented by base classes to
+  /// do various optimizations.  If this returns null then no transformation was
+  /// performed.  If it returns CI, then it transformed the call and CI is to be
+  /// deleted.  If it returns something else, replace CI with the new value and
+  /// delete CI.
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) 
+    =0;
+  
+  Value *OptimizeCall(CallInst *CI, const TargetData &TD, IRBuilder<> &B) {
+    Caller = CI->getParent()->getParent();
+    this->TD = &TD;
+    return CallOptimizer(CI->getCalledFunction(), CI, B);
+  }
+
+  /// CastToCStr - Return V if it is an i8*, otherwise cast it to i8*.
+  Value *CastToCStr(Value *V, IRBuilder<> &B);
+
+  /// EmitStrLen - Emit a call to the strlen function to the builder, for the
+  /// specified pointer.  Ptr is required to be some pointer type, and the
+  /// return value has 'intptr_t' type.
+  Value *EmitStrLen(Value *Ptr, IRBuilder<> &B);
+  
+  /// EmitMemCpy - Emit a call to the memcpy function to the builder.  This
+  /// always expects that the size has type 'intptr_t' and Dst/Src are pointers.
+  Value *EmitMemCpy(Value *Dst, Value *Src, Value *Len, 
+                    unsigned Align, IRBuilder<> &B);
+  
+  /// EmitMemChr - Emit a call to the memchr function.  This assumes that Ptr is
+  /// a pointer, Val is an i32 value, and Len is an 'intptr_t' value.
+  Value *EmitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilder<> &B);
+
+  /// EmitMemCmp - Emit a call to the memcmp function.
+  Value *EmitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B);
+
+  /// EmitMemSet - Emit a call to the memset function
+  Value *EmitMemSet(Value *Dst, Value *Val, Value *Len, IRBuilder<> &B);
+
+  /// EmitUnaryFloatFnCall - Emit a call to the unary function named 'Name' (e.g.
+  /// 'floor').  This function is known to take a single of type matching 'Op'
+  /// and returns one value with the same type.  If 'Op' is a long double, 'l'
+  /// is added as the suffix of name, if 'Op' is a float, we add a 'f' suffix.
+  Value *EmitUnaryFloatFnCall(Value *Op, const char *Name, IRBuilder<> &B);
+  
+  /// EmitPutChar - Emit a call to the putchar function.  This assumes that Char
+  /// is an integer.
+  void EmitPutChar(Value *Char, IRBuilder<> &B);
+  
+  /// EmitPutS - Emit a call to the puts function.  This assumes that Str is
+  /// some pointer.
+  void EmitPutS(Value *Str, IRBuilder<> &B);
+    
+  /// EmitFPutC - Emit a call to the fputc function.  This assumes that Char is
+  /// an i32, and File is a pointer to FILE.
+  void EmitFPutC(Value *Char, Value *File, IRBuilder<> &B);
+  
+  /// EmitFPutS - Emit a call to the puts function.  Str is required to be a
+  /// pointer and File is a pointer to FILE.
+  void EmitFPutS(Value *Str, Value *File, IRBuilder<> &B);
+  
+  /// EmitFWrite - Emit a call to the fwrite function.  This assumes that Ptr is
+  /// a pointer, Size is an 'intptr_t', and File is a pointer to FILE.
+  void EmitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B);
+  
+};
+} // End anonymous namespace.
+
+/// CastToCStr - Return V if it is an i8*, otherwise cast it to i8*.
+Value *LibCallOptimization::CastToCStr(Value *V, IRBuilder<> &B) {
+  return B.CreateBitCast(V, PointerType::getUnqual(Type::Int8Ty), "cstr");
+}
+
+/// EmitStrLen - Emit a call to the strlen function to the builder, for the
+/// specified pointer.  This always returns an integer value of size intptr_t.
+Value *LibCallOptimization::EmitStrLen(Value *Ptr, IRBuilder<> &B) {
+  Module *M = Caller->getParent();
+  AttributeWithIndex AWI[2];
+  AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
+  AWI[1] = AttributeWithIndex::get(~0u, Attribute::ReadOnly |
+                                   Attribute::NoUnwind);
+
+  Constant *StrLen =M->getOrInsertFunction("strlen", AttrListPtr::get(AWI, 2),
+                                           TD->getIntPtrType(),
+                                           PointerType::getUnqual(Type::Int8Ty),
+                                           NULL);
+  return B.CreateCall(StrLen, CastToCStr(Ptr, B), "strlen");
+}
+
+/// EmitMemCpy - Emit a call to the memcpy function to the builder.  This always
+/// expects that the size has type 'intptr_t' and Dst/Src are pointers.
+Value *LibCallOptimization::EmitMemCpy(Value *Dst, Value *Src, Value *Len,
+                                       unsigned Align, IRBuilder<> &B) {
+  Module *M = Caller->getParent();
+  Intrinsic::ID IID = Intrinsic::memcpy;
+  const Type *Tys[1];
+  Tys[0] = Len->getType();
+  Value *MemCpy = Intrinsic::getDeclaration(M, IID, Tys, 1);
+  return B.CreateCall4(MemCpy, CastToCStr(Dst, B), CastToCStr(Src, B), Len,
+                       ConstantInt::get(Type::Int32Ty, Align));
+}
+
+/// EmitMemChr - Emit a call to the memchr function.  This assumes that Ptr is
+/// a pointer, Val is an i32 value, and Len is an 'intptr_t' value.
+Value *LibCallOptimization::EmitMemChr(Value *Ptr, Value *Val,
+                                       Value *Len, IRBuilder<> &B) {
+  Module *M = Caller->getParent();
+  AttributeWithIndex AWI;
+  AWI = AttributeWithIndex::get(~0u, Attribute::ReadOnly | Attribute::NoUnwind);
+
+  Value *MemChr = M->getOrInsertFunction("memchr", AttrListPtr::get(&AWI, 1),
+                                         PointerType::getUnqual(Type::Int8Ty),
+                                         PointerType::getUnqual(Type::Int8Ty),
+                                         Type::Int32Ty, TD->getIntPtrType(),
+                                         NULL);
+  return B.CreateCall3(MemChr, CastToCStr(Ptr, B), Val, Len, "memchr");
+}
+
+/// EmitMemCmp - Emit a call to the memcmp function.
+Value *LibCallOptimization::EmitMemCmp(Value *Ptr1, Value *Ptr2,
+                                       Value *Len, IRBuilder<> &B) {
+  Module *M = Caller->getParent();
+  AttributeWithIndex AWI[3];
+  AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
+  AWI[1] = AttributeWithIndex::get(2, Attribute::NoCapture);
+  AWI[2] = AttributeWithIndex::get(~0u, Attribute::ReadOnly |
+                                   Attribute::NoUnwind);
+
+  Value *MemCmp = M->getOrInsertFunction("memcmp", AttrListPtr::get(AWI, 3),
+                                         Type::Int32Ty,
+                                         PointerType::getUnqual(Type::Int8Ty),
+                                         PointerType::getUnqual(Type::Int8Ty),
+                                         TD->getIntPtrType(), NULL);
+  return B.CreateCall3(MemCmp, CastToCStr(Ptr1, B), CastToCStr(Ptr2, B),
+                       Len, "memcmp");
+}
+
+/// EmitMemSet - Emit a call to the memset function
+Value *LibCallOptimization::EmitMemSet(Value *Dst, Value *Val,
+                                       Value *Len, IRBuilder<> &B) {
+ Module *M = Caller->getParent();
+ Intrinsic::ID IID = Intrinsic::memset;
+ const Type *Tys[1];
+ Tys[0] = Len->getType();
+ Value *MemSet = Intrinsic::getDeclaration(M, IID, Tys, 1);
+ Value *Align = ConstantInt::get(Type::Int32Ty, 1);
+ return B.CreateCall4(MemSet, CastToCStr(Dst, B), Val, Len, Align);
+}
+
+/// EmitUnaryFloatFnCall - Emit a call to the unary function named 'Name' (e.g.
+/// 'floor').  This function is known to take a single of type matching 'Op' and
+/// returns one value with the same type.  If 'Op' is a long double, 'l' is
+/// added as the suffix of name, if 'Op' is a float, we add a 'f' suffix.
+Value *LibCallOptimization::EmitUnaryFloatFnCall(Value *Op, const char *Name,
+                                                 IRBuilder<> &B) {
+  char NameBuffer[20];
+  if (Op->getType() != Type::DoubleTy) {
+    // If we need to add a suffix, copy into NameBuffer.
+    unsigned NameLen = strlen(Name);
+    assert(NameLen < sizeof(NameBuffer)-2);
+    memcpy(NameBuffer, Name, NameLen);
+    if (Op->getType() == Type::FloatTy)
+      NameBuffer[NameLen] = 'f';  // floorf
+    else
+      NameBuffer[NameLen] = 'l';  // floorl
+    NameBuffer[NameLen+1] = 0;
+    Name = NameBuffer;
+  }
+  
+  Module *M = Caller->getParent();
+  Value *Callee = M->getOrInsertFunction(Name, Op->getType(), 
+                                         Op->getType(), NULL);
+  return B.CreateCall(Callee, Op, Name);
+}
+
+/// EmitPutChar - Emit a call to the putchar function.  This assumes that Char
+/// is an integer.
+void LibCallOptimization::EmitPutChar(Value *Char, IRBuilder<> &B) {
+  Module *M = Caller->getParent();
+  Value *F = M->getOrInsertFunction("putchar", Type::Int32Ty,
+                                    Type::Int32Ty, NULL);
+  B.CreateCall(F, B.CreateIntCast(Char, Type::Int32Ty, "chari"), "putchar");
+}
+
+/// EmitPutS - Emit a call to the puts function.  This assumes that Str is
+/// some pointer.
+void LibCallOptimization::EmitPutS(Value *Str, IRBuilder<> &B) {
+  Module *M = Caller->getParent();
+  AttributeWithIndex AWI[2];
+  AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
+  AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind);
+
+  Value *F = M->getOrInsertFunction("puts", AttrListPtr::get(AWI, 2),
+                                    Type::Int32Ty,
+                                    PointerType::getUnqual(Type::Int8Ty), NULL);
+  B.CreateCall(F, CastToCStr(Str, B), "puts");
+}
+
+/// EmitFPutC - Emit a call to the fputc function.  This assumes that Char is
+/// an integer and File is a pointer to FILE.
+void LibCallOptimization::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B) {
+  Module *M = Caller->getParent();
+  AttributeWithIndex AWI[2];
+  AWI[0] = AttributeWithIndex::get(2, Attribute::NoCapture);
+  AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind);
+  Constant *F;
+  if (isa<PointerType>(File->getType()))
+    F = M->getOrInsertFunction("fputc", AttrListPtr::get(AWI, 2), Type::Int32Ty,
+                               Type::Int32Ty, File->getType(), NULL);
+                                         
+  else
+    F = M->getOrInsertFunction("fputc", Type::Int32Ty, Type::Int32Ty,
+                               File->getType(), NULL);
+  Char = B.CreateIntCast(Char, Type::Int32Ty, "chari");
+  B.CreateCall2(F, Char, File, "fputc");
+}
+
+/// EmitFPutS - Emit a call to the puts function.  Str is required to be a
+/// pointer and File is a pointer to FILE.
+void LibCallOptimization::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B) {
+  Module *M = Caller->getParent();
+  AttributeWithIndex AWI[3];
+  AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
+  AWI[1] = AttributeWithIndex::get(2, Attribute::NoCapture);
+  AWI[2] = AttributeWithIndex::get(~0u, Attribute::NoUnwind);
+  Constant *F;
+  if (isa<PointerType>(File->getType()))
+    F = M->getOrInsertFunction("fputs", AttrListPtr::get(AWI, 3), Type::Int32Ty,
+                               PointerType::getUnqual(Type::Int8Ty),
+                               File->getType(), NULL);
+  else
+    F = M->getOrInsertFunction("fputs", Type::Int32Ty,
+                               PointerType::getUnqual(Type::Int8Ty),
+                               File->getType(), NULL);
+  B.CreateCall2(F, CastToCStr(Str, B), File, "fputs");
+}
+
+/// EmitFWrite - Emit a call to the fwrite function.  This assumes that Ptr is
+/// a pointer, Size is an 'intptr_t', and File is a pointer to FILE.
+void LibCallOptimization::EmitFWrite(Value *Ptr, Value *Size, Value *File,
+                                     IRBuilder<> &B) {
+  Module *M = Caller->getParent();
+  AttributeWithIndex AWI[3];
+  AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
+  AWI[1] = AttributeWithIndex::get(4, Attribute::NoCapture);
+  AWI[2] = AttributeWithIndex::get(~0u, Attribute::NoUnwind);
+  Constant *F;
+  if (isa<PointerType>(File->getType()))
+    F = M->getOrInsertFunction("fwrite", AttrListPtr::get(AWI, 3),
+                               TD->getIntPtrType(),
+                               PointerType::getUnqual(Type::Int8Ty),
+                               TD->getIntPtrType(), TD->getIntPtrType(),
+                               File->getType(), NULL);
+  else
+    F = M->getOrInsertFunction("fwrite", TD->getIntPtrType(),
+                               PointerType::getUnqual(Type::Int8Ty),
+                               TD->getIntPtrType(), TD->getIntPtrType(),
+                               File->getType(), NULL);
+  B.CreateCall4(F, CastToCStr(Ptr, B), Size, 
+                ConstantInt::get(TD->getIntPtrType(), 1), File);
+}
+
+//===----------------------------------------------------------------------===//
+// Helper Functions
+//===----------------------------------------------------------------------===//
+
+/// GetStringLengthH - If we can compute the length of the string pointed to by
+/// the specified pointer, return 'len+1'.  If we can't, return 0.
+static uint64_t GetStringLengthH(Value *V, SmallPtrSet<PHINode*, 32> &PHIs) {
+  // Look through noop bitcast instructions.
+  if (BitCastInst *BCI = dyn_cast<BitCastInst>(V))
+    return GetStringLengthH(BCI->getOperand(0), PHIs);
+  
+  // If this is a PHI node, there are two cases: either we have already seen it
+  // or we haven't.
+  if (PHINode *PN = dyn_cast<PHINode>(V)) {
+    if (!PHIs.insert(PN))
+      return ~0ULL;  // already in the set.
+    
+    // If it was new, see if all the input strings are the same length.
+    uint64_t LenSoFar = ~0ULL;
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+      uint64_t Len = GetStringLengthH(PN->getIncomingValue(i), PHIs);
+      if (Len == 0) return 0; // Unknown length -> unknown.
+      
+      if (Len == ~0ULL) continue;
+      
+      if (Len != LenSoFar && LenSoFar != ~0ULL)
+        return 0;    // Disagree -> unknown.
+      LenSoFar = Len;
+    }
+    
+    // Success, all agree.
+    return LenSoFar;
+  }
+  
+  // strlen(select(c,x,y)) -> strlen(x) ^ strlen(y)
+  if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
+    uint64_t Len1 = GetStringLengthH(SI->getTrueValue(), PHIs);
+    if (Len1 == 0) return 0;
+    uint64_t Len2 = GetStringLengthH(SI->getFalseValue(), PHIs);
+    if (Len2 == 0) return 0;
+    if (Len1 == ~0ULL) return Len2;
+    if (Len2 == ~0ULL) return Len1;
+    if (Len1 != Len2) return 0;
+    return Len1;
+  }
+  
+  // If the value is not a GEP instruction nor a constant expression with a
+  // GEP instruction, then return unknown.
+  User *GEP = 0;
+  if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(V)) {
+    GEP = GEPI;
+  } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
+    if (CE->getOpcode() != Instruction::GetElementPtr)
+      return 0;
+    GEP = CE;
+  } else {
+    return 0;
+  }
+  
+  // Make sure the GEP has exactly three arguments.
+  if (GEP->getNumOperands() != 3)
+    return 0;
+  
+  // Check to make sure that the first operand of the GEP is an integer and
+  // has value 0 so that we are sure we're indexing into the initializer.
+  if (ConstantInt *Idx = dyn_cast<ConstantInt>(GEP->getOperand(1))) {
+    if (!Idx->isZero())
+      return 0;
+  } else
+    return 0;
+  
+  // If the second index isn't a ConstantInt, then this is a variable index
+  // into the array.  If this occurs, we can't say anything meaningful about
+  // the string.
+  uint64_t StartIdx = 0;
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(2)))
+    StartIdx = CI->getZExtValue();
+  else
+    return 0;
+  
+  // The GEP instruction, constant or instruction, must reference a global
+  // variable that is a constant and is initialized. The referenced constant
+  // initializer is the array that we'll use for optimization.
+  GlobalVariable* GV = dyn_cast<GlobalVariable>(GEP->getOperand(0));
+  if (!GV || !GV->isConstant() || !GV->hasInitializer())
+    return 0;
+  Constant *GlobalInit = GV->getInitializer();
+  
+  // Handle the ConstantAggregateZero case, which is a degenerate case. The
+  // initializer is constant zero so the length of the string must be zero.
+  if (isa<ConstantAggregateZero>(GlobalInit))
+    return 1;  // Len = 0 offset by 1.
+  
+  // Must be a Constant Array
+  ConstantArray *Array = dyn_cast<ConstantArray>(GlobalInit);
+  if (!Array || Array->getType()->getElementType() != Type::Int8Ty)
+    return false;
+  
+  // Get the number of elements in the array
+  uint64_t NumElts = Array->getType()->getNumElements();
+  
+  // Traverse the constant array from StartIdx (derived above) which is
+  // the place the GEP refers to in the array.
+  for (unsigned i = StartIdx; i != NumElts; ++i) {
+    Constant *Elt = Array->getOperand(i);
+    ConstantInt *CI = dyn_cast<ConstantInt>(Elt);
+    if (!CI) // This array isn't suitable, non-int initializer.
+      return 0;
+    if (CI->isZero())
+      return i-StartIdx+1; // We found end of string, success!
+  }
+  
+  return 0; // The array isn't null terminated, conservatively return 'unknown'.
+}
+
+/// GetStringLength - If we can compute the length of the string pointed to by
+/// the specified pointer, return 'len+1'.  If we can't, return 0.
+static uint64_t GetStringLength(Value *V) {
+  if (!isa<PointerType>(V->getType())) return 0;
+  
+  SmallPtrSet<PHINode*, 32> PHIs;
+  uint64_t Len = GetStringLengthH(V, PHIs);
+  // If Len is ~0ULL, we had an infinite phi cycle: this is dead code, so return
+  // an empty string as a length.
+  return Len == ~0ULL ? 1 : Len;
+}
+
+/// IsOnlyUsedInZeroEqualityComparison - Return true if it only matters that the
+/// value is equal or not-equal to zero. 
+static bool IsOnlyUsedInZeroEqualityComparison(Value *V) {
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end();
+       UI != E; ++UI) {
+    if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI))
+      if (IC->isEquality())
+        if (Constant *C = dyn_cast<Constant>(IC->getOperand(1)))
+          if (C->isNullValue())
+            continue;
+    // Unknown instruction.
+    return false;
+  }
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous LibCall Optimizations
+//===----------------------------------------------------------------------===//
+
+namespace {
+//===---------------------------------------===//
+// 'exit' Optimizations
+
+/// ExitOpt - int main() { exit(4); } --> int main() { return 4; }
+struct VISIBILITY_HIDDEN ExitOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Verify we have a reasonable prototype for exit.
+    if (Callee->arg_size() == 0 || !CI->use_empty())
+      return 0;
+
+    // Verify the caller is main, and that the result type of main matches the
+    // argument type of exit.
+    if (!Caller->isName("main") || !Caller->hasExternalLinkage() ||
+        Caller->getReturnType() != CI->getOperand(1)->getType())
+      return 0;
+
+    TerminatorInst *OldTI = CI->getParent()->getTerminator();
+    
+    // Create the return after the call.
+    ReturnInst *RI = B.CreateRet(CI->getOperand(1));
+
+    // Drop all successor phi node entries.
+    for (unsigned i = 0, e = OldTI->getNumSuccessors(); i != e; ++i)
+      OldTI->getSuccessor(i)->removePredecessor(CI->getParent());
+    
+    // Erase all instructions from after our return instruction until the end of
+    // the block.
+    BasicBlock::iterator FirstDead = RI; ++FirstDead;
+    CI->getParent()->getInstList().erase(FirstDead, CI->getParent()->end());
+    return CI;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// String and Memory LibCall Optimizations
+//===----------------------------------------------------------------------===//
+
+//===---------------------------------------===//
+// 'strcat' Optimizations
+
+struct VISIBILITY_HIDDEN StrCatOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Verify the "strcat" function prototype.
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 ||
+        FT->getReturnType() != PointerType::getUnqual(Type::Int8Ty) ||
+        FT->getParamType(0) != FT->getReturnType() ||
+        FT->getParamType(1) != FT->getReturnType())
+      return 0;
+    
+    // Extract some information from the instruction
+    Value *Dst = CI->getOperand(1);
+    Value *Src = CI->getOperand(2);
+    
+    // See if we can get the length of the input string.
+    uint64_t Len = GetStringLength(Src);
+    if (Len == 0) return 0;
+    --Len;  // Unbias length.
+    
+    // Handle the simple, do-nothing case: strcat(x, "") -> x
+    if (Len == 0)
+      return Dst;
+    
+    EmitStrLenMemCpy(Src, Dst, Len, B);
+    return Dst;
+  }
+
+  void EmitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, IRBuilder<> &B) {
+    // We need to find the end of the destination string.  That's where the
+    // memory is to be moved to. We just generate a call to strlen.
+    Value *DstLen = EmitStrLen(Dst, B);
+    
+    // Now that we have the destination's length, we must index into the
+    // destination's pointer to get the actual memcpy destination (end of
+    // the string .. we're concatenating).
+    Value *CpyDst = B.CreateGEP(Dst, DstLen, "endptr");
+    
+    // We have enough information to now generate the memcpy call to do the
+    // concatenation for us.  Make a memcpy to copy the nul byte with align = 1.
+    EmitMemCpy(CpyDst, Src, ConstantInt::get(TD->getIntPtrType(), Len+1), 1, B);
+  }
+};
+
+//===---------------------------------------===//
+// 'strncat' Optimizations
+
+struct VISIBILITY_HIDDEN StrNCatOpt : public StrCatOpt {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Verify the "strncat" function prototype.
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 3 ||
+        FT->getReturnType() != PointerType::getUnqual(Type::Int8Ty) ||
+        FT->getParamType(0) != FT->getReturnType() ||
+        FT->getParamType(1) != FT->getReturnType() ||
+        !isa<IntegerType>(FT->getParamType(2)))
+      return 0;
+
+    // Extract some information from the instruction
+    Value *Dst = CI->getOperand(1);
+    Value *Src = CI->getOperand(2);
+    uint64_t Len;
+
+    // We don't do anything if length is not constant
+    if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getOperand(3)))
+      Len = LengthArg->getZExtValue();
+    else
+      return 0;
+
+    // See if we can get the length of the input string.
+    uint64_t SrcLen = GetStringLength(Src);
+    if (SrcLen == 0) return 0;
+    --SrcLen;  // Unbias length.
+
+    // Handle the simple, do-nothing cases:
+    // strncat(x, "", c) -> x
+    // strncat(x,  c, 0) -> x
+    if (SrcLen == 0 || Len == 0) return Dst;
+
+    // We don't optimize this case
+    if (Len < SrcLen) return 0;
+
+    // strncat(x, s, c) -> strcat(x, s)
+    // s is constant so the strcat can be optimized further
+    EmitStrLenMemCpy(Src, Dst, SrcLen, B);
+    return Dst;
+  }
+};
+
+//===---------------------------------------===//
+// 'strchr' Optimizations
+
+struct VISIBILITY_HIDDEN StrChrOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Verify the "strchr" function prototype.
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 ||
+        FT->getReturnType() != PointerType::getUnqual(Type::Int8Ty) ||
+        FT->getParamType(0) != FT->getReturnType())
+      return 0;
+    
+    Value *SrcStr = CI->getOperand(1);
+    
+    // If the second operand is non-constant, see if we can compute the length
+    // of the input string and turn this into memchr.
+    ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getOperand(2));
+    if (CharC == 0) {
+      uint64_t Len = GetStringLength(SrcStr);
+      if (Len == 0 || FT->getParamType(1) != Type::Int32Ty) // memchr needs i32.
+        return 0;
+      
+      return EmitMemChr(SrcStr, CI->getOperand(2), // include nul.
+                        ConstantInt::get(TD->getIntPtrType(), Len), B);
+    }
+
+    // Otherwise, the character is a constant, see if the first argument is
+    // a string literal.  If so, we can constant fold.
+    std::string Str;
+    if (!GetConstantStringInfo(SrcStr, Str))
+      return 0;
+    
+    // strchr can find the nul character.
+    Str += '\0';
+    char CharValue = CharC->getSExtValue();
+    
+    // Compute the offset.
+    uint64_t i = 0;
+    while (1) {
+      if (i == Str.size())    // Didn't find the char.  strchr returns null.
+        return Constant::getNullValue(CI->getType());
+      // Did we find our match?
+      if (Str[i] == CharValue)
+        break;
+      ++i;
+    }
+    
+    // strchr(s+n,c)  -> gep(s+n+i,c)
+    Value *Idx = ConstantInt::get(Type::Int64Ty, i);
+    return B.CreateGEP(SrcStr, Idx, "strchr");
+  }
+};
+
+//===---------------------------------------===//
+// 'strcmp' Optimizations
+
+struct VISIBILITY_HIDDEN StrCmpOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Verify the "strcmp" function prototype.
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 || FT->getReturnType() != Type::Int32Ty ||
+        FT->getParamType(0) != FT->getParamType(1) ||
+        FT->getParamType(0) != PointerType::getUnqual(Type::Int8Ty))
+      return 0;
+    
+    Value *Str1P = CI->getOperand(1), *Str2P = CI->getOperand(2);
+    if (Str1P == Str2P)      // strcmp(x,x)  -> 0
+      return ConstantInt::get(CI->getType(), 0);
+    
+    std::string Str1, Str2;
+    bool HasStr1 = GetConstantStringInfo(Str1P, Str1);
+    bool HasStr2 = GetConstantStringInfo(Str2P, Str2);
+    
+    if (HasStr1 && Str1.empty()) // strcmp("", x) -> *x
+      return B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), CI->getType());
+    
+    if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x
+      return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType());
+    
+    // strcmp(x, y)  -> cnst  (if both x and y are constant strings)
+    if (HasStr1 && HasStr2)
+      return ConstantInt::get(CI->getType(), strcmp(Str1.c_str(),Str2.c_str()));
+
+    // strcmp(P, "x") -> memcmp(P, "x", 2)
+    uint64_t Len1 = GetStringLength(Str1P);
+    uint64_t Len2 = GetStringLength(Str2P);
+    if (Len1 || Len2) {
+      // Choose the smallest Len excluding 0 which means 'unknown'.
+      if (!Len1 || (Len2 && Len2 < Len1))
+        Len1 = Len2;
+      return EmitMemCmp(Str1P, Str2P,
+                        ConstantInt::get(TD->getIntPtrType(), Len1), B);
+    }
+
+    return 0;
+  }
+};
+
+//===---------------------------------------===//
+// 'strncmp' Optimizations
+
+struct VISIBILITY_HIDDEN StrNCmpOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Verify the "strncmp" function prototype.
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 3 || FT->getReturnType() != Type::Int32Ty ||
+        FT->getParamType(0) != FT->getParamType(1) ||
+        FT->getParamType(0) != PointerType::getUnqual(Type::Int8Ty) ||
+        !isa<IntegerType>(FT->getParamType(2)))
+      return 0;
+    
+    Value *Str1P = CI->getOperand(1), *Str2P = CI->getOperand(2);
+    if (Str1P == Str2P)      // strncmp(x,x,n)  -> 0
+      return ConstantInt::get(CI->getType(), 0);
+    
+    // Get the length argument if it is constant.
+    uint64_t Length;
+    if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getOperand(3)))
+      Length = LengthArg->getZExtValue();
+    else
+      return 0;
+    
+    if (Length == 0) // strncmp(x,y,0)   -> 0
+      return ConstantInt::get(CI->getType(), 0);
+    
+    std::string Str1, Str2;
+    bool HasStr1 = GetConstantStringInfo(Str1P, Str1);
+    bool HasStr2 = GetConstantStringInfo(Str2P, Str2);
+    
+    if (HasStr1 && Str1.empty())  // strncmp("", x, n) -> *x
+      return B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), CI->getType());
+    
+    if (HasStr2 && Str2.empty())  // strncmp(x, "", n) -> *x
+      return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType());
+    
+    // strncmp(x, y)  -> cnst  (if both x and y are constant strings)
+    if (HasStr1 && HasStr2)
+      return ConstantInt::get(CI->getType(),
+                              strncmp(Str1.c_str(), Str2.c_str(), Length));
+    return 0;
+  }
+};
+
+
+//===---------------------------------------===//
+// 'strcpy' Optimizations
+
+struct VISIBILITY_HIDDEN StrCpyOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Verify the "strcpy" function prototype.
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) ||
+        FT->getParamType(0) != FT->getParamType(1) ||
+        FT->getParamType(0) != PointerType::getUnqual(Type::Int8Ty))
+      return 0;
+    
+    Value *Dst = CI->getOperand(1), *Src = CI->getOperand(2);
+    if (Dst == Src)      // strcpy(x,x)  -> x
+      return Src;
+    
+    // See if we can get the length of the input string.
+    uint64_t Len = GetStringLength(Src);
+    if (Len == 0) return 0;
+    
+    // We have enough information to now generate the memcpy call to do the
+    // concatenation for us.  Make a memcpy to copy the nul byte with align = 1.
+    EmitMemCpy(Dst, Src, ConstantInt::get(TD->getIntPtrType(), Len), 1, B);
+    return Dst;
+  }
+};
+
+//===---------------------------------------===//
+// 'strncpy' Optimizations
+
+struct VISIBILITY_HIDDEN StrNCpyOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+        FT->getParamType(0) != FT->getParamType(1) ||
+        FT->getParamType(0) != PointerType::getUnqual(Type::Int8Ty) ||
+        !isa<IntegerType>(FT->getParamType(2)))
+      return 0;
+
+    Value *Dst = CI->getOperand(1);
+    Value *Src = CI->getOperand(2);
+    Value *LenOp = CI->getOperand(3);
+
+    // See if we can get the length of the input string.
+    uint64_t SrcLen = GetStringLength(Src);
+    if (SrcLen == 0) return 0;
+    --SrcLen;
+
+    if (SrcLen == 0) {
+      // strncpy(x, "", y) -> memset(x, '\0', y, 1)
+      EmitMemSet(Dst, ConstantInt::get(Type::Int8Ty, '\0'), LenOp, B);
+      return Dst;
+    }
+
+    uint64_t Len;
+    if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(LenOp))
+      Len = LengthArg->getZExtValue();
+    else
+      return 0;
+
+    if (Len == 0) return Dst; // strncpy(x, y, 0) -> x
+
+    // Let strncpy handle the zero padding
+    if (Len > SrcLen+1) return 0;
+
+    // strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant]
+    EmitMemCpy(Dst, Src, ConstantInt::get(TD->getIntPtrType(), Len), 1, B);
+
+    return Dst;
+  }
+};
+
+//===---------------------------------------===//
+// 'strlen' Optimizations
+
+struct VISIBILITY_HIDDEN StrLenOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 1 ||
+        FT->getParamType(0) != PointerType::getUnqual(Type::Int8Ty) ||
+        !isa<IntegerType>(FT->getReturnType()))
+      return 0;
+    
+    Value *Src = CI->getOperand(1);
+
+    // Constant folding: strlen("xyz") -> 3
+    if (uint64_t Len = GetStringLength(Src))
+      return ConstantInt::get(CI->getType(), Len-1);
+
+    // Handle strlen(p) != 0.
+    if (!IsOnlyUsedInZeroEqualityComparison(CI)) return 0;
+
+    // strlen(x) != 0 --> *x != 0
+    // strlen(x) == 0 --> *x == 0
+    return B.CreateZExt(B.CreateLoad(Src, "strlenfirst"), CI->getType());
+  }
+};
+
+//===---------------------------------------===//
+// 'strto*' Optimizations
+
+struct VISIBILITY_HIDDEN StrToOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    const FunctionType *FT = Callee->getFunctionType();
+    if ((FT->getNumParams() != 2 && FT->getNumParams() != 3) ||
+        !isa<PointerType>(FT->getParamType(0)) ||
+        !isa<PointerType>(FT->getParamType(1)))
+      return 0;
+
+    Value *EndPtr = CI->getOperand(2);
+    if (isa<ConstantPointerNull>(EndPtr)) {
+      CI->setOnlyReadsMemory();
+      CI->addAttribute(1, Attribute::NoCapture);
+    }
+
+    return 0;
+  }
+};
+
+
+//===---------------------------------------===//
+// 'memcmp' Optimizations
+
+struct VISIBILITY_HIDDEN MemCmpOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 3 || !isa<PointerType>(FT->getParamType(0)) ||
+        !isa<PointerType>(FT->getParamType(1)) ||
+        FT->getReturnType() != Type::Int32Ty)
+      return 0;
+
+    Value *LHS = CI->getOperand(1), *RHS = CI->getOperand(2);
+
+    if (LHS == RHS)  // memcmp(s,s,x) -> 0
+      return Constant::getNullValue(CI->getType());
+
+    // Make sure we have a constant length.
+    ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getOperand(3));
+    if (!LenC) return 0;
+    uint64_t Len = LenC->getZExtValue();
+
+    if (Len == 0) // memcmp(s1,s2,0) -> 0
+      return Constant::getNullValue(CI->getType());
+
+    if (Len == 1) { // memcmp(S1,S2,1) -> *LHS - *RHS
+      Value *LHSV = B.CreateLoad(CastToCStr(LHS, B), "lhsv");
+      Value *RHSV = B.CreateLoad(CastToCStr(RHS, B), "rhsv");
+      return B.CreateSExt(B.CreateSub(LHSV, RHSV, "chardiff"), CI->getType());
+    }
+
+    // memcmp(S1,S2,2) != 0 -> (*(short*)LHS ^ *(short*)RHS)  != 0
+    // memcmp(S1,S2,4) != 0 -> (*(int*)LHS ^ *(int*)RHS)  != 0
+    if ((Len == 2 || Len == 4) && IsOnlyUsedInZeroEqualityComparison(CI)) {
+      const Type *PTy = PointerType::getUnqual(Len == 2 ?
+                                               Type::Int16Ty : Type::Int32Ty);
+      LHS = B.CreateBitCast(LHS, PTy, "tmp");
+      RHS = B.CreateBitCast(RHS, PTy, "tmp");
+      LoadInst *LHSV = B.CreateLoad(LHS, "lhsv");
+      LoadInst *RHSV = B.CreateLoad(RHS, "rhsv");
+      LHSV->setAlignment(1); RHSV->setAlignment(1);  // Unaligned loads.
+      return B.CreateZExt(B.CreateXor(LHSV, RHSV, "shortdiff"), CI->getType());
+    }
+
+    return 0;
+  }
+};
+
+//===---------------------------------------===//
+// 'memcpy' Optimizations
+
+struct VISIBILITY_HIDDEN MemCpyOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+        !isa<PointerType>(FT->getParamType(0)) ||
+        !isa<PointerType>(FT->getParamType(1)) ||
+        FT->getParamType(2) != TD->getIntPtrType())
+      return 0;
+
+    // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1)
+    EmitMemCpy(CI->getOperand(1), CI->getOperand(2), CI->getOperand(3), 1, B);
+    return CI->getOperand(1);
+  }
+};
+
+//===---------------------------------------===//
+// 'memmove' Optimizations
+
+struct VISIBILITY_HIDDEN MemMoveOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+        !isa<PointerType>(FT->getParamType(0)) ||
+        !isa<PointerType>(FT->getParamType(1)) ||
+        FT->getParamType(2) != TD->getIntPtrType())
+      return 0;
+
+    // memmove(x, y, n) -> llvm.memmove(x, y, n, 1)
+    Module *M = Caller->getParent();
+    Intrinsic::ID IID = Intrinsic::memmove;
+    const Type *Tys[1];
+    Tys[0] = TD->getIntPtrType();
+    Value *MemMove = Intrinsic::getDeclaration(M, IID, Tys, 1);
+    Value *Dst = CastToCStr(CI->getOperand(1), B);
+    Value *Src = CastToCStr(CI->getOperand(2), B);
+    Value *Size = CI->getOperand(3);
+    Value *Align = ConstantInt::get(Type::Int32Ty, 1);
+    B.CreateCall4(MemMove, Dst, Src, Size, Align);
+    return CI->getOperand(1);
+  }
+};
+
+//===---------------------------------------===//
+// 'memset' Optimizations
+
+struct VISIBILITY_HIDDEN MemSetOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+        !isa<PointerType>(FT->getParamType(0)) ||
+        FT->getParamType(1) != TD->getIntPtrType() ||
+        FT->getParamType(2) != TD->getIntPtrType())
+      return 0;
+
+    // memset(p, v, n) -> llvm.memset(p, v, n, 1)
+    Value *Val = B.CreateTrunc(CI->getOperand(2), Type::Int8Ty);
+    EmitMemSet(CI->getOperand(1), Val,  CI->getOperand(3), B);
+    return CI->getOperand(1);
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Math Library Optimizations
+//===----------------------------------------------------------------------===//
+
+//===---------------------------------------===//
+// 'pow*' Optimizations
+
+struct VISIBILITY_HIDDEN PowOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    const FunctionType *FT = Callee->getFunctionType();
+    // Just make sure this has 2 arguments of the same FP type, which match the
+    // result type.
+    if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) ||
+        FT->getParamType(0) != FT->getParamType(1) ||
+        !FT->getParamType(0)->isFloatingPoint())
+      return 0;
+    
+    Value *Op1 = CI->getOperand(1), *Op2 = CI->getOperand(2);
+    if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) {
+      if (Op1C->isExactlyValue(1.0))  // pow(1.0, x) -> 1.0
+        return Op1C;
+      if (Op1C->isExactlyValue(2.0))  // pow(2.0, x) -> exp2(x)
+        return EmitUnaryFloatFnCall(Op2, "exp2", B);
+    }
+    
+    ConstantFP *Op2C = dyn_cast<ConstantFP>(Op2);
+    if (Op2C == 0) return 0;
+    
+    if (Op2C->getValueAPF().isZero())  // pow(x, 0.0) -> 1.0
+      return ConstantFP::get(CI->getType(), 1.0);
+    
+    if (Op2C->isExactlyValue(0.5)) {
+      // FIXME: This is not safe for -0.0 and -inf.  This can only be done when
+      // 'unsafe' math optimizations are allowed.
+      // x    pow(x, 0.5)  sqrt(x)
+      // ---------------------------------------------
+      // -0.0    +0.0       -0.0
+      // -inf    +inf       NaN
+#if 0
+      // pow(x, 0.5) -> sqrt(x)
+      return B.CreateCall(get_sqrt(), Op1, "sqrt");
+#endif
+    }
+    
+    if (Op2C->isExactlyValue(1.0))  // pow(x, 1.0) -> x
+      return Op1;
+    if (Op2C->isExactlyValue(2.0))  // pow(x, 2.0) -> x*x
+      return B.CreateMul(Op1, Op1, "pow2");
+    if (Op2C->isExactlyValue(-1.0)) // pow(x, -1.0) -> 1.0/x
+      return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), Op1, "powrecip");
+    return 0;
+  }
+};
+
+//===---------------------------------------===//
+// 'exp2' Optimizations
+
+struct VISIBILITY_HIDDEN Exp2Opt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    const FunctionType *FT = Callee->getFunctionType();
+    // Just make sure this has 1 argument of FP type, which matches the
+    // result type.
+    if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
+        !FT->getParamType(0)->isFloatingPoint())
+      return 0;
+    
+    Value *Op = CI->getOperand(1);
+    // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x))  if sizeof(x) <= 32
+    // Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x))  if sizeof(x) < 32
+    Value *LdExpArg = 0;
+    if (SIToFPInst *OpC = dyn_cast<SIToFPInst>(Op)) {
+      if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() <= 32)
+        LdExpArg = B.CreateSExt(OpC->getOperand(0), Type::Int32Ty, "tmp");
+    } else if (UIToFPInst *OpC = dyn_cast<UIToFPInst>(Op)) {
+      if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() < 32)
+        LdExpArg = B.CreateZExt(OpC->getOperand(0), Type::Int32Ty, "tmp");
+    }
+    
+    if (LdExpArg) {
+      const char *Name;
+      if (Op->getType() == Type::FloatTy)
+        Name = "ldexpf";
+      else if (Op->getType() == Type::DoubleTy)
+        Name = "ldexp";
+      else
+        Name = "ldexpl";
+
+      Constant *One = ConstantFP::get(APFloat(1.0f));
+      if (Op->getType() != Type::FloatTy)
+        One = ConstantExpr::getFPExtend(One, Op->getType());
+
+      Module *M = Caller->getParent();
+      Value *Callee = M->getOrInsertFunction(Name, Op->getType(),
+                                             Op->getType(), Type::Int32Ty,NULL);
+      return B.CreateCall2(Callee, One, LdExpArg);
+    }
+    return 0;
+  }
+};
+    
+
+//===---------------------------------------===//
+// Double -> Float Shrinking Optimizations for Unary Functions like 'floor'
+
+struct VISIBILITY_HIDDEN UnaryDoubleFPOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 1 || FT->getReturnType() != Type::DoubleTy ||
+        FT->getParamType(0) != Type::DoubleTy)
+      return 0;
+    
+    // If this is something like 'floor((double)floatval)', convert to floorf.
+    FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getOperand(1));
+    if (Cast == 0 || Cast->getOperand(0)->getType() != Type::FloatTy)
+      return 0;
+
+    // floor((double)floatval) -> (double)floorf(floatval)
+    Value *V = Cast->getOperand(0);
+    V = EmitUnaryFloatFnCall(V, Callee->getNameStart(), B);
+    return B.CreateFPExt(V, Type::DoubleTy);
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Integer Optimizations
+//===----------------------------------------------------------------------===//
+
+//===---------------------------------------===//
+// 'ffs*' Optimizations
+
+struct VISIBILITY_HIDDEN FFSOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    const FunctionType *FT = Callee->getFunctionType();
+    // Just make sure this has 2 arguments of the same FP type, which match the
+    // result type.
+    if (FT->getNumParams() != 1 || FT->getReturnType() != Type::Int32Ty ||
+        !isa<IntegerType>(FT->getParamType(0)))
+      return 0;
+    
+    Value *Op = CI->getOperand(1);
+    
+    // Constant fold.
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+      if (CI->getValue() == 0)  // ffs(0) -> 0.
+        return Constant::getNullValue(CI->getType());
+      return ConstantInt::get(Type::Int32Ty, // ffs(c) -> cttz(c)+1
+                              CI->getValue().countTrailingZeros()+1);
+    }
+    
+    // ffs(x) -> x != 0 ? (i32)llvm.cttz(x)+1 : 0
+    const Type *ArgType = Op->getType();
+    Value *F = Intrinsic::getDeclaration(Callee->getParent(),
+                                         Intrinsic::cttz, &ArgType, 1);
+    Value *V = B.CreateCall(F, Op, "cttz");
+    V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1), "tmp");
+    V = B.CreateIntCast(V, Type::Int32Ty, false, "tmp");
+    
+    Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType), "tmp");
+    return B.CreateSelect(Cond, V, ConstantInt::get(Type::Int32Ty, 0));
+  }
+};
+
+//===---------------------------------------===//
+// 'isdigit' Optimizations
+
+struct VISIBILITY_HIDDEN IsDigitOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    const FunctionType *FT = Callee->getFunctionType();
+    // We require integer(i32)
+    if (FT->getNumParams() != 1 || !isa<IntegerType>(FT->getReturnType()) ||
+        FT->getParamType(0) != Type::Int32Ty)
+      return 0;
+    
+    // isdigit(c) -> (c-'0') <u 10
+    Value *Op = CI->getOperand(1);
+    Op = B.CreateSub(Op, ConstantInt::get(Type::Int32Ty, '0'), "isdigittmp");
+    Op = B.CreateICmpULT(Op, ConstantInt::get(Type::Int32Ty, 10), "isdigit");
+    return B.CreateZExt(Op, CI->getType());
+  }
+};
+
+//===---------------------------------------===//
+// 'isascii' Optimizations
+
+struct VISIBILITY_HIDDEN IsAsciiOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    const FunctionType *FT = Callee->getFunctionType();
+    // We require integer(i32)
+    if (FT->getNumParams() != 1 || !isa<IntegerType>(FT->getReturnType()) ||
+        FT->getParamType(0) != Type::Int32Ty)
+      return 0;
+    
+    // isascii(c) -> c <u 128
+    Value *Op = CI->getOperand(1);
+    Op = B.CreateICmpULT(Op, ConstantInt::get(Type::Int32Ty, 128), "isascii");
+    return B.CreateZExt(Op, CI->getType());
+  }
+};
+  
+//===---------------------------------------===//
+// 'abs', 'labs', 'llabs' Optimizations
+
+struct VISIBILITY_HIDDEN AbsOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    const FunctionType *FT = Callee->getFunctionType();
+    // We require integer(integer) where the types agree.
+    if (FT->getNumParams() != 1 || !isa<IntegerType>(FT->getReturnType()) ||
+        FT->getParamType(0) != FT->getReturnType())
+      return 0;
+    
+    // abs(x) -> x >s -1 ? x : -x
+    Value *Op = CI->getOperand(1);
+    Value *Pos = B.CreateICmpSGT(Op,ConstantInt::getAllOnesValue(Op->getType()),
+                                 "ispos");
+    Value *Neg = B.CreateNeg(Op, "neg");
+    return B.CreateSelect(Pos, Op, Neg);
+  }
+};
+  
+
+//===---------------------------------------===//
+// 'toascii' Optimizations
+
+struct VISIBILITY_HIDDEN ToAsciiOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    const FunctionType *FT = Callee->getFunctionType();
+    // We require i32(i32)
+    if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
+        FT->getParamType(0) != Type::Int32Ty)
+      return 0;
+    
+    // isascii(c) -> c & 0x7f
+    return B.CreateAnd(CI->getOperand(1), ConstantInt::get(CI->getType(),0x7F));
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Formatting and IO Optimizations
+//===----------------------------------------------------------------------===//
+
+//===---------------------------------------===//
+// 'printf' Optimizations
+
+struct VISIBILITY_HIDDEN PrintFOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Require one fixed pointer argument and an integer/void result.
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() < 1 || !isa<PointerType>(FT->getParamType(0)) ||
+        !(isa<IntegerType>(FT->getReturnType()) ||
+          FT->getReturnType() == Type::VoidTy))
+      return 0;
+    
+    // Check for a fixed format string.
+    std::string FormatStr;
+    if (!GetConstantStringInfo(CI->getOperand(1), FormatStr))
+      return 0;
+
+    // Empty format string -> noop.
+    if (FormatStr.empty())  // Tolerate printf's declared void.
+      return CI->use_empty() ? (Value*)CI : ConstantInt::get(CI->getType(), 0);
+    
+    // printf("x") -> putchar('x'), even for '%'.
+    if (FormatStr.size() == 1) {
+      EmitPutChar(ConstantInt::get(Type::Int32Ty, FormatStr[0]), B);
+      return CI->use_empty() ? (Value*)CI : ConstantInt::get(CI->getType(), 1);
+    }
+    
+    // printf("foo\n") --> puts("foo")
+    if (FormatStr[FormatStr.size()-1] == '\n' &&
+        FormatStr.find('%') == std::string::npos) {  // no format characters.
+      // Create a string literal with no \n on it.  We expect the constant merge
+      // pass to be run after this pass, to merge duplicate strings.
+      FormatStr.erase(FormatStr.end()-1);
+      Constant *C = ConstantArray::get(FormatStr, true);
+      C = new GlobalVariable(C->getType(), true,GlobalVariable::InternalLinkage,
+                             C, "str", Callee->getParent());
+      EmitPutS(C, B);
+      return CI->use_empty() ? (Value*)CI : 
+                          ConstantInt::get(CI->getType(), FormatStr.size()+1);
+    }
+    
+    // Optimize specific format strings.
+    // printf("%c", chr) --> putchar(*(i8*)dst)
+    if (FormatStr == "%c" && CI->getNumOperands() > 2 &&
+        isa<IntegerType>(CI->getOperand(2)->getType())) {
+      EmitPutChar(CI->getOperand(2), B);
+      return CI->use_empty() ? (Value*)CI : ConstantInt::get(CI->getType(), 1);
+    }
+    
+    // printf("%s\n", str) --> puts(str)
+    if (FormatStr == "%s\n" && CI->getNumOperands() > 2 &&
+        isa<PointerType>(CI->getOperand(2)->getType()) &&
+        CI->use_empty()) {
+      EmitPutS(CI->getOperand(2), B);
+      return CI;
+    }
+    return 0;
+  }
+};
+
+//===---------------------------------------===//
+// 'sprintf' Optimizations
+
+struct VISIBILITY_HIDDEN SPrintFOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Require two fixed pointer arguments and an integer result.
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 || !isa<PointerType>(FT->getParamType(0)) ||
+        !isa<PointerType>(FT->getParamType(1)) ||
+        !isa<IntegerType>(FT->getReturnType()))
+      return 0;
+
+    // Check for a fixed format string.
+    std::string FormatStr;
+    if (!GetConstantStringInfo(CI->getOperand(2), FormatStr))
+      return 0;
+    
+    // If we just have a format string (nothing else crazy) transform it.
+    if (CI->getNumOperands() == 3) {
+      // Make sure there's no % in the constant array.  We could try to handle
+      // %% -> % in the future if we cared.
+      for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
+        if (FormatStr[i] == '%')
+          return 0; // we found a format specifier, bail out.
+      
+      // sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1)
+      EmitMemCpy(CI->getOperand(1), CI->getOperand(2), // Copy the nul byte.
+                 ConstantInt::get(TD->getIntPtrType(), FormatStr.size()+1),1,B);
+      return ConstantInt::get(CI->getType(), FormatStr.size());
+    }
+    
+    // The remaining optimizations require the format string to be "%s" or "%c"
+    // and have an extra operand.
+    if (FormatStr.size() != 2 || FormatStr[0] != '%' || CI->getNumOperands() <4)
+      return 0;
+    
+    // Decode the second character of the format string.
+    if (FormatStr[1] == 'c') {
+      // sprintf(dst, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0
+      if (!isa<IntegerType>(CI->getOperand(3)->getType())) return 0;
+      Value *V = B.CreateTrunc(CI->getOperand(3), Type::Int8Ty, "char");
+      Value *Ptr = CastToCStr(CI->getOperand(1), B);
+      B.CreateStore(V, Ptr);
+      Ptr = B.CreateGEP(Ptr, ConstantInt::get(Type::Int32Ty, 1), "nul");
+      B.CreateStore(Constant::getNullValue(Type::Int8Ty), Ptr);
+      
+      return ConstantInt::get(CI->getType(), 1);
+    }
+    
+    if (FormatStr[1] == 's') {
+      // sprintf(dest, "%s", str) -> llvm.memcpy(dest, str, strlen(str)+1, 1)
+      if (!isa<PointerType>(CI->getOperand(3)->getType())) return 0;
+
+      Value *Len = EmitStrLen(CI->getOperand(3), B);
+      Value *IncLen = B.CreateAdd(Len, ConstantInt::get(Len->getType(), 1),
+                                  "leninc");
+      EmitMemCpy(CI->getOperand(1), CI->getOperand(3), IncLen, 1, B);
+      
+      // The sprintf result is the unincremented number of bytes in the string.
+      return B.CreateIntCast(Len, CI->getType(), false);
+    }
+    return 0;
+  }
+};
+
+//===---------------------------------------===//
+// 'fwrite' Optimizations
+
+struct VISIBILITY_HIDDEN FWriteOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Require a pointer, an integer, an integer, a pointer, returning integer.
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 4 || !isa<PointerType>(FT->getParamType(0)) ||
+        !isa<IntegerType>(FT->getParamType(1)) ||
+        !isa<IntegerType>(FT->getParamType(2)) ||
+        !isa<PointerType>(FT->getParamType(3)) ||
+        !isa<IntegerType>(FT->getReturnType()))
+      return 0;
+    
+    // Get the element size and count.
+    ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getOperand(2));
+    ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getOperand(3));
+    if (!SizeC || !CountC) return 0;
+    uint64_t Bytes = SizeC->getZExtValue()*CountC->getZExtValue();
+    
+    // If this is writing zero records, remove the call (it's a noop).
+    if (Bytes == 0)
+      return ConstantInt::get(CI->getType(), 0);
+    
+    // If this is writing one byte, turn it into fputc.
+    if (Bytes == 1) {  // fwrite(S,1,1,F) -> fputc(S[0],F)
+      Value *Char = B.CreateLoad(CastToCStr(CI->getOperand(1), B), "char");
+      EmitFPutC(Char, CI->getOperand(4), B);
+      return ConstantInt::get(CI->getType(), 1);
+    }
+
+    return 0;
+  }
+};
+
+//===---------------------------------------===//
+// 'fputs' Optimizations
+
+struct VISIBILITY_HIDDEN FPutsOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Require two pointers.  Also, we can't optimize if return value is used.
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 || !isa<PointerType>(FT->getParamType(0)) ||
+        !isa<PointerType>(FT->getParamType(1)) ||
+        !CI->use_empty())
+      return 0;
+    
+    // fputs(s,F) --> fwrite(s,1,strlen(s),F)
+    uint64_t Len = GetStringLength(CI->getOperand(1));
+    if (!Len) return 0;
+    EmitFWrite(CI->getOperand(1), ConstantInt::get(TD->getIntPtrType(), Len-1),
+               CI->getOperand(2), B);
+    return CI;  // Known to have no uses (see above).
+  }
+};
+
+//===---------------------------------------===//
+// 'fprintf' Optimizations
+
+struct VISIBILITY_HIDDEN FPrintFOpt : public LibCallOptimization {
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Require two fixed paramters as pointers and integer result.
+    const FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 || !isa<PointerType>(FT->getParamType(0)) ||
+        !isa<PointerType>(FT->getParamType(1)) ||
+        !isa<IntegerType>(FT->getReturnType()))
+      return 0;
+    
+    // All the optimizations depend on the format string.
+    std::string FormatStr;
+    if (!GetConstantStringInfo(CI->getOperand(2), FormatStr))
+      return 0;
+
+    // fprintf(F, "foo") --> fwrite("foo", 3, 1, F)
+    if (CI->getNumOperands() == 3) {
+      for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
+        if (FormatStr[i] == '%')  // Could handle %% -> % if we cared.
+          return 0; // We found a format specifier.
+      
+      EmitFWrite(CI->getOperand(2), ConstantInt::get(TD->getIntPtrType(),
+                                                     FormatStr.size()),
+                 CI->getOperand(1), B);
+      return ConstantInt::get(CI->getType(), FormatStr.size());
+    }
+    
+    // The remaining optimizations require the format string to be "%s" or "%c"
+    // and have an extra operand.
+    if (FormatStr.size() != 2 || FormatStr[0] != '%' || CI->getNumOperands() <4)
+      return 0;
+    
+    // Decode the second character of the format string.
+    if (FormatStr[1] == 'c') {
+      // fprintf(F, "%c", chr) --> *(i8*)dst = chr
+      if (!isa<IntegerType>(CI->getOperand(3)->getType())) return 0;
+      EmitFPutC(CI->getOperand(3), CI->getOperand(1), B);
+      return ConstantInt::get(CI->getType(), 1);
+    }
+    
+    if (FormatStr[1] == 's') {
+      // fprintf(F, "%s", str) -> fputs(str, F)
+      if (!isa<PointerType>(CI->getOperand(3)->getType()) || !CI->use_empty())
+        return 0;
+      EmitFPutS(CI->getOperand(3), CI->getOperand(1), B);
+      return CI;
+    }
+    return 0;
+  }
+};
+
+} // end anonymous namespace.
+
+//===----------------------------------------------------------------------===//
+// SimplifyLibCalls Pass Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+  /// This pass optimizes well known library functions from libc and libm.
+  ///
+  class VISIBILITY_HIDDEN SimplifyLibCalls : public FunctionPass {
+    StringMap<LibCallOptimization*> Optimizations;
+    // Miscellaneous LibCall Optimizations
+    ExitOpt Exit; 
+    // String and Memory LibCall Optimizations
+    StrCatOpt StrCat; StrNCatOpt StrNCat; StrChrOpt StrChr; StrCmpOpt StrCmp;
+    StrNCmpOpt StrNCmp; StrCpyOpt StrCpy; StrNCpyOpt StrNCpy; StrLenOpt StrLen;
+    StrToOpt StrTo; MemCmpOpt MemCmp; MemCpyOpt MemCpy; MemMoveOpt MemMove;
+    MemSetOpt MemSet;
+    // Math Library Optimizations
+    PowOpt Pow; Exp2Opt Exp2; UnaryDoubleFPOpt UnaryDoubleFP;
+    // Integer Optimizations
+    FFSOpt FFS; AbsOpt Abs; IsDigitOpt IsDigit; IsAsciiOpt IsAscii;
+    ToAsciiOpt ToAscii;
+    // Formatting and IO Optimizations
+    SPrintFOpt SPrintF; PrintFOpt PrintF;
+    FWriteOpt FWrite; FPutsOpt FPuts; FPrintFOpt FPrintF;
+
+    bool Modified;  // This is only used by doInitialization.
+  public:
+    static char ID; // Pass identification
+    SimplifyLibCalls() : FunctionPass(&ID) {}
+
+    void InitOptimizations();
+    bool runOnFunction(Function &F);
+
+    void setDoesNotAccessMemory(Function &F);
+    void setOnlyReadsMemory(Function &F);
+    void setDoesNotThrow(Function &F);
+    void setDoesNotCapture(Function &F, unsigned n);
+    void setDoesNotAlias(Function &F, unsigned n);
+    bool doInitialization(Module &M);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<TargetData>();
+    }
+  };
+  char SimplifyLibCalls::ID = 0;
+} // end anonymous namespace.
+
+static RegisterPass<SimplifyLibCalls>
+X("simplify-libcalls", "Simplify well-known library calls");
+
+// Public interface to the Simplify LibCalls pass.
+FunctionPass *llvm::createSimplifyLibCallsPass() {
+  return new SimplifyLibCalls(); 
+}
+
+/// Optimizations - Populate the Optimizations map with all the optimizations
+/// we know.
+void SimplifyLibCalls::InitOptimizations() {
+  // Miscellaneous LibCall Optimizations
+  Optimizations["exit"] = &Exit;
+  
+  // String and Memory LibCall Optimizations
+  Optimizations["strcat"] = &StrCat;
+  Optimizations["strncat"] = &StrNCat;
+  Optimizations["strchr"] = &StrChr;
+  Optimizations["strcmp"] = &StrCmp;
+  Optimizations["strncmp"] = &StrNCmp;
+  Optimizations["strcpy"] = &StrCpy;
+  Optimizations["strncpy"] = &StrNCpy;
+  Optimizations["strlen"] = &StrLen;
+  Optimizations["strtol"] = &StrTo;
+  Optimizations["strtod"] = &StrTo;
+  Optimizations["strtof"] = &StrTo;
+  Optimizations["strtoul"] = &StrTo;
+  Optimizations["strtoll"] = &StrTo;
+  Optimizations["strtold"] = &StrTo;
+  Optimizations["strtoull"] = &StrTo;
+  Optimizations["memcmp"] = &MemCmp;
+  Optimizations["memcpy"] = &MemCpy;
+  Optimizations["memmove"] = &MemMove;
+  Optimizations["memset"] = &MemSet;
+  
+  // Math Library Optimizations
+  Optimizations["powf"] = &Pow;
+  Optimizations["pow"] = &Pow;
+  Optimizations["powl"] = &Pow;
+  Optimizations["llvm.pow.f32"] = &Pow;
+  Optimizations["llvm.pow.f64"] = &Pow;
+  Optimizations["llvm.pow.f80"] = &Pow;
+  Optimizations["llvm.pow.f128"] = &Pow;
+  Optimizations["llvm.pow.ppcf128"] = &Pow;
+  Optimizations["exp2l"] = &Exp2;
+  Optimizations["exp2"] = &Exp2;
+  Optimizations["exp2f"] = &Exp2;
+  Optimizations["llvm.exp2.ppcf128"] = &Exp2;
+  Optimizations["llvm.exp2.f128"] = &Exp2;
+  Optimizations["llvm.exp2.f80"] = &Exp2;
+  Optimizations["llvm.exp2.f64"] = &Exp2;
+  Optimizations["llvm.exp2.f32"] = &Exp2;
+  
+#ifdef HAVE_FLOORF
+  Optimizations["floor"] = &UnaryDoubleFP;
+#endif
+#ifdef HAVE_CEILF
+  Optimizations["ceil"] = &UnaryDoubleFP;
+#endif
+#ifdef HAVE_ROUNDF
+  Optimizations["round"] = &UnaryDoubleFP;
+#endif
+#ifdef HAVE_RINTF
+  Optimizations["rint"] = &UnaryDoubleFP;
+#endif
+#ifdef HAVE_NEARBYINTF
+  Optimizations["nearbyint"] = &UnaryDoubleFP;
+#endif
+  
+  // Integer Optimizations
+  Optimizations["ffs"] = &FFS;
+  Optimizations["ffsl"] = &FFS;
+  Optimizations["ffsll"] = &FFS;
+  Optimizations["abs"] = &Abs;
+  Optimizations["labs"] = &Abs;
+  Optimizations["llabs"] = &Abs;
+  Optimizations["isdigit"] = &IsDigit;
+  Optimizations["isascii"] = &IsAscii;
+  Optimizations["toascii"] = &ToAscii;
+  
+  // Formatting and IO Optimizations
+  Optimizations["sprintf"] = &SPrintF;
+  Optimizations["printf"] = &PrintF;
+  Optimizations["fwrite"] = &FWrite;
+  Optimizations["fputs"] = &FPuts;
+  Optimizations["fprintf"] = &FPrintF;
+}
+
+
+/// runOnFunction - Top level algorithm.
+///
+bool SimplifyLibCalls::runOnFunction(Function &F) {
+  if (Optimizations.empty())
+    InitOptimizations();
+  
+  const TargetData &TD = getAnalysis<TargetData>();
+  
+  IRBuilder<> Builder;
+
+  bool Changed = false;
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
+      // Ignore non-calls.
+      CallInst *CI = dyn_cast<CallInst>(I++);
+      if (!CI) continue;
+      
+      // Ignore indirect calls and calls to non-external functions.
+      Function *Callee = CI->getCalledFunction();
+      if (Callee == 0 || !Callee->isDeclaration() ||
+          !(Callee->hasExternalLinkage() || Callee->hasDLLImportLinkage()))
+        continue;
+      
+      // Ignore unknown calls.
+      const char *CalleeName = Callee->getNameStart();
+      StringMap<LibCallOptimization*>::iterator OMI =
+        Optimizations.find(CalleeName, CalleeName+Callee->getNameLen());
+      if (OMI == Optimizations.end()) continue;
+      
+      // Set the builder to the instruction after the call.
+      Builder.SetInsertPoint(BB, I);
+      
+      // Try to optimize this call.
+      Value *Result = OMI->second->OptimizeCall(CI, TD, Builder);
+      if (Result == 0) continue;
+
+      DEBUG(DOUT << "SimplifyLibCalls simplified: " << *CI;
+            DOUT << "  into: " << *Result << "\n");
+      
+      // Something changed!
+      Changed = true;
+      ++NumSimplified;
+      
+      // Inspect the instruction after the call (which was potentially just
+      // added) next.
+      I = CI; ++I;
+      
+      if (CI != Result && !CI->use_empty()) {
+        CI->replaceAllUsesWith(Result);
+        if (!Result->hasName())
+          Result->takeName(CI);
+      }
+      CI->eraseFromParent();
+    }
+  }
+  return Changed;
+}
+
+// Utility methods for doInitialization.
+
+void SimplifyLibCalls::setDoesNotAccessMemory(Function &F) {
+  if (!F.doesNotAccessMemory()) {
+    F.setDoesNotAccessMemory();
+    ++NumAnnotated;
+    Modified = true;
+  }
+}
+void SimplifyLibCalls::setOnlyReadsMemory(Function &F) {
+  if (!F.onlyReadsMemory()) {
+    F.setOnlyReadsMemory();
+    ++NumAnnotated;
+    Modified = true;
+  }
+}
+void SimplifyLibCalls::setDoesNotThrow(Function &F) {
+  if (!F.doesNotThrow()) {
+    F.setDoesNotThrow();
+    ++NumAnnotated;
+    Modified = true;
+  }
+}
+void SimplifyLibCalls::setDoesNotCapture(Function &F, unsigned n) {
+  if (!F.doesNotCapture(n)) {
+    F.setDoesNotCapture(n);
+    ++NumAnnotated;
+    Modified = true;
+  }
+}
+void SimplifyLibCalls::setDoesNotAlias(Function &F, unsigned n) {
+  if (!F.doesNotAlias(n)) {
+    F.setDoesNotAlias(n);
+    ++NumAnnotated;
+    Modified = true;
+  }
+}
+
+/// doInitialization - Add attributes to well-known functions.
+///
+bool SimplifyLibCalls::doInitialization(Module &M) {
+  Modified = false;
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+    Function &F = *I;
+    if (!F.isDeclaration())
+      continue;
+
+    unsigned NameLen = F.getNameLen();
+    if (!NameLen)
+      continue;
+
+    const FunctionType *FTy = F.getFunctionType();
+
+    const char *NameStr = F.getNameStart();
+    switch (NameStr[0]) {
+      case 's':
+        if (NameLen == 6 && !strcmp(NameStr, "strlen")) {
+          if (FTy->getNumParams() != 1 ||
+              !isa<PointerType>(FTy->getParamType(0)))
+            continue;
+          setOnlyReadsMemory(F);
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+        } else if ((NameLen == 6 && !strcmp(NameStr, "strcpy")) ||
+                   (NameLen == 6 && !strcmp(NameStr, "stpcpy")) ||
+                   (NameLen == 6 && !strcmp(NameStr, "strcat")) ||
+                   (NameLen == 6 && !strcmp(NameStr, "strtol")) ||
+                   (NameLen == 6 && !strcmp(NameStr, "strtod")) ||
+                   (NameLen == 6 && !strcmp(NameStr, "strtof")) ||
+                   (NameLen == 7 && !strcmp(NameStr, "strtoul")) ||
+                   (NameLen == 7 && !strcmp(NameStr, "strtoll")) ||
+                   (NameLen == 7 && !strcmp(NameStr, "strtold")) ||
+                   (NameLen == 7 && !strcmp(NameStr, "strncat")) ||
+                   (NameLen == 7 && !strcmp(NameStr, "strncpy")) ||
+                   (NameLen == 8 && !strcmp(NameStr, "strtoull"))) {
+          if (FTy->getNumParams() < 2 ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 2);
+        } else if (NameLen == 7 && !strcmp(NameStr, "strxfrm")) {
+          if (FTy->getNumParams() != 3 ||
+              !isa<PointerType>(FTy->getParamType(0)) ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+          setDoesNotCapture(F, 2);
+        } else if ((NameLen == 6 && !strcmp(NameStr, "strcmp")) ||
+                   (NameLen == 6 && !strcmp(NameStr, "strspn")) ||
+                   (NameLen == 7 && !strcmp(NameStr, "strncmp")) ||
+                   (NameLen == 7 && !strcmp(NameStr, "strcspn")) ||
+                   (NameLen == 7 && !strcmp(NameStr, "strcoll")) ||
+                   (NameLen == 10 && !strcmp(NameStr, "strcasecmp")) ||
+                   (NameLen == 11 && !strcmp(NameStr, "strncasecmp"))) {
+          if (FTy->getNumParams() < 2 ||
+              !isa<PointerType>(FTy->getParamType(0)) ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          setOnlyReadsMemory(F);
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+          setDoesNotCapture(F, 2);
+        } else if ((NameLen == 6 && !strcmp(NameStr, "strstr")) ||
+                   (NameLen == 7 && !strcmp(NameStr, "strpbrk"))) {
+          if (FTy->getNumParams() != 2 ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          setOnlyReadsMemory(F);
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 2);
+        } else if ((NameLen == 6 && !strcmp(NameStr, "strtok")) ||
+                   (NameLen == 8 && !strcmp(NameStr, "strtok_r"))) {
+          if (FTy->getNumParams() < 2 ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 2);
+        } else if ((NameLen == 5 && !strcmp(NameStr, "scanf")) ||
+                   (NameLen == 6 && !strcmp(NameStr, "setbuf")) ||
+                   (NameLen == 7 && !strcmp(NameStr, "setvbuf"))) {
+          if (FTy->getNumParams() < 1 ||
+              !isa<PointerType>(FTy->getParamType(0)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+        } else if ((NameLen == 6 && !strcmp(NameStr, "strdup")) ||
+                   (NameLen == 7 && !strcmp(NameStr, "strndup"))) {
+          if (FTy->getNumParams() < 1 ||
+              !isa<PointerType>(FTy->getReturnType()) ||
+              !isa<PointerType>(FTy->getParamType(0)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotAlias(F, 0);
+          setDoesNotCapture(F, 1);
+        } else if ((NameLen == 4 && !strcmp(NameStr, "stat")) ||
+                   (NameLen == 6 && !strcmp(NameStr, "sscanf")) ||
+                   (NameLen == 7 && !strcmp(NameStr, "sprintf")) ||
+                   (NameLen == 7 && !strcmp(NameStr, "statvfs"))) {
+          if (FTy->getNumParams() < 2 ||
+              !isa<PointerType>(FTy->getParamType(0)) ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+          setDoesNotCapture(F, 2);
+        } else if (NameLen == 8 && !strcmp(NameStr, "snprintf")) {
+          if (FTy->getNumParams() != 3 ||
+              !isa<PointerType>(FTy->getParamType(0)) ||
+              !isa<PointerType>(FTy->getParamType(2)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+          setDoesNotCapture(F, 3);
+        } else if (NameLen == 9 && !strcmp(NameStr, "setitimer")) {
+          if (FTy->getNumParams() != 3 ||
+              !isa<PointerType>(FTy->getParamType(1)) ||
+              !isa<PointerType>(FTy->getParamType(2)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 2);
+          setDoesNotCapture(F, 3);
+        } else if (NameLen == 6 && !strcmp(NameStr, "system")) {
+          if (FTy->getNumParams() != 1 ||
+              !isa<PointerType>(FTy->getParamType(0)))
+            continue;
+          // May throw; "system" is a valid pthread cancellation point.
+          setDoesNotCapture(F, 1);
+        }
+        break;
+      case 'm':
+        if (NameLen == 6 && !strcmp(NameStr, "memcmp")) {
+          if (FTy->getNumParams() != 3 ||
+              !isa<PointerType>(FTy->getParamType(0)) ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          setOnlyReadsMemory(F);
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+          setDoesNotCapture(F, 2);
+        } else if ((NameLen == 6 && !strcmp(NameStr, "memchr")) ||
+                   (NameLen == 7 && !strcmp(NameStr, "memrchr"))) {
+          if (FTy->getNumParams() != 3)
+            continue;
+          setOnlyReadsMemory(F);
+          setDoesNotThrow(F);
+        } else if ((NameLen == 4 && !strcmp(NameStr, "modf")) ||
+                   (NameLen == 5 && !strcmp(NameStr, "modff")) ||
+                   (NameLen == 5 && !strcmp(NameStr, "modfl")) ||
+                   (NameLen == 6 && !strcmp(NameStr, "memcpy")) ||
+                   (NameLen == 7 && !strcmp(NameStr, "memccpy")) ||
+                   (NameLen == 7 && !strcmp(NameStr, "memmove"))) {
+          if (FTy->getNumParams() < 2 ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 2);
+        } else if (NameLen == 8 && !strcmp(NameStr, "memalign")) {
+          if (!isa<PointerType>(FTy->getReturnType()))
+            continue;
+          setDoesNotAlias(F, 0);
+        } else if ((NameLen == 5 && !strcmp(NameStr, "mkdir")) ||
+                   (NameLen == 6 && !strcmp(NameStr, "mktime"))) {
+          if (FTy->getNumParams() == 0 ||
+              !isa<PointerType>(FTy->getParamType(0)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+        }
+        break;
+      case 'r':
+        if (NameLen == 7 && !strcmp(NameStr, "realloc")) {
+          if (FTy->getNumParams() != 2 ||
+              !isa<PointerType>(FTy->getParamType(0)) ||
+              !isa<PointerType>(FTy->getReturnType()))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotAlias(F, 0);
+          setDoesNotCapture(F, 1);
+        } else if (NameLen == 4 && !strcmp(NameStr, "read")) {
+          if (FTy->getNumParams() != 3 ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          // May throw; "read" is a valid pthread cancellation point.
+          setDoesNotCapture(F, 2);
+        } else if ((NameLen == 5 && !strcmp(NameStr, "rmdir")) ||
+                   (NameLen == 6 && !strcmp(NameStr, "rewind")) ||
+                   (NameLen == 6 && !strcmp(NameStr, "remove")) ||
+                   (NameLen == 8 && !strcmp(NameStr, "realpath"))) {
+          if (FTy->getNumParams() < 1 ||
+              !isa<PointerType>(FTy->getParamType(0)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+        } else if ((NameLen == 6 && !strcmp(NameStr, "rename")) ||
+                   (NameLen == 8 && !strcmp(NameStr, "readlink"))) {
+          if (FTy->getNumParams() < 2 ||
+              !isa<PointerType>(FTy->getParamType(0)) ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+          setDoesNotCapture(F, 2);
+        }
+        break;
+      case 'w':
+        if (NameLen == 5 && !strcmp(NameStr, "write")) {
+          if (FTy->getNumParams() != 3 ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          // May throw; "write" is a valid pthread cancellation point.
+          setDoesNotCapture(F, 2);
+        }
+        break;
+      case 'b':
+        if (NameLen == 5 && !strcmp(NameStr, "bcopy")) {
+          if (FTy->getNumParams() != 3 ||
+              !isa<PointerType>(FTy->getParamType(0)) ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+          setDoesNotCapture(F, 2);
+        } else if (NameLen == 4 && !strcmp(NameStr, "bcmp")) {
+          if (FTy->getNumParams() != 3 ||
+              !isa<PointerType>(FTy->getParamType(0)) ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          setDoesNotThrow(F);
+          setOnlyReadsMemory(F);
+          setDoesNotCapture(F, 1);
+          setDoesNotCapture(F, 2);
+        } else if (NameLen == 5 && !strcmp(NameStr, "bzero")) {
+          if (FTy->getNumParams() != 2 ||
+              !isa<PointerType>(FTy->getParamType(0)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+        }
+        break;
+      case 'c':
+        if (NameLen == 6 && !strcmp(NameStr, "calloc")) {
+          if (FTy->getNumParams() != 2 ||
+              !isa<PointerType>(FTy->getReturnType()))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotAlias(F, 0);
+        } else if ((NameLen == 5 && !strcmp(NameStr, "chmod")) ||
+                   (NameLen == 5 && !strcmp(NameStr, "chown")) ||
+                   (NameLen == 7 && !strcmp(NameStr, "ctermid")) ||
+                   (NameLen == 8 && !strcmp(NameStr, "clearerr")) ||
+                   (NameLen == 8 && !strcmp(NameStr, "closedir"))) {
+          if (FTy->getNumParams() == 0 ||
+              !isa<PointerType>(FTy->getParamType(0)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+        }
+        break;
+      case 'a':
+        if ((NameLen == 4 && !strcmp(NameStr, "atoi")) ||
+            (NameLen == 4 && !strcmp(NameStr, "atol")) ||
+            (NameLen == 4 && !strcmp(NameStr, "atof")) ||
+            (NameLen == 5 && !strcmp(NameStr, "atoll"))) {
+          if (FTy->getNumParams() != 1 ||
+              !isa<PointerType>(FTy->getParamType(0)))
+            continue;
+          setDoesNotThrow(F);
+          setOnlyReadsMemory(F);
+          setDoesNotCapture(F, 1);
+        } else if (NameLen == 6 && !strcmp(NameStr, "access")) {
+          if (FTy->getNumParams() != 2 ||
+              !isa<PointerType>(FTy->getParamType(0)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+        }
+        break;
+      case 'f':
+        if (NameLen == 5 && !strcmp(NameStr, "fopen")) {
+          if (FTy->getNumParams() != 2 ||
+              !isa<PointerType>(FTy->getReturnType()) ||
+              !isa<PointerType>(FTy->getParamType(0)) ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotAlias(F, 0);
+          setDoesNotCapture(F, 1);
+          setDoesNotCapture(F, 2);
+        } else if (NameLen == 6 && !strcmp(NameStr, "fdopen")) {
+          if (FTy->getNumParams() != 2 ||
+              !isa<PointerType>(FTy->getReturnType()) ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotAlias(F, 0);
+          setDoesNotCapture(F, 2);
+        } else if ((NameLen == 4 && !strcmp(NameStr, "feof")) ||
+                   (NameLen == 4 && !strcmp(NameStr, "free")) ||
+                   (NameLen == 5 && !strcmp(NameStr, "fseek")) ||
+                   (NameLen == 5 && !strcmp(NameStr, "ftell")) ||
+                   (NameLen == 5 && !strcmp(NameStr, "fgetc")) ||
+                   (NameLen == 6 && !strcmp(NameStr, "fseeko")) ||
+                   (NameLen == 6 && !strcmp(NameStr, "ftello")) ||
+                   (NameLen == 6 && !strcmp(NameStr, "fileno")) ||
+                   (NameLen == 6 && !strcmp(NameStr, "fflush")) ||
+                   (NameLen == 6 && !strcmp(NameStr, "fclose")) ||
+                   (NameLen == 7 && !strcmp(NameStr, "fsetpos")) ||
+                   (NameLen == 9 && !strcmp(NameStr, "flockfile")) ||
+                   (NameLen == 11 && !strcmp(NameStr, "funlockfile")) ||
+                   (NameLen == 12 && !strcmp(NameStr, "ftrylockfile"))) {
+          if (FTy->getNumParams() == 0 ||
+              !isa<PointerType>(FTy->getParamType(0)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+        } else if (NameLen == 6 && !strcmp(NameStr, "ferror")) {
+          if (FTy->getNumParams() != 1 ||
+              !isa<PointerType>(FTy->getParamType(0)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+          setOnlyReadsMemory(F);
+        } else if ((NameLen == 5 && !strcmp(NameStr, "fputc")) ||
+                   (NameLen == 5 && !strcmp(NameStr, "fstat")) ||
+                   (NameLen == 5 && !strcmp(NameStr, "frexp")) ||
+                   (NameLen == 6 && !strcmp(NameStr, "frexpf")) ||
+                   (NameLen == 6 && !strcmp(NameStr, "frexpl")) ||
+                   (NameLen == 8 && !strcmp(NameStr, "fstatvfs"))) {
+          if (FTy->getNumParams() != 2 ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 2);
+        } else if (NameLen == 5 && !strcmp(NameStr, "fgets")) {
+          if (FTy->getNumParams() != 3 ||
+              !isa<PointerType>(FTy->getParamType(0)) ||
+              !isa<PointerType>(FTy->getParamType(2)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 3);
+        } else if ((NameLen == 5 && !strcmp(NameStr, "fread")) ||
+                   (NameLen == 6 && !strcmp(NameStr, "fwrite"))) {
+          if (FTy->getNumParams() != 4 ||
+              !isa<PointerType>(FTy->getParamType(0)) ||
+              !isa<PointerType>(FTy->getParamType(3)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+          setDoesNotCapture(F, 4);
+        } else if ((NameLen == 5 && !strcmp(NameStr, "fputs")) ||
+                   (NameLen == 6 && !strcmp(NameStr, "fscanf")) ||
+                   (NameLen == 7 && !strcmp(NameStr, "fprintf")) ||
+                   (NameLen == 7 && !strcmp(NameStr, "fgetpos"))) {
+          if (FTy->getNumParams() < 2 ||
+              !isa<PointerType>(FTy->getParamType(0)) ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+          setDoesNotCapture(F, 2);
+        }
+        break;
+      case 'g':
+        if ((NameLen == 4 && !strcmp(NameStr, "getc")) ||
+            (NameLen == 10 && !strcmp(NameStr, "getlogin_r")) ||
+            (NameLen == 13 && !strcmp(NameStr, "getc_unlocked"))) {
+          if (FTy->getNumParams() == 0 ||
+              !isa<PointerType>(FTy->getParamType(0)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+        } else if (NameLen == 6 && !strcmp(NameStr, "getenv")) {
+          if (FTy->getNumParams() != 1 ||
+              !isa<PointerType>(FTy->getParamType(0)))
+            continue;
+          setDoesNotThrow(F);
+          setOnlyReadsMemory(F);
+          setDoesNotCapture(F, 1);
+        } else if ((NameLen == 4 && !strcmp(NameStr, "gets")) ||
+                   (NameLen == 7 && !strcmp(NameStr, "getchar"))) {
+          setDoesNotThrow(F);
+        } else if (NameLen == 9 && !strcmp(NameStr, "getitimer")) {
+          if (FTy->getNumParams() != 2 ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 2);
+        } else if (NameLen == 8 && !strcmp(NameStr, "getpwnam")) {
+          if (FTy->getNumParams() != 1 ||
+              !isa<PointerType>(FTy->getParamType(0)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+        }
+        break;
+      case 'u':
+        if (NameLen == 6 && !strcmp(NameStr, "ungetc")) {
+          if (FTy->getNumParams() != 2 ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 2);
+        } else if ((NameLen == 5 && !strcmp(NameStr, "uname")) ||
+                   (NameLen == 6 && !strcmp(NameStr, "unlink")) ||
+                   (NameLen == 8 && !strcmp(NameStr, "unsetenv"))) {
+          if (FTy->getNumParams() != 1 ||
+              !isa<PointerType>(FTy->getParamType(0)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+        } else if ((NameLen == 5 && !strcmp(NameStr, "utime")) ||
+                   (NameLen == 6 && !strcmp(NameStr, "utimes"))) {
+          if (FTy->getNumParams() != 2 ||
+              !isa<PointerType>(FTy->getParamType(0)) ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+          setDoesNotCapture(F, 2);
+        }
+        break;
+      case 'p':
+        if (NameLen == 4 && !strcmp(NameStr, "putc")) {
+          if (FTy->getNumParams() != 2 ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 2);
+        } else if ((NameLen == 4 && !strcmp(NameStr, "puts")) ||
+                   (NameLen == 6 && !strcmp(NameStr, "printf")) ||
+                   (NameLen == 6 && !strcmp(NameStr, "perror"))) {
+          if (FTy->getNumParams() != 1 ||
+              !isa<PointerType>(FTy->getParamType(0)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+        } else if ((NameLen == 5 && !strcmp(NameStr, "pread")) ||
+                   (NameLen == 6 && !strcmp(NameStr, "pwrite"))) {
+          if (FTy->getNumParams() != 4 ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          // May throw; these are valid pthread cancellation points.
+          setDoesNotCapture(F, 2);
+        } else if (NameLen == 7 && !strcmp(NameStr, "putchar")) {
+          setDoesNotThrow(F);
+        } else if (NameLen == 5 && !strcmp(NameStr, "popen")) {
+          if (FTy->getNumParams() != 2 ||
+              !isa<PointerType>(FTy->getReturnType()) ||
+              !isa<PointerType>(FTy->getParamType(0)) ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotAlias(F, 0);
+          setDoesNotCapture(F, 1);
+          setDoesNotCapture(F, 2);
+        } else if (NameLen == 6 && !strcmp(NameStr, "pclose")) {
+          if (FTy->getNumParams() != 1 ||
+              !isa<PointerType>(FTy->getParamType(0)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+        }
+        break;
+      case 'v':
+        if (NameLen == 6 && !strcmp(NameStr, "vscanf")) {
+          if (FTy->getNumParams() != 2 ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+        } else if ((NameLen == 7 && !strcmp(NameStr, "vsscanf")) ||
+                   (NameLen == 7 && !strcmp(NameStr, "vfscanf"))) {
+          if (FTy->getNumParams() != 3 ||
+              !isa<PointerType>(FTy->getParamType(1)) ||
+              !isa<PointerType>(FTy->getParamType(2)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+          setDoesNotCapture(F, 2);
+        } else if (NameLen == 6 && !strcmp(NameStr, "valloc")) {
+          if (!isa<PointerType>(FTy->getReturnType()))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotAlias(F, 0);
+        } else if (NameLen == 7 && !strcmp(NameStr, "vprintf")) {
+          if (FTy->getNumParams() != 2 ||
+              !isa<PointerType>(FTy->getParamType(0)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+        } else if ((NameLen == 8 && !strcmp(NameStr, "vfprintf")) ||
+                   (NameLen == 8 && !strcmp(NameStr, "vsprintf"))) {
+          if (FTy->getNumParams() != 3 ||
+              !isa<PointerType>(FTy->getParamType(0)) ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+          setDoesNotCapture(F, 2);
+        } else if (NameLen == 9 && !strcmp(NameStr, "vsnprintf")) {
+          if (FTy->getNumParams() != 4 ||
+              !isa<PointerType>(FTy->getParamType(0)) ||
+              !isa<PointerType>(FTy->getParamType(2)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+          setDoesNotCapture(F, 3);
+        }
+        break;
+      case 'o':
+        if (NameLen == 4 && !strcmp(NameStr, "open")) {
+          if (FTy->getNumParams() < 2 ||
+              !isa<PointerType>(FTy->getParamType(0)))
+            continue;
+          // May throw; "open" is a valid pthread cancellation point.
+          setDoesNotCapture(F, 1);
+        } else if (NameLen == 7 && !strcmp(NameStr, "opendir")) {
+          if (FTy->getNumParams() != 1 ||
+              !isa<PointerType>(FTy->getReturnType()) ||
+              !isa<PointerType>(FTy->getParamType(0)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotAlias(F, 0);
+          setDoesNotCapture(F, 1);
+        }
+        break;
+      case 't':
+        if (NameLen == 7 && !strcmp(NameStr, "tmpfile")) {
+          if (!isa<PointerType>(FTy->getReturnType()))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotAlias(F, 0);
+        } else if (NameLen == 5 && !strcmp(NameStr, "times")) {
+          if (FTy->getNumParams() != 1 ||
+              !isa<PointerType>(FTy->getParamType(0)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+        }
+        break;
+      case 'h':
+        if ((NameLen == 5 && !strcmp(NameStr, "htonl")) ||
+            (NameLen == 5 && !strcmp(NameStr, "htons"))) {
+          setDoesNotThrow(F);
+          setDoesNotAccessMemory(F);
+        }
+        break;
+      case 'n':
+        if ((NameLen == 5 && !strcmp(NameStr, "ntohl")) ||
+            (NameLen == 5 && !strcmp(NameStr, "ntohs"))) {
+          setDoesNotThrow(F);
+          setDoesNotAccessMemory(F);
+        }
+        break;
+      case 'l':
+        if (NameLen == 5 && !strcmp(NameStr, "lstat")) {
+          if (FTy->getNumParams() != 2 ||
+              !isa<PointerType>(FTy->getParamType(0)) ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+          setDoesNotCapture(F, 2);
+        } else if (NameLen == 6 && !strcmp(NameStr, "lchown")) {
+          if (FTy->getNumParams() != 3 ||
+              !isa<PointerType>(FTy->getParamType(0)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+        }
+        break;
+      case 'q':
+        if (NameLen == 5 && !strcmp(NameStr, "qsort")) {
+          if (FTy->getNumParams() != 4 ||
+              !isa<PointerType>(FTy->getParamType(3)))
+            continue;
+          // May throw; places call through function pointer.
+          setDoesNotCapture(F, 4);
+        }
+        break;
+      case '_':
+        if ((NameLen == 8 && !strcmp(NameStr, "__strdup")) ||
+            (NameLen == 9 && !strcmp(NameStr, "__strndup"))) {
+          if (FTy->getNumParams() < 1 ||
+              !isa<PointerType>(FTy->getReturnType()) ||
+              !isa<PointerType>(FTy->getParamType(0)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotAlias(F, 0);
+          setDoesNotCapture(F, 1);
+        } else if (NameLen == 10 && !strcmp(NameStr, "__strtok_r")) {
+          if (FTy->getNumParams() != 3 ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 2);
+        } else if (NameLen == 8 && !strcmp(NameStr, "_IO_getc")) {
+          if (FTy->getNumParams() != 1 ||
+              !isa<PointerType>(FTy->getParamType(0)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+        } else if (NameLen == 8 && !strcmp(NameStr, "_IO_putc")) {
+          if (FTy->getNumParams() != 2 ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 2);
+        }
+        break;
+      case 1:
+        if (NameLen == 15 && !strcmp(NameStr, "\1__isoc99_scanf")) {
+          if (FTy->getNumParams() < 1 ||
+              !isa<PointerType>(FTy->getParamType(0)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+        } else if ((NameLen == 7 && !strcmp(NameStr, "\1stat64")) ||
+                   (NameLen == 8 && !strcmp(NameStr, "\1lstat64")) ||
+                   (NameLen == 10 && !strcmp(NameStr, "\1statvfs64")) ||
+                   (NameLen == 16 && !strcmp(NameStr, "\1__isoc99_sscanf"))) {
+          if (FTy->getNumParams() < 1 ||
+              !isa<PointerType>(FTy->getParamType(0)) ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+          setDoesNotCapture(F, 2);
+        } else if (NameLen == 8 && !strcmp(NameStr, "\1fopen64")) {
+          if (FTy->getNumParams() != 2 ||
+              !isa<PointerType>(FTy->getReturnType()) ||
+              !isa<PointerType>(FTy->getParamType(0)) ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotAlias(F, 0);
+          setDoesNotCapture(F, 1);
+          setDoesNotCapture(F, 2);
+        } else if ((NameLen == 9 && !strcmp(NameStr, "\1fseeko64")) ||
+                   (NameLen == 9 && !strcmp(NameStr, "\1ftello64"))) {
+          if (FTy->getNumParams() == 0 ||
+              !isa<PointerType>(FTy->getParamType(0)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 1);
+        } else if (NameLen == 10 && !strcmp(NameStr, "\1tmpfile64")) {
+          if (!isa<PointerType>(FTy->getReturnType()))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotAlias(F, 0);
+        } else if ((NameLen == 8 && !strcmp(NameStr, "\1fstat64")) ||
+                   (NameLen == 11 && !strcmp(NameStr, "\1fstatvfs64"))) {
+          if (FTy->getNumParams() != 2 ||
+              !isa<PointerType>(FTy->getParamType(1)))
+            continue;
+          setDoesNotThrow(F);
+          setDoesNotCapture(F, 2);
+        } else if (NameLen == 7 && !strcmp(NameStr, "\1open64")) {
+          if (FTy->getNumParams() < 2 ||
+              !isa<PointerType>(FTy->getParamType(0)))
+            continue;
+          // May throw; "open" is a valid pthread cancellation point.
+          setDoesNotCapture(F, 1);
+        }
+        break;
+    }
+  }
+  return Modified;
+}
+
+// TODO:
+//   Additional cases that we need to add to this file:
+//
+// cbrt:
+//   * cbrt(expN(X))  -> expN(x/3)
+//   * cbrt(sqrt(x))  -> pow(x,1/6)
+//   * cbrt(sqrt(x))  -> pow(x,1/9)
+//
+// cos, cosf, cosl:
+//   * cos(-x)  -> cos(x)
+//
+// exp, expf, expl:
+//   * exp(log(x))  -> x
+//
+// log, logf, logl:
+//   * log(exp(x))   -> x
+//   * log(x**y)     -> y*log(x)
+//   * log(exp(y))   -> y*log(e)
+//   * log(exp2(y))  -> y*log(2)
+//   * log(exp10(y)) -> y*log(10)
+//   * log(sqrt(x))  -> 0.5*log(x)
+//   * log(pow(x,y)) -> y*log(x)
+//
+// lround, lroundf, lroundl:
+//   * lround(cnst) -> cnst'
+//
+// memcmp:
+//   * memcmp(x,y,l)   -> cnst
+//      (if all arguments are constant and strlen(x) <= l and strlen(y) <= l)
+//
+// pow, powf, powl:
+//   * pow(exp(x),y)  -> exp(x*y)
+//   * pow(sqrt(x),y) -> pow(x,y*0.5)
+//   * pow(pow(x,y),z)-> pow(x,y*z)
+//
+// puts:
+//   * puts("") -> putchar("\n")
+//
+// round, roundf, roundl:
+//   * round(cnst) -> cnst'
+//
+// signbit:
+//   * signbit(cnst) -> cnst'
+//   * signbit(nncst) -> 0 (if pstv is a non-negative constant)
+//
+// sqrt, sqrtf, sqrtl:
+//   * sqrt(expN(x))  -> expN(x*0.5)
+//   * sqrt(Nroot(x)) -> pow(x,1/(2*N))
+//   * sqrt(pow(x,y)) -> pow(|x|,y*0.5)
+//
+// stpcpy:
+//   * stpcpy(str, "literal") ->
+//           llvm.memcpy(str,"literal",strlen("literal")+1,1)
+// strrchr:
+//   * strrchr(s,c) -> reverse_offset_of_in(c,s)
+//      (if c is a constant integer and s is a constant string)
+//   * strrchr(s1,0) -> strchr(s1,0)
+//
+// strpbrk:
+//   * strpbrk(s,a) -> offset_in_for(s,a)
+//      (if s and a are both constant strings)
+//   * strpbrk(s,"") -> 0
+//   * strpbrk(s,a) -> strchr(s,a[0]) (if a is constant string of length 1)
+//
+// strspn, strcspn:
+//   * strspn(s,a)   -> const_int (if both args are constant)
+//   * strspn("",a)  -> 0
+//   * strspn(s,"")  -> 0
+//   * strcspn(s,a)  -> const_int (if both args are constant)
+//   * strcspn("",a) -> 0
+//   * strcspn(s,"") -> strlen(a)
+//
+// strstr:
+//   * strstr(x,x)  -> x
+//   * strstr(s1,s2) -> offset_of_s2_in(s1)
+//       (if s1 and s2 are constant strings)
+//
+// tan, tanf, tanl:
+//   * tan(atan(x)) -> x
+//
+// trunc, truncf, truncl:
+//   * trunc(cnst) -> cnst'
+//
+//
diff --git a/lib/Transforms/Scalar/TailDuplication.cpp b/lib/Transforms/Scalar/TailDuplication.cpp
new file mode 100644
index 0000000..99a7dee
--- /dev/null
+++ b/lib/Transforms/Scalar/TailDuplication.cpp
@@ -0,0 +1,365 @@
+//===- TailDuplication.cpp - Simplify CFG through tail duplication --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs a limited form of tail duplication, intended to simplify
+// CFGs by removing some unconditional branches.  This pass is necessary to
+// straighten out loops created by the C front-end, but also is capable of
+// making other code nicer.  After this pass is run, the CFG simplify pass
+// should be run to clean up the mess.
+//
+// This pass could be enhanced in the future to use profile information to be
+// more aggressive.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "tailduplicate"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constant.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Type.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include <map>
+using namespace llvm;
+
+STATISTIC(NumEliminated, "Number of unconditional branches eliminated");
+
+static cl::opt<unsigned>
+TailDupThreshold("taildup-threshold",
+                 cl::desc("Max block size to tail duplicate"),
+                 cl::init(1), cl::Hidden);
+
+namespace {
+  class VISIBILITY_HIDDEN TailDup : public FunctionPass {
+    bool runOnFunction(Function &F);
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    TailDup() : FunctionPass(&ID) {}
+
+  private:
+    inline bool shouldEliminateUnconditionalBranch(TerminatorInst *, unsigned);
+    inline void eliminateUnconditionalBranch(BranchInst *BI);
+    SmallPtrSet<BasicBlock*, 4> CycleDetector;
+  };
+}
+
+char TailDup::ID = 0;
+static RegisterPass<TailDup> X("tailduplicate", "Tail Duplication");
+
+// Public interface to the Tail Duplication pass
+FunctionPass *llvm::createTailDuplicationPass() { return new TailDup(); }
+
+/// runOnFunction - Top level algorithm - Loop over each unconditional branch in
+/// the function, eliminating it if it looks attractive enough.  CycleDetector
+/// prevents infinite loops by checking that we aren't redirecting a branch to
+/// a place it already pointed to earlier; see PR 2323.
+bool TailDup::runOnFunction(Function &F) {
+  bool Changed = false;
+  CycleDetector.clear();
+  for (Function::iterator I = F.begin(), E = F.end(); I != E; ) {
+    if (shouldEliminateUnconditionalBranch(I->getTerminator(),
+                                           TailDupThreshold)) {
+      eliminateUnconditionalBranch(cast<BranchInst>(I->getTerminator()));
+      Changed = true;
+    } else {
+      ++I;
+      CycleDetector.clear();
+    }
+  }
+  return Changed;
+}
+
+/// shouldEliminateUnconditionalBranch - Return true if this branch looks
+/// attractive to eliminate.  We eliminate the branch if the destination basic
+/// block has <= 5 instructions in it, not counting PHI nodes.  In practice,
+/// since one of these is a terminator instruction, this means that we will add
+/// up to 4 instructions to the new block.
+///
+/// We don't count PHI nodes in the count since they will be removed when the
+/// contents of the block are copied over.
+///
+bool TailDup::shouldEliminateUnconditionalBranch(TerminatorInst *TI,
+                                                 unsigned Threshold) {
+  BranchInst *BI = dyn_cast<BranchInst>(TI);
+  if (!BI || !BI->isUnconditional()) return false;  // Not an uncond branch!
+
+  BasicBlock *Dest = BI->getSuccessor(0);
+  if (Dest == BI->getParent()) return false;        // Do not loop infinitely!
+
+  // Do not inline a block if we will just get another branch to the same block!
+  TerminatorInst *DTI = Dest->getTerminator();
+  if (BranchInst *DBI = dyn_cast<BranchInst>(DTI))
+    if (DBI->isUnconditional() && DBI->getSuccessor(0) == Dest)
+      return false;                                 // Do not loop infinitely!
+
+  // FIXME: DemoteRegToStack cannot yet demote invoke instructions to the stack,
+  // because doing so would require breaking critical edges.  This should be
+  // fixed eventually.
+  if (!DTI->use_empty())
+    return false;
+
+  // Do not bother with blocks with only a single predecessor: simplify
+  // CFG will fold these two blocks together!
+  pred_iterator PI = pred_begin(Dest), PE = pred_end(Dest);
+  ++PI;
+  if (PI == PE) return false;  // Exactly one predecessor!
+
+  BasicBlock::iterator I = Dest->getFirstNonPHI();
+
+  for (unsigned Size = 0; I != Dest->end(); ++I) {
+    if (Size == Threshold) return false;  // The block is too large.
+    
+    // Don't tail duplicate call instructions.  They are very large compared to
+    // other instructions.
+    if (isa<CallInst>(I) || isa<InvokeInst>(I)) return false;
+
+    // Allso alloca and malloc.
+    if (isa<AllocationInst>(I)) return false;
+
+    // Some vector instructions can expand into a number of instructions.
+    if (isa<ShuffleVectorInst>(I) || isa<ExtractElementInst>(I) ||
+        isa<InsertElementInst>(I)) return false;
+    
+    // Only count instructions that are not debugger intrinsics.
+    if (!isa<DbgInfoIntrinsic>(I)) ++Size;
+  }
+
+  // Do not tail duplicate a block that has thousands of successors into a block
+  // with a single successor if the block has many other predecessors.  This can
+  // cause an N^2 explosion in CFG edges (and PHI node entries), as seen in
+  // cases that have a large number of indirect gotos.
+  unsigned NumSuccs = DTI->getNumSuccessors();
+  if (NumSuccs > 8) {
+    unsigned TooMany = 128;
+    if (NumSuccs >= TooMany) return false;
+    TooMany = TooMany/NumSuccs;
+    for (; PI != PE; ++PI)
+      if (TooMany-- == 0) return false;
+  }
+  
+  // If this unconditional branch is a fall-through, be careful about
+  // tail duplicating it.  In particular, we don't want to taildup it if the
+  // original block will still be there after taildup is completed: doing so
+  // would eliminate the fall-through, requiring unconditional branches.
+  Function::iterator DestI = Dest;
+  if (&*--DestI == BI->getParent()) {
+    // The uncond branch is a fall-through.  Tail duplication of the block is
+    // will eliminate the fall-through-ness and end up cloning the terminator
+    // at the end of the Dest block.  Since the original Dest block will
+    // continue to exist, this means that one or the other will not be able to
+    // fall through.  One typical example that this helps with is code like:
+    // if (a)
+    //   foo();
+    // if (b)
+    //   foo();
+    // Cloning the 'if b' block into the end of the first foo block is messy.
+    
+    // The messy case is when the fall-through block falls through to other
+    // blocks.  This is what we would be preventing if we cloned the block.
+    DestI = Dest;
+    if (++DestI != Dest->getParent()->end()) {
+      BasicBlock *DestSucc = DestI;
+      // If any of Dest's successors are fall-throughs, don't do this xform.
+      for (succ_iterator SI = succ_begin(Dest), SE = succ_end(Dest);
+           SI != SE; ++SI)
+        if (*SI == DestSucc)
+          return false;
+    }
+  }
+
+  // Finally, check that we haven't redirected to this target block earlier;
+  // there are cases where we loop forever if we don't check this (PR 2323).
+  if (!CycleDetector.insert(Dest))
+    return false;
+
+  return true;
+}
+
+/// FindObviousSharedDomOf - We know there is a branch from SrcBlock to
+/// DestBlock, and that SrcBlock is not the only predecessor of DstBlock.  If we
+/// can find a predecessor of SrcBlock that is a dominator of both SrcBlock and
+/// DstBlock, return it.
+static BasicBlock *FindObviousSharedDomOf(BasicBlock *SrcBlock,
+                                          BasicBlock *DstBlock) {
+  // SrcBlock must have a single predecessor.
+  pred_iterator PI = pred_begin(SrcBlock), PE = pred_end(SrcBlock);
+  if (PI == PE || ++PI != PE) return 0;
+
+  BasicBlock *SrcPred = *pred_begin(SrcBlock);
+
+  // Look at the predecessors of DstBlock.  One of them will be SrcBlock.  If
+  // there is only one other pred, get it, otherwise we can't handle it.
+  PI = pred_begin(DstBlock); PE = pred_end(DstBlock);
+  BasicBlock *DstOtherPred = 0;
+  if (*PI == SrcBlock) {
+    if (++PI == PE) return 0;
+    DstOtherPred = *PI;
+    if (++PI != PE) return 0;
+  } else {
+    DstOtherPred = *PI;
+    if (++PI == PE || *PI != SrcBlock || ++PI != PE) return 0;
+  }
+
+  // We can handle two situations here: "if then" and "if then else" blocks.  An
+  // 'if then' situation is just where DstOtherPred == SrcPred.
+  if (DstOtherPred == SrcPred)
+    return SrcPred;
+
+  // Check to see if we have an "if then else" situation, which means that
+  // DstOtherPred will have a single predecessor and it will be SrcPred.
+  PI = pred_begin(DstOtherPred); PE = pred_end(DstOtherPred);
+  if (PI != PE && *PI == SrcPred) {
+    if (++PI != PE) return 0;  // Not a single pred.
+    return SrcPred;  // Otherwise, it's an "if then" situation.  Return the if.
+  }
+
+  // Otherwise, this is something we can't handle.
+  return 0;
+}
+
+
+/// eliminateUnconditionalBranch - Clone the instructions from the destination
+/// block into the source block, eliminating the specified unconditional branch.
+/// If the destination block defines values used by successors of the dest
+/// block, we may need to insert PHI nodes.
+///
+void TailDup::eliminateUnconditionalBranch(BranchInst *Branch) {
+  BasicBlock *SourceBlock = Branch->getParent();
+  BasicBlock *DestBlock = Branch->getSuccessor(0);
+  assert(SourceBlock != DestBlock && "Our predicate is broken!");
+
+  DOUT << "TailDuplication[" << SourceBlock->getParent()->getName()
+       << "]: Eliminating branch: " << *Branch;
+
+  // See if we can avoid duplicating code by moving it up to a dominator of both
+  // blocks.
+  if (BasicBlock *DomBlock = FindObviousSharedDomOf(SourceBlock, DestBlock)) {
+    DOUT << "Found shared dominator: " << DomBlock->getName() << "\n";
+
+    // If there are non-phi instructions in DestBlock that have no operands
+    // defined in DestBlock, and if the instruction has no side effects, we can
+    // move the instruction to DomBlock instead of duplicating it.
+    BasicBlock::iterator BBI = DestBlock->getFirstNonPHI();
+    while (!isa<TerminatorInst>(BBI)) {
+      Instruction *I = BBI++;
+
+      bool CanHoist = !I->isTrapping() && !I->mayHaveSideEffects();
+      if (CanHoist) {
+        for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op)
+          if (Instruction *OpI = dyn_cast<Instruction>(I->getOperand(op)))
+            if (OpI->getParent() == DestBlock ||
+                (isa<InvokeInst>(OpI) && OpI->getParent() == DomBlock)) {
+              CanHoist = false;
+              break;
+            }
+        if (CanHoist) {
+          // Remove from DestBlock, move right before the term in DomBlock.
+          DestBlock->getInstList().remove(I);
+          DomBlock->getInstList().insert(DomBlock->getTerminator(), I);
+          DOUT << "Hoisted: " << *I;
+        }
+      }
+    }
+  }
+
+  // Tail duplication can not update SSA properties correctly if the values
+  // defined in the duplicated tail are used outside of the tail itself.  For
+  // this reason, we spill all values that are used outside of the tail to the
+  // stack.
+  for (BasicBlock::iterator I = DestBlock->begin(); I != DestBlock->end(); ++I)
+    if (I->isUsedOutsideOfBlock(DestBlock)) {
+      // We found a use outside of the tail.  Create a new stack slot to
+      // break this inter-block usage pattern.
+      DemoteRegToStack(*I);
+    }
+
+  // We are going to have to map operands from the original block B to the new
+  // copy of the block B'.  If there are PHI nodes in the DestBlock, these PHI
+  // nodes also define part of this mapping.  Loop over these PHI nodes, adding
+  // them to our mapping.
+  //
+  std::map<Value*, Value*> ValueMapping;
+
+  BasicBlock::iterator BI = DestBlock->begin();
+  bool HadPHINodes = isa<PHINode>(BI);
+  for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+    ValueMapping[PN] = PN->getIncomingValueForBlock(SourceBlock);
+
+  // Clone the non-phi instructions of the dest block into the source block,
+  // keeping track of the mapping...
+  //
+  for (; BI != DestBlock->end(); ++BI) {
+    Instruction *New = BI->clone();
+    New->setName(BI->getName());
+    SourceBlock->getInstList().push_back(New);
+    ValueMapping[BI] = New;
+  }
+
+  // Now that we have built the mapping information and cloned all of the
+  // instructions (giving us a new terminator, among other things), walk the new
+  // instructions, rewriting references of old instructions to use new
+  // instructions.
+  //
+  BI = Branch; ++BI;  // Get an iterator to the first new instruction
+  for (; BI != SourceBlock->end(); ++BI)
+    for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i)
+      if (Value *Remapped = ValueMapping[BI->getOperand(i)])
+        BI->setOperand(i, Remapped);
+
+  // Next we check to see if any of the successors of DestBlock had PHI nodes.
+  // If so, we need to add entries to the PHI nodes for SourceBlock now.
+  for (succ_iterator SI = succ_begin(DestBlock), SE = succ_end(DestBlock);
+       SI != SE; ++SI) {
+    BasicBlock *Succ = *SI;
+    for (BasicBlock::iterator PNI = Succ->begin(); isa<PHINode>(PNI); ++PNI) {
+      PHINode *PN = cast<PHINode>(PNI);
+      // Ok, we have a PHI node.  Figure out what the incoming value was for the
+      // DestBlock.
+      Value *IV = PN->getIncomingValueForBlock(DestBlock);
+
+      // Remap the value if necessary...
+      if (Value *MappedIV = ValueMapping[IV])
+        IV = MappedIV;
+      PN->addIncoming(IV, SourceBlock);
+    }
+  }
+
+  // Next, remove the old branch instruction, and any PHI node entries that we
+  // had.
+  BI = Branch; ++BI;  // Get an iterator to the first new instruction
+  DestBlock->removePredecessor(SourceBlock); // Remove entries in PHI nodes...
+  SourceBlock->getInstList().erase(Branch);  // Destroy the uncond branch...
+
+  // Final step: now that we have finished everything up, walk the cloned
+  // instructions one last time, constant propagating and DCE'ing them, because
+  // they may not be needed anymore.
+  //
+  if (HadPHINodes) {
+    while (BI != SourceBlock->end()) {
+      Instruction *Inst = BI++;
+      if (isInstructionTriviallyDead(Inst))
+        Inst->eraseFromParent();
+      else if (Constant *C = ConstantFoldInstruction(Inst)) {
+        Inst->replaceAllUsesWith(C);
+        Inst->eraseFromParent();
+      }
+    }
+  }
+
+  ++NumEliminated;  // We just killed a branch!
+}
diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp
new file mode 100644
index 0000000..682d069
--- /dev/null
+++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -0,0 +1,479 @@
+//===- TailRecursionElimination.cpp - Eliminate Tail Calls ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file transforms calls of the current function (self recursion) followed
+// by a return instruction with a branch to the entry of the function, creating
+// a loop.  This pass also implements the following extensions to the basic
+// algorithm:
+//
+//  1. Trivial instructions between the call and return do not prevent the
+//     transformation from taking place, though currently the analysis cannot
+//     support moving any really useful instructions (only dead ones).
+//  2. This pass transforms functions that are prevented from being tail
+//     recursive by an associative expression to use an accumulator variable,
+//     thus compiling the typical naive factorial or 'fib' implementation into
+//     efficient code.
+//  3. TRE is performed if the function returns void, if the return
+//     returns the result returned by the call, or if the function returns a
+//     run-time constant on all exits from the function.  It is possible, though
+//     unlikely, that the return returns something else (like constant 0), and
+//     can still be TRE'd.  It can be TRE'd if ALL OTHER return instructions in
+//     the function return the exact same value.
+//  4. If it can prove that callees do not access theier caller stack frame,
+//     they are marked as eligible for tail call elimination (by the code
+//     generator).
+//
+// There are several improvements that could be made:
+//
+//  1. If the function has any alloca instructions, these instructions will be
+//     moved out of the entry block of the function, causing them to be
+//     evaluated each time through the tail recursion.  Safely keeping allocas
+//     in the entry block requires analysis to proves that the tail-called
+//     function does not read or write the stack object.
+//  2. Tail recursion is only performed if the call immediately preceeds the
+//     return instruction.  It's possible that there could be a jump between
+//     the call and the return.
+//  3. There can be intervening operations between the call and the return that
+//     prevent the TRE from occurring.  For example, there could be GEP's and
+//     stores to memory that will not be read or written by the call.  This
+//     requires some substantial analysis (such as with DSA) to prove safe to
+//     move ahead of the call, but doing so could allow many more TREs to be
+//     performed, for example in TreeAdd/TreeAlloc from the treeadd benchmark.
+//  4. The algorithm we use to detect if callees access their caller stack
+//     frames is very primitive.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "tailcallelim"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+using namespace llvm;
+
+STATISTIC(NumEliminated, "Number of tail calls removed");
+STATISTIC(NumAccumAdded, "Number of accumulators introduced");
+
+namespace {
+  struct VISIBILITY_HIDDEN TailCallElim : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    TailCallElim() : FunctionPass(&ID) {}
+
+    virtual bool runOnFunction(Function &F);
+
+  private:
+    bool ProcessReturningBlock(ReturnInst *RI, BasicBlock *&OldEntry,
+                               bool &TailCallsAreMarkedTail,
+                               std::vector<PHINode*> &ArgumentPHIs,
+                               bool CannotTailCallElimCallsMarkedTail);
+    bool CanMoveAboveCall(Instruction *I, CallInst *CI);
+    Value *CanTransformAccumulatorRecursion(Instruction *I, CallInst *CI);
+  };
+}
+
+char TailCallElim::ID = 0;
+static RegisterPass<TailCallElim> X("tailcallelim", "Tail Call Elimination");
+
+// Public interface to the TailCallElimination pass
+FunctionPass *llvm::createTailCallEliminationPass() {
+  return new TailCallElim();
+}
+
+
+/// AllocaMightEscapeToCalls - Return true if this alloca may be accessed by
+/// callees of this function.  We only do very simple analysis right now, this
+/// could be expanded in the future to use mod/ref information for particular
+/// call sites if desired.
+static bool AllocaMightEscapeToCalls(AllocaInst *AI) {
+  // FIXME: do simple 'address taken' analysis.
+  return true;
+}
+
+/// FunctionContainsAllocas - Scan the specified basic block for alloca
+/// instructions.  If it contains any that might be accessed by calls, return
+/// true.
+static bool CheckForEscapingAllocas(BasicBlock *BB,
+                                    bool &CannotTCETailMarkedCall) {
+  bool RetVal = false;
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+    if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
+      RetVal |= AllocaMightEscapeToCalls(AI);
+
+      // If this alloca is in the body of the function, or if it is a variable
+      // sized allocation, we cannot tail call eliminate calls marked 'tail'
+      // with this mechanism.
+      if (BB != &BB->getParent()->getEntryBlock() ||
+          !isa<ConstantInt>(AI->getArraySize()))
+        CannotTCETailMarkedCall = true;
+    }
+  return RetVal;
+}
+
+bool TailCallElim::runOnFunction(Function &F) {
+  // If this function is a varargs function, we won't be able to PHI the args
+  // right, so don't even try to convert it...
+  if (F.getFunctionType()->isVarArg()) return false;
+
+  BasicBlock *OldEntry = 0;
+  bool TailCallsAreMarkedTail = false;
+  std::vector<PHINode*> ArgumentPHIs;
+  bool MadeChange = false;
+
+  bool FunctionContainsEscapingAllocas = false;
+
+  // CannotTCETailMarkedCall - If true, we cannot perform TCE on tail calls
+  // marked with the 'tail' attribute, because doing so would cause the stack
+  // size to increase (real TCE would deallocate variable sized allocas, TCE
+  // doesn't).
+  bool CannotTCETailMarkedCall = false;
+
+  // Loop over the function, looking for any returning blocks, and keeping track
+  // of whether this function has any non-trivially used allocas.
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+    if (FunctionContainsEscapingAllocas && CannotTCETailMarkedCall)
+      break;
+
+    FunctionContainsEscapingAllocas |=
+      CheckForEscapingAllocas(BB, CannotTCETailMarkedCall);
+  }
+  
+  /// FIXME: The code generator produces really bad code when an 'escaping
+  /// alloca' is changed from being a static alloca to being a dynamic alloca.
+  /// Until this is resolved, disable this transformation if that would ever
+  /// happen.  This bug is PR962.
+  if (FunctionContainsEscapingAllocas)
+    return false;
+  
+
+  // Second pass, change any tail calls to loops.
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+    if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator()))
+      MadeChange |= ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
+                                          ArgumentPHIs,CannotTCETailMarkedCall);
+
+  // If we eliminated any tail recursions, it's possible that we inserted some
+  // silly PHI nodes which just merge an initial value (the incoming operand)
+  // with themselves.  Check to see if we did and clean up our mess if so.  This
+  // occurs when a function passes an argument straight through to its tail
+  // call.
+  if (!ArgumentPHIs.empty()) {
+    for (unsigned i = 0, e = ArgumentPHIs.size(); i != e; ++i) {
+      PHINode *PN = ArgumentPHIs[i];
+
+      // If the PHI Node is a dynamic constant, replace it with the value it is.
+      if (Value *PNV = PN->hasConstantValue()) {
+        PN->replaceAllUsesWith(PNV);
+        PN->eraseFromParent();
+      }
+    }
+  }
+
+  // Finally, if this function contains no non-escaping allocas, mark all calls
+  // in the function as eligible for tail calls (there is no stack memory for
+  // them to access).
+  if (!FunctionContainsEscapingAllocas)
+    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+        if (CallInst *CI = dyn_cast<CallInst>(I)) {
+          CI->setTailCall();
+          MadeChange = true;
+        }
+
+  return MadeChange;
+}
+
+
+/// CanMoveAboveCall - Return true if it is safe to move the specified
+/// instruction from after the call to before the call, assuming that all
+/// instructions between the call and this instruction are movable.
+///
+bool TailCallElim::CanMoveAboveCall(Instruction *I, CallInst *CI) {
+  // FIXME: We can move load/store/call/free instructions above the call if the
+  // call does not mod/ref the memory location being processed.
+  if (I->mayHaveSideEffects() || isa<LoadInst>(I))
+    return false;
+
+  // Otherwise, if this is a side-effect free instruction, check to make sure
+  // that it does not use the return value of the call.  If it doesn't use the
+  // return value of the call, it must only use things that are defined before
+  // the call, or movable instructions between the call and the instruction
+  // itself.
+  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+    if (I->getOperand(i) == CI)
+      return false;
+  return true;
+}
+
+// isDynamicConstant - Return true if the specified value is the same when the
+// return would exit as it was when the initial iteration of the recursive
+// function was executed.
+//
+// We currently handle static constants and arguments that are not modified as
+// part of the recursion.
+//
+static bool isDynamicConstant(Value *V, CallInst *CI) {
+  if (isa<Constant>(V)) return true; // Static constants are always dyn consts
+
+  // Check to see if this is an immutable argument, if so, the value
+  // will be available to initialize the accumulator.
+  if (Argument *Arg = dyn_cast<Argument>(V)) {
+    // Figure out which argument number this is...
+    unsigned ArgNo = 0;
+    Function *F = CI->getParent()->getParent();
+    for (Function::arg_iterator AI = F->arg_begin(); &*AI != Arg; ++AI)
+      ++ArgNo;
+
+    // If we are passing this argument into call as the corresponding
+    // argument operand, then the argument is dynamically constant.
+    // Otherwise, we cannot transform this function safely.
+    if (CI->getOperand(ArgNo+1) == Arg)
+      return true;
+  }
+  // Not a constant or immutable argument, we can't safely transform.
+  return false;
+}
+
+// getCommonReturnValue - Check to see if the function containing the specified
+// return instruction and tail call consistently returns the same
+// runtime-constant value at all exit points.  If so, return the returned value.
+//
+static Value *getCommonReturnValue(ReturnInst *TheRI, CallInst *CI) {
+  Function *F = TheRI->getParent()->getParent();
+  Value *ReturnedValue = 0;
+
+  // TODO: Handle multiple value ret instructions;
+  if (isa<StructType>(F->getReturnType()))
+      return 0;
+
+  for (Function::iterator BBI = F->begin(), E = F->end(); BBI != E; ++BBI)
+    if (ReturnInst *RI = dyn_cast<ReturnInst>(BBI->getTerminator()))
+      if (RI != TheRI) {
+        Value *RetOp = RI->getOperand(0);
+
+        // We can only perform this transformation if the value returned is
+        // evaluatable at the start of the initial invocation of the function,
+        // instead of at the end of the evaluation.
+        //
+        if (!isDynamicConstant(RetOp, CI))
+          return 0;
+
+        if (ReturnedValue && RetOp != ReturnedValue)
+          return 0;     // Cannot transform if differing values are returned.
+        ReturnedValue = RetOp;
+      }
+  return ReturnedValue;
+}
+
+/// CanTransformAccumulatorRecursion - If the specified instruction can be
+/// transformed using accumulator recursion elimination, return the constant
+/// which is the start of the accumulator value.  Otherwise return null.
+///
+Value *TailCallElim::CanTransformAccumulatorRecursion(Instruction *I,
+                                                      CallInst *CI) {
+  if (!I->isAssociative()) return 0;
+  assert(I->getNumOperands() == 2 &&
+         "Associative operations should have 2 args!");
+
+  // Exactly one operand should be the result of the call instruction...
+  if ((I->getOperand(0) == CI && I->getOperand(1) == CI) ||
+      (I->getOperand(0) != CI && I->getOperand(1) != CI))
+    return 0;
+
+  // The only user of this instruction we allow is a single return instruction.
+  if (!I->hasOneUse() || !isa<ReturnInst>(I->use_back()))
+    return 0;
+
+  // Ok, now we have to check all of the other return instructions in this
+  // function.  If they return non-constants or differing values, then we cannot
+  // transform the function safely.
+  return getCommonReturnValue(cast<ReturnInst>(I->use_back()), CI);
+}
+
+bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry,
+                                         bool &TailCallsAreMarkedTail,
+                                         std::vector<PHINode*> &ArgumentPHIs,
+                                       bool CannotTailCallElimCallsMarkedTail) {
+  BasicBlock *BB = Ret->getParent();
+  Function *F = BB->getParent();
+
+  if (&BB->front() == Ret) // Make sure there is something before the ret...
+    return false;
+  
+  // If the return is in the entry block, then making this transformation would
+  // turn infinite recursion into an infinite loop.  This transformation is ok
+  // in theory, but breaks some code like:
+  //   double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call
+  // disable this xform in this case, because the code generator will lower the
+  // call to fabs into inline code.
+  if (BB == &F->getEntryBlock())
+    return false;
+
+  // Scan backwards from the return, checking to see if there is a tail call in
+  // this block.  If so, set CI to it.
+  CallInst *CI;
+  BasicBlock::iterator BBI = Ret;
+  while (1) {
+    CI = dyn_cast<CallInst>(BBI);
+    if (CI && CI->getCalledFunction() == F)
+      break;
+
+    if (BBI == BB->begin())
+      return false;          // Didn't find a potential tail call.
+    --BBI;
+  }
+
+  // If this call is marked as a tail call, and if there are dynamic allocas in
+  // the function, we cannot perform this optimization.
+  if (CI->isTailCall() && CannotTailCallElimCallsMarkedTail)
+    return false;
+
+  // If we are introducing accumulator recursion to eliminate associative
+  // operations after the call instruction, this variable contains the initial
+  // value for the accumulator.  If this value is set, we actually perform
+  // accumulator recursion elimination instead of simple tail recursion
+  // elimination.
+  Value *AccumulatorRecursionEliminationInitVal = 0;
+  Instruction *AccumulatorRecursionInstr = 0;
+
+  // Ok, we found a potential tail call.  We can currently only transform the
+  // tail call if all of the instructions between the call and the return are
+  // movable to above the call itself, leaving the call next to the return.
+  // Check that this is the case now.
+  for (BBI = CI, ++BBI; &*BBI != Ret; ++BBI)
+    if (!CanMoveAboveCall(BBI, CI)) {
+      // If we can't move the instruction above the call, it might be because it
+      // is an associative operation that could be tranformed using accumulator
+      // recursion elimination.  Check to see if this is the case, and if so,
+      // remember the initial accumulator value for later.
+      if ((AccumulatorRecursionEliminationInitVal =
+                             CanTransformAccumulatorRecursion(BBI, CI))) {
+        // Yes, this is accumulator recursion.  Remember which instruction
+        // accumulates.
+        AccumulatorRecursionInstr = BBI;
+      } else {
+        return false;   // Otherwise, we cannot eliminate the tail recursion!
+      }
+    }
+
+  // We can only transform call/return pairs that either ignore the return value
+  // of the call and return void, ignore the value of the call and return a
+  // constant, return the value returned by the tail call, or that are being
+  // accumulator recursion variable eliminated.
+  if (Ret->getNumOperands() == 1 && Ret->getReturnValue() != CI &&
+      !isa<UndefValue>(Ret->getReturnValue()) &&
+      AccumulatorRecursionEliminationInitVal == 0 &&
+      !getCommonReturnValue(Ret, CI))
+    return false;
+
+  // OK! We can transform this tail call.  If this is the first one found,
+  // create the new entry block, allowing us to branch back to the old entry.
+  if (OldEntry == 0) {
+    OldEntry = &F->getEntryBlock();
+    BasicBlock *NewEntry = BasicBlock::Create("", F, OldEntry);
+    NewEntry->takeName(OldEntry);
+    OldEntry->setName("tailrecurse");
+    BranchInst::Create(OldEntry, NewEntry);
+
+    // If this tail call is marked 'tail' and if there are any allocas in the
+    // entry block, move them up to the new entry block.
+    TailCallsAreMarkedTail = CI->isTailCall();
+    if (TailCallsAreMarkedTail)
+      // Move all fixed sized allocas from OldEntry to NewEntry.
+      for (BasicBlock::iterator OEBI = OldEntry->begin(), E = OldEntry->end(),
+             NEBI = NewEntry->begin(); OEBI != E; )
+        if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++))
+          if (isa<ConstantInt>(AI->getArraySize()))
+            AI->moveBefore(NEBI);
+
+    // Now that we have created a new block, which jumps to the entry
+    // block, insert a PHI node for each argument of the function.
+    // For now, we initialize each PHI to only have the real arguments
+    // which are passed in.
+    Instruction *InsertPos = OldEntry->begin();
+    for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
+         I != E; ++I) {
+      PHINode *PN = PHINode::Create(I->getType(),
+                                    I->getName() + ".tr", InsertPos);
+      I->replaceAllUsesWith(PN); // Everyone use the PHI node now!
+      PN->addIncoming(I, NewEntry);
+      ArgumentPHIs.push_back(PN);
+    }
+  }
+
+  // If this function has self recursive calls in the tail position where some
+  // are marked tail and some are not, only transform one flavor or another.  We
+  // have to choose whether we move allocas in the entry block to the new entry
+  // block or not, so we can't make a good choice for both.  NOTE: We could do
+  // slightly better here in the case that the function has no entry block
+  // allocas.
+  if (TailCallsAreMarkedTail && !CI->isTailCall())
+    return false;
+
+  // Ok, now that we know we have a pseudo-entry block WITH all of the
+  // required PHI nodes, add entries into the PHI node for the actual
+  // parameters passed into the tail-recursive call.
+  for (unsigned i = 0, e = CI->getNumOperands()-1; i != e; ++i)
+    ArgumentPHIs[i]->addIncoming(CI->getOperand(i+1), BB);
+
+  // If we are introducing an accumulator variable to eliminate the recursion,
+  // do so now.  Note that we _know_ that no subsequent tail recursion
+  // eliminations will happen on this function because of the way the
+  // accumulator recursion predicate is set up.
+  //
+  if (AccumulatorRecursionEliminationInitVal) {
+    Instruction *AccRecInstr = AccumulatorRecursionInstr;
+    // Start by inserting a new PHI node for the accumulator.
+    PHINode *AccPN = PHINode::Create(AccRecInstr->getType(), "accumulator.tr",
+                                     OldEntry->begin());
+
+    // Loop over all of the predecessors of the tail recursion block.  For the
+    // real entry into the function we seed the PHI with the initial value,
+    // computed earlier.  For any other existing branches to this block (due to
+    // other tail recursions eliminated) the accumulator is not modified.
+    // Because we haven't added the branch in the current block to OldEntry yet,
+    // it will not show up as a predecessor.
+    for (pred_iterator PI = pred_begin(OldEntry), PE = pred_end(OldEntry);
+         PI != PE; ++PI) {
+      if (*PI == &F->getEntryBlock())
+        AccPN->addIncoming(AccumulatorRecursionEliminationInitVal, *PI);
+      else
+        AccPN->addIncoming(AccPN, *PI);
+    }
+
+    // Add an incoming argument for the current block, which is computed by our
+    // associative accumulator instruction.
+    AccPN->addIncoming(AccRecInstr, BB);
+
+    // Next, rewrite the accumulator recursion instruction so that it does not
+    // use the result of the call anymore, instead, use the PHI node we just
+    // inserted.
+    AccRecInstr->setOperand(AccRecInstr->getOperand(0) != CI, AccPN);
+
+    // Finally, rewrite any return instructions in the program to return the PHI
+    // node instead of the "initval" that they do currently.  This loop will
+    // actually rewrite the return value we are destroying, but that's ok.
+    for (Function::iterator BBI = F->begin(), E = F->end(); BBI != E; ++BBI)
+      if (ReturnInst *RI = dyn_cast<ReturnInst>(BBI->getTerminator()))
+        RI->setOperand(0, AccPN);
+    ++NumAccumAdded;
+  }
+
+  // Now that all of the PHI nodes are in place, remove the call and
+  // ret instructions, replacing them with an unconditional branch.
+  BranchInst::Create(OldEntry, Ret);
+  BB->getInstList().erase(Ret);  // Remove return.
+  BB->getInstList().erase(CI);   // Remove call.
+  ++NumEliminated;
+  return true;
+}
diff --git a/lib/Transforms/Utils/AddrModeMatcher.cpp b/lib/Transforms/Utils/AddrModeMatcher.cpp
new file mode 100644
index 0000000..71049fa
--- /dev/null
+++ b/lib/Transforms/Utils/AddrModeMatcher.cpp
@@ -0,0 +1,594 @@
+//===- AddrModeMatcher.cpp - Addressing mode matching facility --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements target addressing mode matcher class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/AddrModeMatcher.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Instruction.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/PatternMatch.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+void ExtAddrMode::print(OStream &OS) const {
+  bool NeedPlus = false;
+  OS << "[";
+  if (BaseGV) {
+    OS << (NeedPlus ? " + " : "")
+       << "GV:";
+    WriteAsOperand(*OS.stream(), BaseGV, /*PrintType=*/false);
+    NeedPlus = true;
+  }
+
+  if (BaseOffs)
+    OS << (NeedPlus ? " + " : "") << BaseOffs, NeedPlus = true;
+
+  if (BaseReg) {
+    OS << (NeedPlus ? " + " : "")
+       << "Base:";
+    WriteAsOperand(*OS.stream(), BaseReg, /*PrintType=*/false);
+    NeedPlus = true;
+  }
+  if (Scale) {
+    OS << (NeedPlus ? " + " : "")
+       << Scale << "*";
+    WriteAsOperand(*OS.stream(), ScaledReg, /*PrintType=*/false);
+    NeedPlus = true;
+  }
+
+  OS << ']';
+}
+
+void ExtAddrMode::dump() const {
+  print(cerr);
+  cerr << '\n';
+}
+
+
+/// MatchScaledValue - Try adding ScaleReg*Scale to the current addressing mode.
+/// Return true and update AddrMode if this addr mode is legal for the target,
+/// false if not.
+bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale,
+                                             unsigned Depth) {
+  // If Scale is 1, then this is the same as adding ScaleReg to the addressing
+  // mode.  Just process that directly.
+  if (Scale == 1)
+    return MatchAddr(ScaleReg, Depth);
+  
+  // If the scale is 0, it takes nothing to add this.
+  if (Scale == 0)
+    return true;
+  
+  // If we already have a scale of this value, we can add to it, otherwise, we
+  // need an available scale field.
+  if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg)
+    return false;
+
+  ExtAddrMode TestAddrMode = AddrMode;
+
+  // Add scale to turn X*4+X*3 -> X*7.  This could also do things like
+  // [A+B + A*7] -> [B+A*8].
+  TestAddrMode.Scale += Scale;
+  TestAddrMode.ScaledReg = ScaleReg;
+
+  // If the new address isn't legal, bail out.
+  if (!TLI.isLegalAddressingMode(TestAddrMode, AccessTy))
+    return false;
+
+  // It was legal, so commit it.
+  AddrMode = TestAddrMode;
+  
+  // Okay, we decided that we can add ScaleReg+Scale to AddrMode.  Check now
+  // to see if ScaleReg is actually X+C.  If so, we can turn this into adding
+  // X*Scale + C*Scale to addr mode.
+  ConstantInt *CI = 0; Value *AddLHS = 0;
+  if (isa<Instruction>(ScaleReg) &&  // not a constant expr.
+      match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI)))) {
+    TestAddrMode.ScaledReg = AddLHS;
+    TestAddrMode.BaseOffs += CI->getSExtValue()*TestAddrMode.Scale;
+      
+    // If this addressing mode is legal, commit it and remember that we folded
+    // this instruction.
+    if (TLI.isLegalAddressingMode(TestAddrMode, AccessTy)) {
+      AddrModeInsts.push_back(cast<Instruction>(ScaleReg));
+      AddrMode = TestAddrMode;
+      return true;
+    }
+  }
+
+  // Otherwise, not (x+c)*scale, just return what we have.
+  return true;
+}
+
+/// MightBeFoldableInst - This is a little filter, which returns true if an
+/// addressing computation involving I might be folded into a load/store
+/// accessing it.  This doesn't need to be perfect, but needs to accept at least
+/// the set of instructions that MatchOperationAddr can.
+static bool MightBeFoldableInst(Instruction *I) {
+  switch (I->getOpcode()) {
+  case Instruction::BitCast:
+    // Don't touch identity bitcasts.
+    if (I->getType() == I->getOperand(0)->getType())
+      return false;
+    return isa<PointerType>(I->getType()) || isa<IntegerType>(I->getType());
+  case Instruction::PtrToInt:
+    // PtrToInt is always a noop, as we know that the int type is pointer sized.
+    return true;
+  case Instruction::IntToPtr:
+    // We know the input is intptr_t, so this is foldable.
+    return true;
+  case Instruction::Add:
+    return true;
+  case Instruction::Mul:
+  case Instruction::Shl:
+    // Can only handle X*C and X << C.
+    return isa<ConstantInt>(I->getOperand(1));
+  case Instruction::GetElementPtr:
+    return true;
+  default:
+    return false;
+  }
+}
+
+
+/// MatchOperationAddr - Given an instruction or constant expr, see if we can
+/// fold the operation into the addressing mode.  If so, update the addressing
+/// mode and return true, otherwise return false without modifying AddrMode.
+bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
+                                               unsigned Depth) {
+  // Avoid exponential behavior on extremely deep expression trees.
+  if (Depth >= 5) return false;
+  
+  switch (Opcode) {
+  case Instruction::PtrToInt:
+    // PtrToInt is always a noop, as we know that the int type is pointer sized.
+    return MatchAddr(AddrInst->getOperand(0), Depth);
+  case Instruction::IntToPtr:
+    // This inttoptr is a no-op if the integer type is pointer sized.
+    if (TLI.getValueType(AddrInst->getOperand(0)->getType()) ==
+        TLI.getPointerTy())
+      return MatchAddr(AddrInst->getOperand(0), Depth);
+    return false;
+  case Instruction::BitCast:
+    // BitCast is always a noop, and we can handle it as long as it is
+    // int->int or pointer->pointer (we don't want int<->fp or something).
+    if ((isa<PointerType>(AddrInst->getOperand(0)->getType()) ||
+         isa<IntegerType>(AddrInst->getOperand(0)->getType())) &&
+        // Don't touch identity bitcasts.  These were probably put here by LSR,
+        // and we don't want to mess around with them.  Assume it knows what it
+        // is doing.
+        AddrInst->getOperand(0)->getType() != AddrInst->getType())
+      return MatchAddr(AddrInst->getOperand(0), Depth);
+    return false;
+  case Instruction::Add: {
+    // Check to see if we can merge in the RHS then the LHS.  If so, we win.
+    ExtAddrMode BackupAddrMode = AddrMode;
+    unsigned OldSize = AddrModeInsts.size();
+    if (MatchAddr(AddrInst->getOperand(1), Depth+1) &&
+        MatchAddr(AddrInst->getOperand(0), Depth+1))
+      return true;
+    
+    // Restore the old addr mode info.
+    AddrMode = BackupAddrMode;
+    AddrModeInsts.resize(OldSize);
+    
+    // Otherwise this was over-aggressive.  Try merging in the LHS then the RHS.
+    if (MatchAddr(AddrInst->getOperand(0), Depth+1) &&
+        MatchAddr(AddrInst->getOperand(1), Depth+1))
+      return true;
+    
+    // Otherwise we definitely can't merge the ADD in.
+    AddrMode = BackupAddrMode;
+    AddrModeInsts.resize(OldSize);
+    break;
+  }
+  //case Instruction::Or:
+  // TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD.
+  //break;
+  case Instruction::Mul:
+  case Instruction::Shl: {
+    // Can only handle X*C and X << C.
+    ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1));
+    if (!RHS) return false;
+    int64_t Scale = RHS->getSExtValue();
+    if (Opcode == Instruction::Shl)
+      Scale = 1 << Scale;
+    
+    return MatchScaledValue(AddrInst->getOperand(0), Scale, Depth);
+  }
+  case Instruction::GetElementPtr: {
+    // Scan the GEP.  We check it if it contains constant offsets and at most
+    // one variable offset.
+    int VariableOperand = -1;
+    unsigned VariableScale = 0;
+    
+    int64_t ConstantOffset = 0;
+    const TargetData *TD = TLI.getTargetData();
+    gep_type_iterator GTI = gep_type_begin(AddrInst);
+    for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) {
+      if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+        const StructLayout *SL = TD->getStructLayout(STy);
+        unsigned Idx =
+          cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue();
+        ConstantOffset += SL->getElementOffset(Idx);
+      } else {
+        uint64_t TypeSize = TD->getTypeAllocSize(GTI.getIndexedType());
+        if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) {
+          ConstantOffset += CI->getSExtValue()*TypeSize;
+        } else if (TypeSize) {  // Scales of zero don't do anything.
+          // We only allow one variable index at the moment.
+          if (VariableOperand != -1)
+            return false;
+          
+          // Remember the variable index.
+          VariableOperand = i;
+          VariableScale = TypeSize;
+        }
+      }
+    }
+    
+    // A common case is for the GEP to only do a constant offset.  In this case,
+    // just add it to the disp field and check validity.
+    if (VariableOperand == -1) {
+      AddrMode.BaseOffs += ConstantOffset;
+      if (ConstantOffset == 0 || TLI.isLegalAddressingMode(AddrMode, AccessTy)){
+        // Check to see if we can fold the base pointer in too.
+        if (MatchAddr(AddrInst->getOperand(0), Depth+1))
+          return true;
+      }
+      AddrMode.BaseOffs -= ConstantOffset;
+      return false;
+    }
+
+    // Save the valid addressing mode in case we can't match.
+    ExtAddrMode BackupAddrMode = AddrMode;
+    unsigned OldSize = AddrModeInsts.size();
+
+    // See if the scale and offset amount is valid for this target.
+    AddrMode.BaseOffs += ConstantOffset;
+
+    // Match the base operand of the GEP.
+    if (!MatchAddr(AddrInst->getOperand(0), Depth+1)) {
+      // If it couldn't be matched, just stuff the value in a register.
+      if (AddrMode.HasBaseReg) {
+        AddrMode = BackupAddrMode;
+        AddrModeInsts.resize(OldSize);
+        return false;
+      }
+      AddrMode.HasBaseReg = true;
+      AddrMode.BaseReg = AddrInst->getOperand(0);
+    }
+
+    // Match the remaining variable portion of the GEP.
+    if (!MatchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale,
+                          Depth)) {
+      // If it couldn't be matched, try stuffing the base into a register
+      // instead of matching it, and retrying the match of the scale.
+      AddrMode = BackupAddrMode;
+      AddrModeInsts.resize(OldSize);
+      if (AddrMode.HasBaseReg)
+        return false;
+      AddrMode.HasBaseReg = true;
+      AddrMode.BaseReg = AddrInst->getOperand(0);
+      AddrMode.BaseOffs += ConstantOffset;
+      if (!MatchScaledValue(AddrInst->getOperand(VariableOperand),
+                            VariableScale, Depth)) {
+        // If even that didn't work, bail.
+        AddrMode = BackupAddrMode;
+        AddrModeInsts.resize(OldSize);
+        return false;
+      }
+    }
+
+    return true;
+  }
+  }
+  return false;
+}
+
+/// MatchAddr - If we can, try to add the value of 'Addr' into the current
+/// addressing mode.  If Addr can't be added to AddrMode this returns false and
+/// leaves AddrMode unmodified.  This assumes that Addr is either a pointer type
+/// or intptr_t for the target.
+///
+bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) {
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) {
+    // Fold in immediates if legal for the target.
+    AddrMode.BaseOffs += CI->getSExtValue();
+    if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
+      return true;
+    AddrMode.BaseOffs -= CI->getSExtValue();
+  } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) {
+    // If this is a global variable, try to fold it into the addressing mode.
+    if (AddrMode.BaseGV == 0) {
+      AddrMode.BaseGV = GV;
+      if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
+        return true;
+      AddrMode.BaseGV = 0;
+    }
+  } else if (Instruction *I = dyn_cast<Instruction>(Addr)) {
+    ExtAddrMode BackupAddrMode = AddrMode;
+    unsigned OldSize = AddrModeInsts.size();
+
+    // Check to see if it is possible to fold this operation.
+    if (MatchOperationAddr(I, I->getOpcode(), Depth)) {
+      // Okay, it's possible to fold this.  Check to see if it is actually
+      // *profitable* to do so.  We use a simple cost model to avoid increasing
+      // register pressure too much.
+      if (I->hasOneUse() ||
+          IsProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) {
+        AddrModeInsts.push_back(I);
+        return true;
+      }
+      
+      // It isn't profitable to do this, roll back.
+      //cerr << "NOT FOLDING: " << *I;
+      AddrMode = BackupAddrMode;
+      AddrModeInsts.resize(OldSize);
+    }
+  } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) {
+    if (MatchOperationAddr(CE, CE->getOpcode(), Depth))
+      return true;
+  } else if (isa<ConstantPointerNull>(Addr)) {
+    // Null pointer gets folded without affecting the addressing mode.
+    return true;
+  }
+
+  // Worse case, the target should support [reg] addressing modes. :)
+  if (!AddrMode.HasBaseReg) {
+    AddrMode.HasBaseReg = true;
+    AddrMode.BaseReg = Addr;
+    // Still check for legality in case the target supports [imm] but not [i+r].
+    if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
+      return true;
+    AddrMode.HasBaseReg = false;
+    AddrMode.BaseReg = 0;
+  }
+
+  // If the base register is already taken, see if we can do [r+r].
+  if (AddrMode.Scale == 0) {
+    AddrMode.Scale = 1;
+    AddrMode.ScaledReg = Addr;
+    if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
+      return true;
+    AddrMode.Scale = 0;
+    AddrMode.ScaledReg = 0;
+  }
+  // Couldn't match.
+  return false;
+}
+
+
+/// IsOperandAMemoryOperand - Check to see if all uses of OpVal by the specified
+/// inline asm call are due to memory operands.  If so, return true, otherwise
+/// return false.
+static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
+                                    const TargetLowering &TLI) {
+  std::vector<InlineAsm::ConstraintInfo>
+  Constraints = IA->ParseConstraints();
+  
+  unsigned ArgNo = 1;   // ArgNo - The operand of the CallInst.
+  for (unsigned i = 0, e = Constraints.size(); i != e; ++i) {
+    TargetLowering::AsmOperandInfo OpInfo(Constraints[i]);
+    
+    // Compute the value type for each operand.
+    switch (OpInfo.Type) {
+      case InlineAsm::isOutput:
+        if (OpInfo.isIndirect)
+          OpInfo.CallOperandVal = CI->getOperand(ArgNo++);
+        break;
+      case InlineAsm::isInput:
+        OpInfo.CallOperandVal = CI->getOperand(ArgNo++);
+        break;
+      case InlineAsm::isClobber:
+        // Nothing to do.
+        break;
+    }
+    
+    // Compute the constraint code and ConstraintType to use.
+    TLI.ComputeConstraintToUse(OpInfo, SDValue(),
+                             OpInfo.ConstraintType == TargetLowering::C_Memory);
+    
+    // If this asm operand is our Value*, and if it isn't an indirect memory
+    // operand, we can't fold it!
+    if (OpInfo.CallOperandVal == OpVal &&
+        (OpInfo.ConstraintType != TargetLowering::C_Memory ||
+         !OpInfo.isIndirect))
+      return false;
+  }
+  
+  return true;
+}
+
+
+/// FindAllMemoryUses - Recursively walk all the uses of I until we find a
+/// memory use.  If we find an obviously non-foldable instruction, return true.
+/// Add the ultimately found memory instructions to MemoryUses.
+static bool FindAllMemoryUses(Instruction *I,
+                SmallVectorImpl<std::pair<Instruction*,unsigned> > &MemoryUses,
+                              SmallPtrSet<Instruction*, 16> &ConsideredInsts,
+                              const TargetLowering &TLI) {
+  // If we already considered this instruction, we're done.
+  if (!ConsideredInsts.insert(I))
+    return false;
+  
+  // If this is an obviously unfoldable instruction, bail out.
+  if (!MightBeFoldableInst(I))
+    return true;
+
+  // Loop over all the uses, recursively processing them.
+  for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+       UI != E; ++UI) {
+    if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) {
+      MemoryUses.push_back(std::make_pair(LI, UI.getOperandNo()));
+      continue;
+    }
+    
+    if (StoreInst *SI = dyn_cast<StoreInst>(*UI)) {
+      if (UI.getOperandNo() == 0) return true; // Storing addr, not into addr.
+      MemoryUses.push_back(std::make_pair(SI, UI.getOperandNo()));
+      continue;
+    }
+    
+    if (CallInst *CI = dyn_cast<CallInst>(*UI)) {
+      InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue());
+      if (IA == 0) return true;
+      
+      // If this is a memory operand, we're cool, otherwise bail out.
+      if (!IsOperandAMemoryOperand(CI, IA, I, TLI))
+        return true;
+      continue;
+    }
+    
+    if (FindAllMemoryUses(cast<Instruction>(*UI), MemoryUses, ConsideredInsts,
+                          TLI))
+      return true;
+  }
+
+  return false;
+}
+
+
+/// ValueAlreadyLiveAtInst - Retrn true if Val is already known to be live at
+/// the use site that we're folding it into.  If so, there is no cost to
+/// include it in the addressing mode.  KnownLive1 and KnownLive2 are two values
+/// that we know are live at the instruction already.
+bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val,Value *KnownLive1,
+                                                   Value *KnownLive2) {
+  // If Val is either of the known-live values, we know it is live!
+  if (Val == 0 || Val == KnownLive1 || Val == KnownLive2)
+    return true;
+  
+  // All values other than instructions and arguments (e.g. constants) are live.
+  if (!isa<Instruction>(Val) && !isa<Argument>(Val)) return true;
+  
+  // If Val is a constant sized alloca in the entry block, it is live, this is
+  // true because it is just a reference to the stack/frame pointer, which is
+  // live for the whole function.
+  if (AllocaInst *AI = dyn_cast<AllocaInst>(Val))
+    if (AI->isStaticAlloca())
+      return true;
+  
+  // Check to see if this value is already used in the memory instruction's
+  // block.  If so, it's already live into the block at the very least, so we
+  // can reasonably fold it.
+  BasicBlock *MemBB = MemoryInst->getParent();
+  for (Value::use_iterator UI = Val->use_begin(), E = Val->use_end();
+       UI != E; ++UI)
+    // We know that uses of arguments and instructions have to be instructions.
+    if (cast<Instruction>(*UI)->getParent() == MemBB)
+      return true;
+  
+  return false;
+}
+
+
+
+/// IsProfitableToFoldIntoAddressingMode - It is possible for the addressing
+/// mode of the machine to fold the specified instruction into a load or store
+/// that ultimately uses it.  However, the specified instruction has multiple
+/// uses.  Given this, it may actually increase register pressure to fold it
+/// into the load.  For example, consider this code:
+///
+///     X = ...
+///     Y = X+1
+///     use(Y)   -> nonload/store
+///     Z = Y+1
+///     load Z
+///
+/// In this case, Y has multiple uses, and can be folded into the load of Z
+/// (yielding load [X+2]).  However, doing this will cause both "X" and "X+1" to
+/// be live at the use(Y) line.  If we don't fold Y into load Z, we use one
+/// fewer register.  Since Y can't be folded into "use(Y)" we don't increase the
+/// number of computations either.
+///
+/// Note that this (like most of CodeGenPrepare) is just a rough heuristic.  If
+/// X was live across 'load Z' for other reasons, we actually *would* want to
+/// fold the addressing mode in the Z case.  This would make Y die earlier.
+bool AddressingModeMatcher::
+IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
+                                     ExtAddrMode &AMAfter) {
+  if (IgnoreProfitability) return true;
+  
+  // AMBefore is the addressing mode before this instruction was folded into it,
+  // and AMAfter is the addressing mode after the instruction was folded.  Get
+  // the set of registers referenced by AMAfter and subtract out those
+  // referenced by AMBefore: this is the set of values which folding in this
+  // address extends the lifetime of.
+  //
+  // Note that there are only two potential values being referenced here,
+  // BaseReg and ScaleReg (global addresses are always available, as are any
+  // folded immediates).
+  Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg;
+  
+  // If the BaseReg or ScaledReg was referenced by the previous addrmode, their
+  // lifetime wasn't extended by adding this instruction.
+  if (ValueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg))
+    BaseReg = 0;
+  if (ValueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg))
+    ScaledReg = 0;
+
+  // If folding this instruction (and it's subexprs) didn't extend any live
+  // ranges, we're ok with it.
+  if (BaseReg == 0 && ScaledReg == 0)
+    return true;
+
+  // If all uses of this instruction are ultimately load/store/inlineasm's,
+  // check to see if their addressing modes will include this instruction.  If
+  // so, we can fold it into all uses, so it doesn't matter if it has multiple
+  // uses.
+  SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
+  SmallPtrSet<Instruction*, 16> ConsideredInsts;
+  if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI))
+    return false;  // Has a non-memory, non-foldable use!
+  
+  // Now that we know that all uses of this instruction are part of a chain of
+  // computation involving only operations that could theoretically be folded
+  // into a memory use, loop over each of these uses and see if they could
+  // *actually* fold the instruction.
+  SmallVector<Instruction*, 32> MatchedAddrModeInsts;
+  for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) {
+    Instruction *User = MemoryUses[i].first;
+    unsigned OpNo = MemoryUses[i].second;
+    
+    // Get the access type of this use.  If the use isn't a pointer, we don't
+    // know what it accesses.
+    Value *Address = User->getOperand(OpNo);
+    if (!isa<PointerType>(Address->getType()))
+      return false;
+    const Type *AddressAccessTy =
+      cast<PointerType>(Address->getType())->getElementType();
+    
+    // Do a match against the root of this address, ignoring profitability. This
+    // will tell us if the addressing mode for the memory operation will
+    // *actually* cover the shared instruction.
+    ExtAddrMode Result;
+    AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, AddressAccessTy,
+                                  MemoryInst, Result);
+    Matcher.IgnoreProfitability = true;
+    bool Success = Matcher.MatchAddr(Address, 0);
+    Success = Success; assert(Success && "Couldn't select *anything*?");
+
+    // If the match didn't cover I, then it won't be shared by it.
+    if (std::find(MatchedAddrModeInsts.begin(), MatchedAddrModeInsts.end(),
+                  I) == MatchedAddrModeInsts.end())
+      return false;
+    
+    MatchedAddrModeInsts.clear();
+  }
+  
+  return true;
+}
diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp
new file mode 100644
index 0000000..6d1180d
--- /dev/null
+++ b/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -0,0 +1,622 @@
+//===-- BasicBlockUtils.cpp - BasicBlock Utilities -------------------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This family of functions perform manipulations on basic blocks, and
+// instructions contained within basic blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Constant.h"
+#include "llvm/Type.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/ValueHandle.h"
+#include <algorithm>
+using namespace llvm;
+
+/// DeleteDeadBlock - Delete the specified block, which must have no
+/// predecessors.
+void llvm::DeleteDeadBlock(BasicBlock *BB) {
+  assert((pred_begin(BB) == pred_end(BB) ||
+         // Can delete self loop.
+         BB->getSinglePredecessor() == BB) && "Block is not dead!");
+  TerminatorInst *BBTerm = BB->getTerminator();
+  
+  // Loop through all of our successors and make sure they know that one
+  // of their predecessors is going away.
+  for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i)
+    BBTerm->getSuccessor(i)->removePredecessor(BB);
+  
+  // Zap all the instructions in the block.
+  while (!BB->empty()) {
+    Instruction &I = BB->back();
+    // If this instruction is used, replace uses with an arbitrary value.
+    // Because control flow can't get here, we don't care what we replace the
+    // value with.  Note that since this block is unreachable, and all values
+    // contained within it must dominate their uses, that all uses will
+    // eventually be removed (they are themselves dead).
+    if (!I.use_empty())
+      I.replaceAllUsesWith(UndefValue::get(I.getType()));
+    BB->getInstList().pop_back();
+  }
+  
+  // Zap the block!
+  BB->eraseFromParent();
+}
+
+/// FoldSingleEntryPHINodes - We know that BB has one predecessor.  If there are
+/// any single-entry PHI nodes in it, fold them away.  This handles the case
+/// when all entries to the PHI nodes in a block are guaranteed equal, such as
+/// when the block has exactly one predecessor.
+void llvm::FoldSingleEntryPHINodes(BasicBlock *BB) {
+  if (!isa<PHINode>(BB->begin()))
+    return;
+  
+  while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
+    if (PN->getIncomingValue(0) != PN)
+      PN->replaceAllUsesWith(PN->getIncomingValue(0));
+    else
+      PN->replaceAllUsesWith(UndefValue::get(PN->getType()));
+    PN->eraseFromParent();
+  }
+}
+
+
+/// DeleteDeadPHIs - Examine each PHI in the given block and delete it if it
+/// is dead. Also recursively delete any operands that become dead as
+/// a result. This includes tracing the def-use list from the PHI to see if
+/// it is ultimately unused or if it reaches an unused cycle.
+void llvm::DeleteDeadPHIs(BasicBlock *BB) {
+  // Recursively deleting a PHI may cause multiple PHIs to be deleted
+  // or RAUW'd undef, so use an array of WeakVH for the PHIs to delete.
+  SmallVector<WeakVH, 8> PHIs;
+  for (BasicBlock::iterator I = BB->begin();
+       PHINode *PN = dyn_cast<PHINode>(I); ++I)
+    PHIs.push_back(PN);
+
+  for (unsigned i = 0, e = PHIs.size(); i != e; ++i)
+    if (PHINode *PN = dyn_cast_or_null<PHINode>(PHIs[i].operator Value*()))
+      RecursivelyDeleteDeadPHINode(PN);
+}
+
+/// MergeBlockIntoPredecessor - Attempts to merge a block into its predecessor,
+/// if possible.  The return value indicates success or failure.
+bool llvm::MergeBlockIntoPredecessor(BasicBlock* BB, Pass* P) {
+  pred_iterator PI(pred_begin(BB)), PE(pred_end(BB));
+  // Can't merge the entry block.
+  if (pred_begin(BB) == pred_end(BB)) return false;
+  
+  BasicBlock *PredBB = *PI++;
+  for (; PI != PE; ++PI)  // Search all predecessors, see if they are all same
+    if (*PI != PredBB) {
+      PredBB = 0;       // There are multiple different predecessors...
+      break;
+    }
+  
+  // Can't merge if there are multiple predecessors.
+  if (!PredBB) return false;
+  // Don't break self-loops.
+  if (PredBB == BB) return false;
+  // Don't break invokes.
+  if (isa<InvokeInst>(PredBB->getTerminator())) return false;
+  
+  succ_iterator SI(succ_begin(PredBB)), SE(succ_end(PredBB));
+  BasicBlock* OnlySucc = BB;
+  for (; SI != SE; ++SI)
+    if (*SI != OnlySucc) {
+      OnlySucc = 0;     // There are multiple distinct successors!
+      break;
+    }
+  
+  // Can't merge if there are multiple successors.
+  if (!OnlySucc) return false;
+
+  // Can't merge if there is PHI loop.
+  for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE; ++BI) {
+    if (PHINode *PN = dyn_cast<PHINode>(BI)) {
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+        if (PN->getIncomingValue(i) == PN)
+          return false;
+    } else
+      break;
+  }
+
+  // Begin by getting rid of unneeded PHIs.
+  while (PHINode *PN = dyn_cast<PHINode>(&BB->front())) {
+    PN->replaceAllUsesWith(PN->getIncomingValue(0));
+    BB->getInstList().pop_front();  // Delete the phi node...
+  }
+  
+  // Delete the unconditional branch from the predecessor...
+  PredBB->getInstList().pop_back();
+  
+  // Move all definitions in the successor to the predecessor...
+  PredBB->getInstList().splice(PredBB->end(), BB->getInstList());
+  
+  // Make all PHI nodes that referred to BB now refer to Pred as their
+  // source...
+  BB->replaceAllUsesWith(PredBB);
+  
+  // Inherit predecessors name if it exists.
+  if (!PredBB->hasName())
+    PredBB->takeName(BB);
+  
+  // Finally, erase the old block and update dominator info.
+  if (P) {
+    if (DominatorTree* DT = P->getAnalysisIfAvailable<DominatorTree>()) {
+      DomTreeNode* DTN = DT->getNode(BB);
+      DomTreeNode* PredDTN = DT->getNode(PredBB);
+  
+      if (DTN) {
+        SmallPtrSet<DomTreeNode*, 8> Children(DTN->begin(), DTN->end());
+        for (SmallPtrSet<DomTreeNode*, 8>::iterator DI = Children.begin(),
+             DE = Children.end(); DI != DE; ++DI)
+          DT->changeImmediateDominator(*DI, PredDTN);
+
+        DT->eraseNode(BB);
+      }
+    }
+  }
+  
+  BB->eraseFromParent();
+  
+  
+  return true;
+}
+
+/// ReplaceInstWithValue - Replace all uses of an instruction (specified by BI)
+/// with a value, then remove and delete the original instruction.
+///
+void llvm::ReplaceInstWithValue(BasicBlock::InstListType &BIL,
+                                BasicBlock::iterator &BI, Value *V) {
+  Instruction &I = *BI;
+  // Replaces all of the uses of the instruction with uses of the value
+  I.replaceAllUsesWith(V);
+
+  // Make sure to propagate a name if there is one already.
+  if (I.hasName() && !V->hasName())
+    V->takeName(&I);
+
+  // Delete the unnecessary instruction now...
+  BI = BIL.erase(BI);
+}
+
+
+/// ReplaceInstWithInst - Replace the instruction specified by BI with the
+/// instruction specified by I.  The original instruction is deleted and BI is
+/// updated to point to the new instruction.
+///
+void llvm::ReplaceInstWithInst(BasicBlock::InstListType &BIL,
+                               BasicBlock::iterator &BI, Instruction *I) {
+  assert(I->getParent() == 0 &&
+         "ReplaceInstWithInst: Instruction already inserted into basic block!");
+
+  // Insert the new instruction into the basic block...
+  BasicBlock::iterator New = BIL.insert(BI, I);
+
+  // Replace all uses of the old instruction, and delete it.
+  ReplaceInstWithValue(BIL, BI, I);
+
+  // Move BI back to point to the newly inserted instruction
+  BI = New;
+}
+
+/// ReplaceInstWithInst - Replace the instruction specified by From with the
+/// instruction specified by To.
+///
+void llvm::ReplaceInstWithInst(Instruction *From, Instruction *To) {
+  BasicBlock::iterator BI(From);
+  ReplaceInstWithInst(From->getParent()->getInstList(), BI, To);
+}
+
+/// RemoveSuccessor - Change the specified terminator instruction such that its
+/// successor SuccNum no longer exists.  Because this reduces the outgoing
+/// degree of the current basic block, the actual terminator instruction itself
+/// may have to be changed.  In the case where the last successor of the block 
+/// is deleted, a return instruction is inserted in its place which can cause a
+/// surprising change in program behavior if it is not expected.
+///
+void llvm::RemoveSuccessor(TerminatorInst *TI, unsigned SuccNum) {
+  assert(SuccNum < TI->getNumSuccessors() &&
+         "Trying to remove a nonexistant successor!");
+
+  // If our old successor block contains any PHI nodes, remove the entry in the
+  // PHI nodes that comes from this branch...
+  //
+  BasicBlock *BB = TI->getParent();
+  TI->getSuccessor(SuccNum)->removePredecessor(BB);
+
+  TerminatorInst *NewTI = 0;
+  switch (TI->getOpcode()) {
+  case Instruction::Br:
+    // If this is a conditional branch... convert to unconditional branch.
+    if (TI->getNumSuccessors() == 2) {
+      cast<BranchInst>(TI)->setUnconditionalDest(TI->getSuccessor(1-SuccNum));
+    } else {                    // Otherwise convert to a return instruction...
+      Value *RetVal = 0;
+
+      // Create a value to return... if the function doesn't return null...
+      if (BB->getParent()->getReturnType() != Type::VoidTy)
+        RetVal = Constant::getNullValue(BB->getParent()->getReturnType());
+
+      // Create the return...
+      NewTI = ReturnInst::Create(RetVal);
+    }
+    break;
+
+  case Instruction::Invoke:    // Should convert to call
+  case Instruction::Switch:    // Should remove entry
+  default:
+  case Instruction::Ret:       // Cannot happen, has no successors!
+    assert(0 && "Unhandled terminator instruction type in RemoveSuccessor!");
+    abort();
+  }
+
+  if (NewTI)   // If it's a different instruction, replace.
+    ReplaceInstWithInst(TI, NewTI);
+}
+
+/// SplitEdge -  Split the edge connecting specified block. Pass P must 
+/// not be NULL. 
+BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, Pass *P) {
+  TerminatorInst *LatchTerm = BB->getTerminator();
+  unsigned SuccNum = 0;
+#ifndef NDEBUG
+  unsigned e = LatchTerm->getNumSuccessors();
+#endif
+  for (unsigned i = 0; ; ++i) {
+    assert(i != e && "Didn't find edge?");
+    if (LatchTerm->getSuccessor(i) == Succ) {
+      SuccNum = i;
+      break;
+    }
+  }
+  
+  // If this is a critical edge, let SplitCriticalEdge do it.
+  if (SplitCriticalEdge(BB->getTerminator(), SuccNum, P))
+    return LatchTerm->getSuccessor(SuccNum);
+
+  // If the edge isn't critical, then BB has a single successor or Succ has a
+  // single pred.  Split the block.
+  BasicBlock::iterator SplitPoint;
+  if (BasicBlock *SP = Succ->getSinglePredecessor()) {
+    // If the successor only has a single pred, split the top of the successor
+    // block.
+    assert(SP == BB && "CFG broken");
+    SP = NULL;
+    return SplitBlock(Succ, Succ->begin(), P);
+  } else {
+    // Otherwise, if BB has a single successor, split it at the bottom of the
+    // block.
+    assert(BB->getTerminator()->getNumSuccessors() == 1 &&
+           "Should have a single succ!"); 
+    return SplitBlock(BB, BB->getTerminator(), P);
+  }
+}
+
+/// SplitBlock - Split the specified block at the specified instruction - every
+/// thing before SplitPt stays in Old and everything starting with SplitPt moves
+/// to a new block.  The two blocks are joined by an unconditional branch and
+/// the loop info is updated.
+///
+BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt, Pass *P) {
+  BasicBlock::iterator SplitIt = SplitPt;
+  while (isa<PHINode>(SplitIt))
+    ++SplitIt;
+  BasicBlock *New = Old->splitBasicBlock(SplitIt, Old->getName()+".split");
+
+  // The new block lives in whichever loop the old one did.
+  if (LoopInfo* LI = P->getAnalysisIfAvailable<LoopInfo>())
+    if (Loop *L = LI->getLoopFor(Old))
+      L->addBasicBlockToLoop(New, LI->getBase());
+
+  if (DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>())
+    {
+      // Old dominates New. New node domiantes all other nodes dominated by Old.
+      DomTreeNode *OldNode = DT->getNode(Old);
+      std::vector<DomTreeNode *> Children;
+      for (DomTreeNode::iterator I = OldNode->begin(), E = OldNode->end();
+           I != E; ++I) 
+        Children.push_back(*I);
+
+      DomTreeNode *NewNode =   DT->addNewBlock(New,Old);
+
+      for (std::vector<DomTreeNode *>::iterator I = Children.begin(),
+             E = Children.end(); I != E; ++I) 
+        DT->changeImmediateDominator(*I, NewNode);
+    }
+
+  if (DominanceFrontier *DF = P->getAnalysisIfAvailable<DominanceFrontier>())
+    DF->splitBlock(Old);
+    
+  return New;
+}
+
+
+/// SplitBlockPredecessors - This method transforms BB by introducing a new
+/// basic block into the function, and moving some of the predecessors of BB to
+/// be predecessors of the new block.  The new predecessors are indicated by the
+/// Preds array, which has NumPreds elements in it.  The new block is given a
+/// suffix of 'Suffix'.
+///
+/// This currently updates the LLVM IR, AliasAnalysis, DominatorTree and
+/// DominanceFrontier, but no other analyses.
+BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB, 
+                                         BasicBlock *const *Preds,
+                                         unsigned NumPreds, const char *Suffix,
+                                         Pass *P) {
+  // Create new basic block, insert right before the original block.
+  BasicBlock *NewBB =
+    BasicBlock::Create(BB->getName()+Suffix, BB->getParent(), BB);
+  
+  // The new block unconditionally branches to the old block.
+  BranchInst *BI = BranchInst::Create(BB, NewBB);
+  
+  // Move the edges from Preds to point to NewBB instead of BB.
+  for (unsigned i = 0; i != NumPreds; ++i)
+    Preds[i]->getTerminator()->replaceUsesOfWith(BB, NewBB);
+  
+  // Update dominator tree and dominator frontier if available.
+  DominatorTree *DT = P ? P->getAnalysisIfAvailable<DominatorTree>() : 0;
+  if (DT)
+    DT->splitBlock(NewBB);
+  if (DominanceFrontier *DF = P ? P->getAnalysisIfAvailable<DominanceFrontier>():0)
+    DF->splitBlock(NewBB);
+  AliasAnalysis *AA = P ? P->getAnalysisIfAvailable<AliasAnalysis>() : 0;
+  
+  
+  // Insert a new PHI node into NewBB for every PHI node in BB and that new PHI
+  // node becomes an incoming value for BB's phi node.  However, if the Preds
+  // list is empty, we need to insert dummy entries into the PHI nodes in BB to
+  // account for the newly created predecessor.
+  if (NumPreds == 0) {
+    // Insert dummy values as the incoming value.
+    for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ++I)
+      cast<PHINode>(I)->addIncoming(UndefValue::get(I->getType()), NewBB);
+    return NewBB;
+  }
+  
+  // Otherwise, create a new PHI node in NewBB for each PHI node in BB.
+  for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ) {
+    PHINode *PN = cast<PHINode>(I++);
+    
+    // Check to see if all of the values coming in are the same.  If so, we
+    // don't need to create a new PHI node.
+    Value *InVal = PN->getIncomingValueForBlock(Preds[0]);
+    for (unsigned i = 1; i != NumPreds; ++i)
+      if (InVal != PN->getIncomingValueForBlock(Preds[i])) {
+        InVal = 0;
+        break;
+      }
+    
+    if (InVal) {
+      // If all incoming values for the new PHI would be the same, just don't
+      // make a new PHI.  Instead, just remove the incoming values from the old
+      // PHI.
+      for (unsigned i = 0; i != NumPreds; ++i)
+        PN->removeIncomingValue(Preds[i], false);
+    } else {
+      // If the values coming into the block are not the same, we need a PHI.
+      // Create the new PHI node, insert it into NewBB at the end of the block
+      PHINode *NewPHI =
+        PHINode::Create(PN->getType(), PN->getName()+".ph", BI);
+      if (AA) AA->copyValue(PN, NewPHI);
+      
+      // Move all of the PHI values for 'Preds' to the new PHI.
+      for (unsigned i = 0; i != NumPreds; ++i) {
+        Value *V = PN->removeIncomingValue(Preds[i], false);
+        NewPHI->addIncoming(V, Preds[i]);
+      }
+      InVal = NewPHI;
+    }
+    
+    // Add an incoming value to the PHI node in the loop for the preheader
+    // edge.
+    PN->addIncoming(InVal, NewBB);
+    
+    // Check to see if we can eliminate this phi node.
+    if (Value *V = PN->hasConstantValue(DT != 0)) {
+      Instruction *I = dyn_cast<Instruction>(V);
+      if (!I || DT == 0 || DT->dominates(I, PN)) {
+        PN->replaceAllUsesWith(V);
+        if (AA) AA->deleteValue(PN);
+        PN->eraseFromParent();
+      }
+    }
+  }
+  
+  return NewBB;
+}
+
+/// FindFunctionBackedges - Analyze the specified function to find all of the
+/// loop backedges in the function and return them.  This is a relatively cheap
+/// (compared to computing dominators and loop info) analysis.
+///
+/// The output is added to Result, as pairs of <from,to> edge info.
+void llvm::FindFunctionBackedges(const Function &F,
+     SmallVectorImpl<std::pair<const BasicBlock*,const BasicBlock*> > &Result) {
+  const BasicBlock *BB = &F.getEntryBlock();
+  if (succ_begin(BB) == succ_end(BB))
+    return;
+  
+  SmallPtrSet<const BasicBlock*, 8> Visited;
+  SmallVector<std::pair<const BasicBlock*, succ_const_iterator>, 8> VisitStack;
+  SmallPtrSet<const BasicBlock*, 8> InStack;
+  
+  Visited.insert(BB);
+  VisitStack.push_back(std::make_pair(BB, succ_begin(BB)));
+  InStack.insert(BB);
+  do {
+    std::pair<const BasicBlock*, succ_const_iterator> &Top = VisitStack.back();
+    const BasicBlock *ParentBB = Top.first;
+    succ_const_iterator &I = Top.second;
+    
+    bool FoundNew = false;
+    while (I != succ_end(ParentBB)) {
+      BB = *I++;
+      if (Visited.insert(BB)) {
+        FoundNew = true;
+        break;
+      }
+      // Successor is in VisitStack, it's a back edge.
+      if (InStack.count(BB))
+        Result.push_back(std::make_pair(ParentBB, BB));
+    }
+    
+    if (FoundNew) {
+      // Go down one level if there is a unvisited successor.
+      InStack.insert(BB);
+      VisitStack.push_back(std::make_pair(BB, succ_begin(BB)));
+    } else {
+      // Go up one level.
+      InStack.erase(VisitStack.pop_back_val().first);
+    }
+  } while (!VisitStack.empty());
+  
+  
+}
+
+
+
+/// AreEquivalentAddressValues - Test if A and B will obviously have the same
+/// value. This includes recognizing that %t0 and %t1 will have the same
+/// value in code like this:
+///   %t0 = getelementptr \@a, 0, 3
+///   store i32 0, i32* %t0
+///   %t1 = getelementptr \@a, 0, 3
+///   %t2 = load i32* %t1
+///
+static bool AreEquivalentAddressValues(const Value *A, const Value *B) {
+  // Test if the values are trivially equivalent.
+  if (A == B) return true;
+  
+  // Test if the values come form identical arithmetic instructions.
+  if (isa<BinaryOperator>(A) || isa<CastInst>(A) ||
+      isa<PHINode>(A) || isa<GetElementPtrInst>(A))
+    if (const Instruction *BI = dyn_cast<Instruction>(B))
+      if (cast<Instruction>(A)->isIdenticalTo(BI))
+        return true;
+  
+  // Otherwise they may not be equivalent.
+  return false;
+}
+
+/// FindAvailableLoadedValue - Scan the ScanBB block backwards (starting at the
+/// instruction before ScanFrom) checking to see if we have the value at the
+/// memory address *Ptr locally available within a small number of instructions.
+/// If the value is available, return it.
+///
+/// If not, return the iterator for the last validated instruction that the 
+/// value would be live through.  If we scanned the entire block and didn't find
+/// something that invalidates *Ptr or provides it, ScanFrom would be left at
+/// begin() and this returns null.  ScanFrom could also be left 
+///
+/// MaxInstsToScan specifies the maximum instructions to scan in the block.  If
+/// it is set to 0, it will scan the whole block. You can also optionally
+/// specify an alias analysis implementation, which makes this more precise.
+Value *llvm::FindAvailableLoadedValue(Value *Ptr, BasicBlock *ScanBB,
+                                      BasicBlock::iterator &ScanFrom,
+                                      unsigned MaxInstsToScan,
+                                      AliasAnalysis *AA) {
+  if (MaxInstsToScan == 0) MaxInstsToScan = ~0U;
+
+  // If we're using alias analysis to disambiguate get the size of *Ptr.
+  unsigned AccessSize = 0;
+  if (AA) {
+    const Type *AccessTy = cast<PointerType>(Ptr->getType())->getElementType();
+    AccessSize = AA->getTargetData().getTypeStoreSizeInBits(AccessTy);
+  }
+  
+  while (ScanFrom != ScanBB->begin()) {
+    // We must ignore debug info directives when counting (otherwise they
+    // would affect codegen).
+    Instruction *Inst = --ScanFrom;
+    if (isa<DbgInfoIntrinsic>(Inst))
+      continue;
+    // We skip pointer-to-pointer bitcasts, which are NOPs.
+    // It is necessary for correctness to skip those that feed into a
+    // llvm.dbg.declare, as these are not present when debugging is off.
+    if (isa<BitCastInst>(Inst) && isa<PointerType>(Inst->getType()))
+      continue;
+
+    // Restore ScanFrom to expected value in case next test succeeds
+    ScanFrom++;
+   
+    // Don't scan huge blocks.
+    if (MaxInstsToScan-- == 0) return 0;
+    
+    --ScanFrom;
+    // If this is a load of Ptr, the loaded value is available.
+    if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+      if (AreEquivalentAddressValues(LI->getOperand(0), Ptr))
+        return LI;
+    
+    if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+      // If this is a store through Ptr, the value is available!
+      if (AreEquivalentAddressValues(SI->getOperand(1), Ptr))
+        return SI->getOperand(0);
+      
+      // If Ptr is an alloca and this is a store to a different alloca, ignore
+      // the store.  This is a trivial form of alias analysis that is important
+      // for reg2mem'd code.
+      if ((isa<AllocaInst>(Ptr) || isa<GlobalVariable>(Ptr)) &&
+          (isa<AllocaInst>(SI->getOperand(1)) ||
+           isa<GlobalVariable>(SI->getOperand(1))))
+        continue;
+      
+      // If we have alias analysis and it says the store won't modify the loaded
+      // value, ignore the store.
+      if (AA &&
+          (AA->getModRefInfo(SI, Ptr, AccessSize) & AliasAnalysis::Mod) == 0)
+        continue;
+      
+      // Otherwise the store that may or may not alias the pointer, bail out.
+      ++ScanFrom;
+      return 0;
+    }
+    
+    // If this is some other instruction that may clobber Ptr, bail out.
+    if (Inst->mayWriteToMemory()) {
+      // If alias analysis claims that it really won't modify the load,
+      // ignore it.
+      if (AA &&
+          (AA->getModRefInfo(Inst, Ptr, AccessSize) & AliasAnalysis::Mod) == 0)
+        continue;
+      
+      // May modify the pointer, bail out.
+      ++ScanFrom;
+      return 0;
+    }
+  }
+  
+  // Got to the start of the block, we didn't find it, but are done for this
+  // block.
+  return 0;
+}
+
+/// CopyPrecedingStopPoint - If I is immediately preceded by a StopPoint,
+/// make a copy of the stoppoint before InsertPos (presumably before copying
+/// or moving I).
+void llvm::CopyPrecedingStopPoint(Instruction *I, 
+                                  BasicBlock::iterator InsertPos) {
+  if (I != I->getParent()->begin()) {
+    BasicBlock::iterator BBI = I;  --BBI;
+    if (DbgStopPointInst *DSPI = dyn_cast<DbgStopPointInst>(BBI)) {
+      CallInst *newDSPI = DSPI->clone();
+      newDSPI->insertBefore(InsertPos);
+    }
+  }
+}
diff --git a/lib/Transforms/Utils/BasicInliner.cpp b/lib/Transforms/Utils/BasicInliner.cpp
new file mode 100644
index 0000000..1650cfa
--- /dev/null
+++ b/lib/Transforms/Utils/BasicInliner.cpp
@@ -0,0 +1,181 @@
+//===- BasicInliner.cpp - Basic function level inliner --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a simple function based inliner that does not use
+// call graph information. 
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "basicinliner"
+
+#include "llvm/Module.h"
+#include "llvm/Function.h"
+#include "llvm/Transforms/Utils/BasicInliner.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include <vector>
+
+using namespace llvm;
+
+static cl::opt<unsigned>     
+BasicInlineThreshold("basic-inline-threshold", cl::Hidden, cl::init(200),
+   cl::desc("Control the amount of basic inlining to perform (default = 200)"));
+
+namespace llvm {
+
+  /// BasicInlinerImpl - BasicInliner implemantation class. This hides
+  /// container info, used by basic inliner, from public interface.
+  struct VISIBILITY_HIDDEN BasicInlinerImpl {
+    
+    BasicInlinerImpl(const BasicInlinerImpl&); // DO NOT IMPLEMENT
+    void operator=(const BasicInlinerImpl&); // DO NO IMPLEMENT
+  public:
+    BasicInlinerImpl(TargetData *T) : TD(T) {}
+
+    /// addFunction - Add function into the list of functions to process.
+    /// All functions must be inserted using this interface before invoking
+    /// inlineFunctions().
+    void addFunction(Function *F) {
+      Functions.push_back(F);
+    }
+
+    /// neverInlineFunction - Sometimes a function is never to be inlined 
+    /// because of one or other reason. 
+    void neverInlineFunction(Function *F) {
+      NeverInline.insert(F);
+    }
+
+    /// inlineFuctions - Walk all call sites in all functions supplied by
+    /// client. Inline as many call sites as possible. Delete completely
+    /// inlined functions.
+    void inlineFunctions();
+    
+  private:
+    TargetData *TD;
+    std::vector<Function *> Functions;
+    SmallPtrSet<const Function *, 16> NeverInline;
+    SmallPtrSet<Function *, 8> DeadFunctions;
+    InlineCostAnalyzer CA;
+  };
+
+/// inlineFuctions - Walk all call sites in all functions supplied by
+/// client. Inline as many call sites as possible. Delete completely
+/// inlined functions.
+void BasicInlinerImpl::inlineFunctions() {
+      
+  // Scan through and identify all call sites ahead of time so that we only
+  // inline call sites in the original functions, not call sites that result
+  // from inlining other functions.
+  std::vector<CallSite> CallSites;
+  
+  for (std::vector<Function *>::iterator FI = Functions.begin(),
+         FE = Functions.end(); FI != FE; ++FI) {
+    Function *F = *FI;
+    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+      for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) {
+        CallSite CS = CallSite::get(I);
+        if (CS.getInstruction() && CS.getCalledFunction()
+            && !CS.getCalledFunction()->isDeclaration())
+          CallSites.push_back(CS);
+      }
+  }
+  
+  DOUT << ": " << CallSites.size() << " call sites.\n";
+  
+  // Inline call sites.
+  bool Changed = false;
+  do {
+    Changed = false;
+    for (unsigned index = 0; index != CallSites.size() && !CallSites.empty(); 
+         ++index) {
+      CallSite CS = CallSites[index];
+      if (Function *Callee = CS.getCalledFunction()) {
+        
+        // Eliminate calls that are never inlinable.
+        if (Callee->isDeclaration() ||
+            CS.getInstruction()->getParent()->getParent() == Callee) {
+          CallSites.erase(CallSites.begin() + index);
+          --index;
+          continue;
+        }
+        InlineCost IC = CA.getInlineCost(CS, NeverInline);
+        if (IC.isAlways()) {        
+          DOUT << "  Inlining: cost=always"
+               <<", call: " << *CS.getInstruction();
+        } else if (IC.isNever()) {
+          DOUT << "  NOT Inlining: cost=never"
+               <<", call: " << *CS.getInstruction();
+          continue;
+        } else {
+          int Cost = IC.getValue();
+          
+          if (Cost >= (int) BasicInlineThreshold) {
+            DOUT << "  NOT Inlining: cost = " << Cost
+                 << ", call: " <<  *CS.getInstruction();
+            continue;
+          } else {
+            DOUT << "  Inlining: cost = " << Cost
+                 << ", call: " <<  *CS.getInstruction();
+          }
+        }
+        
+        // Inline
+        if (InlineFunction(CS, NULL, TD)) {
+          if (Callee->use_empty() && (Callee->hasLocalLinkage() ||
+                                      Callee->hasAvailableExternallyLinkage()))
+            DeadFunctions.insert(Callee);
+          Changed = true;
+          CallSites.erase(CallSites.begin() + index);
+          --index;
+        }
+      }
+    }
+  } while (Changed);
+  
+  // Remove completely inlined functions from module.
+  for(SmallPtrSet<Function *, 8>::iterator I = DeadFunctions.begin(),
+        E = DeadFunctions.end(); I != E; ++I) {
+    Function *D = *I;
+    Module *M = D->getParent();
+    M->getFunctionList().remove(D);
+  }
+}
+
+BasicInliner::BasicInliner(TargetData *TD) {
+  Impl = new BasicInlinerImpl(TD);
+}
+
+BasicInliner::~BasicInliner() {
+  delete Impl;
+}
+
+/// addFunction - Add function into the list of functions to process.
+/// All functions must be inserted using this interface before invoking
+/// inlineFunctions().
+void BasicInliner::addFunction(Function *F) {
+  Impl->addFunction(F);
+}
+
+/// neverInlineFunction - Sometimes a function is never to be inlined because
+/// of one or other reason. 
+void BasicInliner::neverInlineFunction(Function *F) {
+  Impl->neverInlineFunction(F);
+}
+
+/// inlineFuctions - Walk all call sites in all functions supplied by
+/// client. Inline as many call sites as possible. Delete completely
+/// inlined functions.
+void BasicInliner::inlineFunctions() {
+  Impl->inlineFunctions();
+}
+
+}
diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp
new file mode 100644
index 0000000..c4fd1ea
--- /dev/null
+++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -0,0 +1,282 @@
+//===- BreakCriticalEdges.cpp - Critical Edge Elimination Pass ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// BreakCriticalEdges pass - Break all of the critical edges in the CFG by
+// inserting a dummy basic block.  This pass may be "required" by passes that
+// cannot deal with critical edges.  For this usage, the structure type is
+// forward declared.  This pass obviously invalidates the CFG, but can update
+// forward dominator (set, immediate dominators, tree, and frontier)
+// information.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "break-crit-edges"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Type.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumBroken, "Number of blocks inserted");
+
+namespace {
+  struct VISIBILITY_HIDDEN BreakCriticalEdges : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    BreakCriticalEdges() : FunctionPass(&ID) {}
+
+    virtual bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addPreserved<DominatorTree>();
+      AU.addPreserved<DominanceFrontier>();
+      AU.addPreserved<LoopInfo>();
+
+      // No loop canonicalization guarantees are broken by this pass.
+      AU.addPreservedID(LoopSimplifyID);
+    }
+  };
+}
+
+char BreakCriticalEdges::ID = 0;
+static RegisterPass<BreakCriticalEdges>
+X("break-crit-edges", "Break critical edges in CFG");
+
+// Publically exposed interface to pass...
+const PassInfo *const llvm::BreakCriticalEdgesID = &X;
+FunctionPass *llvm::createBreakCriticalEdgesPass() {
+  return new BreakCriticalEdges();
+}
+
+// runOnFunction - Loop over all of the edges in the CFG, breaking critical
+// edges as they are found.
+//
+bool BreakCriticalEdges::runOnFunction(Function &F) {
+  bool Changed = false;
+  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+    TerminatorInst *TI = I->getTerminator();
+    if (TI->getNumSuccessors() > 1)
+      for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+        if (SplitCriticalEdge(TI, i, this)) {
+          ++NumBroken;
+          Changed = true;
+        }
+  }
+
+  return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+//    Implementation of the external critical edge manipulation functions
+//===----------------------------------------------------------------------===//
+
+// isCriticalEdge - Return true if the specified edge is a critical edge.
+// Critical edges are edges from a block with multiple successors to a block
+// with multiple predecessors.
+//
+bool llvm::isCriticalEdge(const TerminatorInst *TI, unsigned SuccNum,
+                          bool AllowIdenticalEdges) {
+  assert(SuccNum < TI->getNumSuccessors() && "Illegal edge specification!");
+  if (TI->getNumSuccessors() == 1) return false;
+
+  const BasicBlock *Dest = TI->getSuccessor(SuccNum);
+  pred_const_iterator I = pred_begin(Dest), E = pred_end(Dest);
+
+  // If there is more than one predecessor, this is a critical edge...
+  assert(I != E && "No preds, but we have an edge to the block?");
+  const BasicBlock *FirstPred = *I;
+  ++I;        // Skip one edge due to the incoming arc from TI.
+  if (!AllowIdenticalEdges)
+    return I != E;
+  
+  // If AllowIdenticalEdges is true, then we allow this edge to be considered
+  // non-critical iff all preds come from TI's block.
+  while (I != E) {
+    if (*I != FirstPred)
+      return true;
+    // Note: leave this as is until no one ever compiles with either gcc 4.0.1
+    // or Xcode 2. This seems to work around the pred_iterator assert in PR 2207
+    E = pred_end(*I);
+    ++I;
+  }
+  return false;
+}
+
+/// SplitCriticalEdge - If this edge is a critical edge, insert a new node to
+/// split the critical edge.  This will update DominatorTree and
+/// DominatorFrontier  information if it is available, thus calling this pass
+/// will not invalidate  any of them.  This returns true if the edge was split,
+/// false otherwise.  This ensures that all edges to that dest go to one block
+/// instead of each going to a different block.
+//
+bool llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, Pass *P,
+                             bool MergeIdenticalEdges) {
+  if (!isCriticalEdge(TI, SuccNum, MergeIdenticalEdges)) return false;
+  BasicBlock *TIBB = TI->getParent();
+  BasicBlock *DestBB = TI->getSuccessor(SuccNum);
+
+  // Create a new basic block, linking it into the CFG.
+  BasicBlock *NewBB = BasicBlock::Create(TIBB->getName() + "." +
+                                         DestBB->getName() + "_crit_edge");
+  // Create our unconditional branch...
+  BranchInst::Create(DestBB, NewBB);
+
+  // Branch to the new block, breaking the edge.
+  TI->setSuccessor(SuccNum, NewBB);
+
+  // Insert the block into the function... right after the block TI lives in.
+  Function &F = *TIBB->getParent();
+  Function::iterator FBBI = TIBB;
+  F.getBasicBlockList().insert(++FBBI, NewBB);
+  
+  // If there are any PHI nodes in DestBB, we need to update them so that they
+  // merge incoming values from NewBB instead of from TIBB.
+  //
+  for (BasicBlock::iterator I = DestBB->begin(); isa<PHINode>(I); ++I) {
+    PHINode *PN = cast<PHINode>(I);
+    // We no longer enter through TIBB, now we come in through NewBB.  Revector
+    // exactly one entry in the PHI node that used to come from TIBB to come
+    // from NewBB.
+    int BBIdx = PN->getBasicBlockIndex(TIBB);
+    PN->setIncomingBlock(BBIdx, NewBB);
+  }
+  
+  // If there are any other edges from TIBB to DestBB, update those to go
+  // through the split block, making those edges non-critical as well (and
+  // reducing the number of phi entries in the DestBB if relevant).
+  if (MergeIdenticalEdges) {
+    for (unsigned i = SuccNum+1, e = TI->getNumSuccessors(); i != e; ++i) {
+      if (TI->getSuccessor(i) != DestBB) continue;
+      
+      // Remove an entry for TIBB from DestBB phi nodes.
+      DestBB->removePredecessor(TIBB);
+      
+      // We found another edge to DestBB, go to NewBB instead.
+      TI->setSuccessor(i, NewBB);
+    }
+  }
+  
+  
+
+  // If we don't have a pass object, we can't update anything...
+  if (P == 0) return true;
+
+  // Now update analysis information.  Since the only predecessor of NewBB is
+  // the TIBB, TIBB clearly dominates NewBB.  TIBB usually doesn't dominate
+  // anything, as there are other successors of DestBB.  However, if all other
+  // predecessors of DestBB are already dominated by DestBB (e.g. DestBB is a
+  // loop header) then NewBB dominates DestBB.
+  SmallVector<BasicBlock*, 8> OtherPreds;
+
+  for (pred_iterator I = pred_begin(DestBB), E = pred_end(DestBB); I != E; ++I)
+    if (*I != NewBB)
+      OtherPreds.push_back(*I);
+  
+  bool NewBBDominatesDestBB = true;
+  
+  // Should we update DominatorTree information?
+  if (DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>()) {
+    DomTreeNode *TINode = DT->getNode(TIBB);
+
+    // The new block is not the immediate dominator for any other nodes, but
+    // TINode is the immediate dominator for the new node.
+    //
+    if (TINode) {       // Don't break unreachable code!
+      DomTreeNode *NewBBNode = DT->addNewBlock(NewBB, TIBB);
+      DomTreeNode *DestBBNode = 0;
+     
+      // If NewBBDominatesDestBB hasn't been computed yet, do so with DT.
+      if (!OtherPreds.empty()) {
+        DestBBNode = DT->getNode(DestBB);
+        while (!OtherPreds.empty() && NewBBDominatesDestBB) {
+          if (DomTreeNode *OPNode = DT->getNode(OtherPreds.back()))
+            NewBBDominatesDestBB = DT->dominates(DestBBNode, OPNode);
+          OtherPreds.pop_back();
+        }
+        OtherPreds.clear();
+      }
+      
+      // If NewBBDominatesDestBB, then NewBB dominates DestBB, otherwise it
+      // doesn't dominate anything.
+      if (NewBBDominatesDestBB) {
+        if (!DestBBNode) DestBBNode = DT->getNode(DestBB);
+        DT->changeImmediateDominator(DestBBNode, NewBBNode);
+      }
+    }
+  }
+
+  // Should we update DominanceFrontier information?
+  if (DominanceFrontier *DF = P->getAnalysisIfAvailable<DominanceFrontier>()) {
+    // If NewBBDominatesDestBB hasn't been computed yet, do so with DF.
+    if (!OtherPreds.empty()) {
+      // FIXME: IMPLEMENT THIS!
+      assert(0 && "Requiring domfrontiers but not idom/domtree/domset."
+             " not implemented yet!");
+    }
+    
+    // Since the new block is dominated by its only predecessor TIBB,
+    // it cannot be in any block's dominance frontier.  If NewBB dominates
+    // DestBB, its dominance frontier is the same as DestBB's, otherwise it is
+    // just {DestBB}.
+    DominanceFrontier::DomSetType NewDFSet;
+    if (NewBBDominatesDestBB) {
+      DominanceFrontier::iterator I = DF->find(DestBB);
+      if (I != DF->end()) {
+        DF->addBasicBlock(NewBB, I->second);
+        
+        if (I->second.count(DestBB)) {
+          // However NewBB's frontier does not include DestBB.
+          DominanceFrontier::iterator NF = DF->find(NewBB);
+          DF->removeFromFrontier(NF, DestBB);
+        }
+      }
+      else
+        DF->addBasicBlock(NewBB, DominanceFrontier::DomSetType());
+    } else {
+      DominanceFrontier::DomSetType NewDFSet;
+      NewDFSet.insert(DestBB);
+      DF->addBasicBlock(NewBB, NewDFSet);
+    }
+  }
+  
+  // Update LoopInfo if it is around.
+  if (LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>()) {
+    // If one or the other blocks were not in a loop, the new block is not
+    // either, and thus LI doesn't need to be updated.
+    if (Loop *TIL = LI->getLoopFor(TIBB))
+      if (Loop *DestLoop = LI->getLoopFor(DestBB)) {
+        if (TIL == DestLoop) {
+          // Both in the same loop, the NewBB joins loop.
+          DestLoop->addBasicBlockToLoop(NewBB, LI->getBase());
+        } else if (TIL->contains(DestLoop->getHeader())) {
+          // Edge from an outer loop to an inner loop.  Add to the outer loop.
+          TIL->addBasicBlockToLoop(NewBB, LI->getBase());
+        } else if (DestLoop->contains(TIL->getHeader())) {
+          // Edge from an inner loop to an outer loop.  Add to the outer loop.
+          DestLoop->addBasicBlockToLoop(NewBB, LI->getBase());
+        } else {
+          // Edge from two loops with no containment relation.  Because these
+          // are natural loops, we know that the destination block must be the
+          // header of its loop (adding a branch into a loop elsewhere would
+          // create an irreducible loop).
+          assert(DestLoop->getHeader() == DestBB &&
+                 "Should not create irreducible loops!");
+          if (Loop *P = DestLoop->getParentLoop())
+            P->addBasicBlockToLoop(NewBB, LI->getBase());
+        }
+      }
+  }
+  return true;
+}
diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt
new file mode 100644
index 0000000..6628b4b
--- /dev/null
+++ b/lib/Transforms/Utils/CMakeLists.txt
@@ -0,0 +1,27 @@
+add_llvm_library(LLVMTransformUtils
+  AddrModeMatcher.cpp
+  BasicBlockUtils.cpp
+  BasicInliner.cpp
+  BreakCriticalEdges.cpp
+  CloneFunction.cpp
+  CloneLoop.cpp
+  CloneModule.cpp
+  CloneTrace.cpp
+  CodeExtractor.cpp
+  DemoteRegToStack.cpp
+  InlineCost.cpp
+  InlineFunction.cpp
+  LCSSA.cpp
+  Local.cpp
+  LoopSimplify.cpp
+  LowerAllocations.cpp
+  LowerInvoke.cpp
+  LowerSwitch.cpp
+  Mem2Reg.cpp
+  PromoteMemoryToRegister.cpp
+  SimplifyCFG.cpp
+  UnifyFunctionExitNodes.cpp
+  UnrollLoop.cpp
+  ValueMapper.cpp
+  InstructionNamer.cpp
+  )
diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp
new file mode 100644
index 0000000..d0fdefa
--- /dev/null
+++ b/lib/Transforms/Utils/CloneFunction.cpp
@@ -0,0 +1,533 @@
+//===- CloneFunction.cpp - Clone a function into another function ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CloneFunctionInto interface, which is used as the
+// low-level function cloner.  This is used by the CloneFunction and function
+// inliner to do the dirty work of copying the body of a function around.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Function.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include <map>
+using namespace llvm;
+
+// CloneBasicBlock - See comments in Cloning.h
+BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB,
+                                  DenseMap<const Value*, Value*> &ValueMap,
+                                  const char *NameSuffix, Function *F,
+                                  ClonedCodeInfo *CodeInfo) {
+  BasicBlock *NewBB = BasicBlock::Create("", F);
+  if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix);
+
+  bool hasCalls = false, hasDynamicAllocas = false, hasStaticAllocas = false;
+  
+  // Loop over all instructions, and copy them over.
+  for (BasicBlock::const_iterator II = BB->begin(), IE = BB->end();
+       II != IE; ++II) {
+    Instruction *NewInst = II->clone();
+    if (II->hasName())
+      NewInst->setName(II->getName()+NameSuffix);
+    NewBB->getInstList().push_back(NewInst);
+    ValueMap[II] = NewInst;                // Add instruction map to value.
+    
+    hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II));
+    if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) {
+      if (isa<ConstantInt>(AI->getArraySize()))
+        hasStaticAllocas = true;
+      else
+        hasDynamicAllocas = true;
+    }
+  }
+  
+  if (CodeInfo) {
+    CodeInfo->ContainsCalls          |= hasCalls;
+    CodeInfo->ContainsUnwinds        |= isa<UnwindInst>(BB->getTerminator());
+    CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas;
+    CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas && 
+                                        BB != &BB->getParent()->getEntryBlock();
+  }
+  return NewBB;
+}
+
+// Clone OldFunc into NewFunc, transforming the old arguments into references to
+// ArgMap values.
+//
+void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
+                             DenseMap<const Value*, Value*> &ValueMap,
+                             std::vector<ReturnInst*> &Returns,
+                             const char *NameSuffix, ClonedCodeInfo *CodeInfo) {
+  assert(NameSuffix && "NameSuffix cannot be null!");
+
+#ifndef NDEBUG
+  for (Function::const_arg_iterator I = OldFunc->arg_begin(), 
+       E = OldFunc->arg_end(); I != E; ++I)
+    assert(ValueMap.count(I) && "No mapping from source argument specified!");
+#endif
+
+  // Clone any attributes.
+  if (NewFunc->arg_size() == OldFunc->arg_size())
+    NewFunc->copyAttributesFrom(OldFunc);
+  else {
+    //Some arguments were deleted with the ValueMap. Copy arguments one by one
+    for (Function::const_arg_iterator I = OldFunc->arg_begin(), 
+           E = OldFunc->arg_end(); I != E; ++I)
+      if (Argument* Anew = dyn_cast<Argument>(ValueMap[I]))
+        Anew->addAttr( OldFunc->getAttributes()
+                       .getParamAttributes(I->getArgNo() + 1));
+    NewFunc->setAttributes(NewFunc->getAttributes()
+                           .addAttr(0, OldFunc->getAttributes()
+                                     .getRetAttributes()));
+    NewFunc->setAttributes(NewFunc->getAttributes()
+                           .addAttr(~0, OldFunc->getAttributes()
+                                     .getFnAttributes()));
+
+  }
+
+  // Loop over all of the basic blocks in the function, cloning them as
+  // appropriate.  Note that we save BE this way in order to handle cloning of
+  // recursive functions into themselves.
+  //
+  for (Function::const_iterator BI = OldFunc->begin(), BE = OldFunc->end();
+       BI != BE; ++BI) {
+    const BasicBlock &BB = *BI;
+
+    // Create a new basic block and copy instructions into it!
+    BasicBlock *CBB = CloneBasicBlock(&BB, ValueMap, NameSuffix, NewFunc,
+                                      CodeInfo);
+    ValueMap[&BB] = CBB;                       // Add basic block mapping.
+
+    if (ReturnInst *RI = dyn_cast<ReturnInst>(CBB->getTerminator()))
+      Returns.push_back(RI);
+  }
+
+  // Loop over all of the instructions in the function, fixing up operand
+  // references as we go.  This uses ValueMap to do all the hard work.
+  //
+  for (Function::iterator BB = cast<BasicBlock>(ValueMap[OldFunc->begin()]),
+         BE = NewFunc->end(); BB != BE; ++BB)
+    // Loop over all instructions, fixing each one as we find it...
+    for (BasicBlock::iterator II = BB->begin(); II != BB->end(); ++II)
+      RemapInstruction(II, ValueMap);
+}
+
+/// CloneFunction - Return a copy of the specified function, but without
+/// embedding the function into another module.  Also, any references specified
+/// in the ValueMap are changed to refer to their mapped value instead of the
+/// original one.  If any of the arguments to the function are in the ValueMap,
+/// the arguments are deleted from the resultant function.  The ValueMap is
+/// updated to include mappings from all of the instructions and basicblocks in
+/// the function from their old to new values.
+///
+Function *llvm::CloneFunction(const Function *F,
+                              DenseMap<const Value*, Value*> &ValueMap,
+                              ClonedCodeInfo *CodeInfo) {
+  std::vector<const Type*> ArgTypes;
+
+  // The user might be deleting arguments to the function by specifying them in
+  // the ValueMap.  If so, we need to not add the arguments to the arg ty vector
+  //
+  for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+       I != E; ++I)
+    if (ValueMap.count(I) == 0)  // Haven't mapped the argument to anything yet?
+      ArgTypes.push_back(I->getType());
+
+  // Create a new function type...
+  FunctionType *FTy = FunctionType::get(F->getFunctionType()->getReturnType(),
+                                    ArgTypes, F->getFunctionType()->isVarArg());
+
+  // Create the new function...
+  Function *NewF = Function::Create(FTy, F->getLinkage(), F->getName());
+
+  // Loop over the arguments, copying the names of the mapped arguments over...
+  Function::arg_iterator DestI = NewF->arg_begin();
+  for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+       I != E; ++I)
+    if (ValueMap.count(I) == 0) {   // Is this argument preserved?
+      DestI->setName(I->getName()); // Copy the name over...
+      ValueMap[I] = DestI++;        // Add mapping to ValueMap
+    }
+
+  std::vector<ReturnInst*> Returns;  // Ignore returns cloned...
+  CloneFunctionInto(NewF, F, ValueMap, Returns, "", CodeInfo);
+  return NewF;
+}
+
+
+
+namespace {
+  /// PruningFunctionCloner - This class is a private class used to implement
+  /// the CloneAndPruneFunctionInto method.
+  struct VISIBILITY_HIDDEN PruningFunctionCloner {
+    Function *NewFunc;
+    const Function *OldFunc;
+    DenseMap<const Value*, Value*> &ValueMap;
+    std::vector<ReturnInst*> &Returns;
+    const char *NameSuffix;
+    ClonedCodeInfo *CodeInfo;
+    const TargetData *TD;
+    Value *DbgFnStart;
+  public:
+    PruningFunctionCloner(Function *newFunc, const Function *oldFunc,
+                          DenseMap<const Value*, Value*> &valueMap,
+                          std::vector<ReturnInst*> &returns,
+                          const char *nameSuffix, 
+                          ClonedCodeInfo *codeInfo,
+                          const TargetData *td)
+    : NewFunc(newFunc), OldFunc(oldFunc), ValueMap(valueMap), Returns(returns),
+      NameSuffix(nameSuffix), CodeInfo(codeInfo), TD(td), DbgFnStart(NULL) {
+    }
+
+    /// CloneBlock - The specified block is found to be reachable, clone it and
+    /// anything that it can reach.
+    void CloneBlock(const BasicBlock *BB,
+                    std::vector<const BasicBlock*> &ToClone);
+    
+  public:
+    /// ConstantFoldMappedInstruction - Constant fold the specified instruction,
+    /// mapping its operands through ValueMap if they are available.
+    Constant *ConstantFoldMappedInstruction(const Instruction *I);
+  };
+}
+
+/// CloneBlock - The specified block is found to be reachable, clone it and
+/// anything that it can reach.
+void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
+                                       std::vector<const BasicBlock*> &ToClone){
+  Value *&BBEntry = ValueMap[BB];
+
+  // Have we already cloned this block?
+  if (BBEntry) return;
+  
+  // Nope, clone it now.
+  BasicBlock *NewBB;
+  BBEntry = NewBB = BasicBlock::Create();
+  if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix);
+
+  bool hasCalls = false, hasDynamicAllocas = false, hasStaticAllocas = false;
+  
+  // Loop over all instructions, and copy them over, DCE'ing as we go.  This
+  // loop doesn't include the terminator.
+  for (BasicBlock::const_iterator II = BB->begin(), IE = --BB->end();
+       II != IE; ++II) {
+    // If this instruction constant folds, don't bother cloning the instruction,
+    // instead, just add the constant to the value map.
+    if (Constant *C = ConstantFoldMappedInstruction(II)) {
+      ValueMap[II] = C;
+      continue;
+    }
+
+    // Do not clone llvm.dbg.region.end. It will be adjusted by the inliner.
+    if (const DbgFuncStartInst *DFSI = dyn_cast<DbgFuncStartInst>(II)) {
+      if (DbgFnStart == NULL) {
+        DISubprogram SP(cast<GlobalVariable>(DFSI->getSubprogram()));
+        if (SP.describes(BB->getParent()))
+          DbgFnStart = DFSI->getSubprogram();
+      }
+    } 
+    if (const DbgRegionEndInst *DREIS = dyn_cast<DbgRegionEndInst>(II)) {
+      if (DREIS->getContext() == DbgFnStart)
+        continue;
+    }
+      
+    Instruction *NewInst = II->clone();
+    if (II->hasName())
+      NewInst->setName(II->getName()+NameSuffix);
+    NewBB->getInstList().push_back(NewInst);
+    ValueMap[II] = NewInst;                // Add instruction map to value.
+    
+    hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II));
+    if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) {
+      if (isa<ConstantInt>(AI->getArraySize()))
+        hasStaticAllocas = true;
+      else
+        hasDynamicAllocas = true;
+    }
+  }
+  
+  // Finally, clone over the terminator.
+  const TerminatorInst *OldTI = BB->getTerminator();
+  bool TerminatorDone = false;
+  if (const BranchInst *BI = dyn_cast<BranchInst>(OldTI)) {
+    if (BI->isConditional()) {
+      // If the condition was a known constant in the callee...
+      ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
+      // Or is a known constant in the caller...
+      if (Cond == 0)  
+        Cond = dyn_cast_or_null<ConstantInt>(ValueMap[BI->getCondition()]);
+
+      // Constant fold to uncond branch!
+      if (Cond) {
+        BasicBlock *Dest = BI->getSuccessor(!Cond->getZExtValue());
+        ValueMap[OldTI] = BranchInst::Create(Dest, NewBB);
+        ToClone.push_back(Dest);
+        TerminatorDone = true;
+      }
+    }
+  } else if (const SwitchInst *SI = dyn_cast<SwitchInst>(OldTI)) {
+    // If switching on a value known constant in the caller.
+    ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition());
+    if (Cond == 0)  // Or known constant after constant prop in the callee...
+      Cond = dyn_cast_or_null<ConstantInt>(ValueMap[SI->getCondition()]);
+    if (Cond) {     // Constant fold to uncond branch!
+      BasicBlock *Dest = SI->getSuccessor(SI->findCaseValue(Cond));
+      ValueMap[OldTI] = BranchInst::Create(Dest, NewBB);
+      ToClone.push_back(Dest);
+      TerminatorDone = true;
+    }
+  }
+  
+  if (!TerminatorDone) {
+    Instruction *NewInst = OldTI->clone();
+    if (OldTI->hasName())
+      NewInst->setName(OldTI->getName()+NameSuffix);
+    NewBB->getInstList().push_back(NewInst);
+    ValueMap[OldTI] = NewInst;             // Add instruction map to value.
+    
+    // Recursively clone any reachable successor blocks.
+    const TerminatorInst *TI = BB->getTerminator();
+    for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+      ToClone.push_back(TI->getSuccessor(i));
+  }
+  
+  if (CodeInfo) {
+    CodeInfo->ContainsCalls          |= hasCalls;
+    CodeInfo->ContainsUnwinds        |= isa<UnwindInst>(OldTI);
+    CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas;
+    CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas && 
+      BB != &BB->getParent()->front();
+  }
+  
+  if (ReturnInst *RI = dyn_cast<ReturnInst>(NewBB->getTerminator()))
+    Returns.push_back(RI);
+}
+
+/// ConstantFoldMappedInstruction - Constant fold the specified instruction,
+/// mapping its operands through ValueMap if they are available.
+Constant *PruningFunctionCloner::
+ConstantFoldMappedInstruction(const Instruction *I) {
+  SmallVector<Constant*, 8> Ops;
+  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+    if (Constant *Op = dyn_cast_or_null<Constant>(MapValue(I->getOperand(i),
+                                                           ValueMap)))
+      Ops.push_back(Op);
+    else
+      return 0;  // All operands not constant!
+
+  if (const CmpInst *CI = dyn_cast<CmpInst>(I))
+    return ConstantFoldCompareInstOperands(CI->getPredicate(),
+                                           &Ops[0], Ops.size(), TD);
+
+  if (const LoadInst *LI = dyn_cast<LoadInst>(I))
+    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ops[0]))
+      if (!LI->isVolatile() && CE->getOpcode() == Instruction::GetElementPtr)
+        if (GlobalVariable *GV = dyn_cast<GlobalVariable>(CE->getOperand(0)))
+          if (GV->isConstant() && GV->hasDefinitiveInitializer())
+            return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(),
+                                                          CE);
+
+  return ConstantFoldInstOperands(I->getOpcode(), I->getType(), &Ops[0],
+                                  Ops.size(), TD);
+}
+
+/// CloneAndPruneFunctionInto - This works exactly like CloneFunctionInto,
+/// except that it does some simple constant prop and DCE on the fly.  The
+/// effect of this is to copy significantly less code in cases where (for
+/// example) a function call with constant arguments is inlined, and those
+/// constant arguments cause a significant amount of code in the callee to be
+/// dead.  Since this doesn't produce an exact copy of the input, it can't be
+/// used for things like CloneFunction or CloneModule.
+void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
+                                     DenseMap<const Value*, Value*> &ValueMap,
+                                     std::vector<ReturnInst*> &Returns,
+                                     const char *NameSuffix, 
+                                     ClonedCodeInfo *CodeInfo,
+                                     const TargetData *TD) {
+  assert(NameSuffix && "NameSuffix cannot be null!");
+  
+#ifndef NDEBUG
+  for (Function::const_arg_iterator II = OldFunc->arg_begin(), 
+       E = OldFunc->arg_end(); II != E; ++II)
+    assert(ValueMap.count(II) && "No mapping from source argument specified!");
+#endif
+
+  PruningFunctionCloner PFC(NewFunc, OldFunc, ValueMap, Returns,
+                            NameSuffix, CodeInfo, TD);
+
+  // Clone the entry block, and anything recursively reachable from it.
+  std::vector<const BasicBlock*> CloneWorklist;
+  CloneWorklist.push_back(&OldFunc->getEntryBlock());
+  while (!CloneWorklist.empty()) {
+    const BasicBlock *BB = CloneWorklist.back();
+    CloneWorklist.pop_back();
+    PFC.CloneBlock(BB, CloneWorklist);
+  }
+  
+  // Loop over all of the basic blocks in the old function.  If the block was
+  // reachable, we have cloned it and the old block is now in the value map:
+  // insert it into the new function in the right order.  If not, ignore it.
+  //
+  // Defer PHI resolution until rest of function is resolved.
+  std::vector<const PHINode*> PHIToResolve;
+  for (Function::const_iterator BI = OldFunc->begin(), BE = OldFunc->end();
+       BI != BE; ++BI) {
+    BasicBlock *NewBB = cast_or_null<BasicBlock>(ValueMap[BI]);
+    if (NewBB == 0) continue;  // Dead block.
+
+    // Add the new block to the new function.
+    NewFunc->getBasicBlockList().push_back(NewBB);
+    
+    // Loop over all of the instructions in the block, fixing up operand
+    // references as we go.  This uses ValueMap to do all the hard work.
+    //
+    BasicBlock::iterator I = NewBB->begin();
+    
+    // Handle PHI nodes specially, as we have to remove references to dead
+    // blocks.
+    if (PHINode *PN = dyn_cast<PHINode>(I)) {
+      // Skip over all PHI nodes, remembering them for later.
+      BasicBlock::const_iterator OldI = BI->begin();
+      for (; (PN = dyn_cast<PHINode>(I)); ++I, ++OldI)
+        PHIToResolve.push_back(cast<PHINode>(OldI));
+    }
+    
+    // Otherwise, remap the rest of the instructions normally.
+    for (; I != NewBB->end(); ++I)
+      RemapInstruction(I, ValueMap);
+  }
+  
+  // Defer PHI resolution until rest of function is resolved, PHI resolution
+  // requires the CFG to be up-to-date.
+  for (unsigned phino = 0, e = PHIToResolve.size(); phino != e; ) {
+    const PHINode *OPN = PHIToResolve[phino];
+    unsigned NumPreds = OPN->getNumIncomingValues();
+    const BasicBlock *OldBB = OPN->getParent();
+    BasicBlock *NewBB = cast<BasicBlock>(ValueMap[OldBB]);
+
+    // Map operands for blocks that are live and remove operands for blocks
+    // that are dead.
+    for (; phino != PHIToResolve.size() &&
+         PHIToResolve[phino]->getParent() == OldBB; ++phino) {
+      OPN = PHIToResolve[phino];
+      PHINode *PN = cast<PHINode>(ValueMap[OPN]);
+      for (unsigned pred = 0, e = NumPreds; pred != e; ++pred) {
+        if (BasicBlock *MappedBlock = 
+            cast_or_null<BasicBlock>(ValueMap[PN->getIncomingBlock(pred)])) {
+          Value *InVal = MapValue(PN->getIncomingValue(pred), ValueMap);
+          assert(InVal && "Unknown input value?");
+          PN->setIncomingValue(pred, InVal);
+          PN->setIncomingBlock(pred, MappedBlock);
+        } else {
+          PN->removeIncomingValue(pred, false);
+          --pred, --e;  // Revisit the next entry.
+        }
+      } 
+    }
+    
+    // The loop above has removed PHI entries for those blocks that are dead
+    // and has updated others.  However, if a block is live (i.e. copied over)
+    // but its terminator has been changed to not go to this block, then our
+    // phi nodes will have invalid entries.  Update the PHI nodes in this
+    // case.
+    PHINode *PN = cast<PHINode>(NewBB->begin());
+    NumPreds = std::distance(pred_begin(NewBB), pred_end(NewBB));
+    if (NumPreds != PN->getNumIncomingValues()) {
+      assert(NumPreds < PN->getNumIncomingValues());
+      // Count how many times each predecessor comes to this block.
+      std::map<BasicBlock*, unsigned> PredCount;
+      for (pred_iterator PI = pred_begin(NewBB), E = pred_end(NewBB);
+           PI != E; ++PI)
+        --PredCount[*PI];
+      
+      // Figure out how many entries to remove from each PHI.
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+        ++PredCount[PN->getIncomingBlock(i)];
+      
+      // At this point, the excess predecessor entries are positive in the
+      // map.  Loop over all of the PHIs and remove excess predecessor
+      // entries.
+      BasicBlock::iterator I = NewBB->begin();
+      for (; (PN = dyn_cast<PHINode>(I)); ++I) {
+        for (std::map<BasicBlock*, unsigned>::iterator PCI =PredCount.begin(),
+             E = PredCount.end(); PCI != E; ++PCI) {
+          BasicBlock *Pred     = PCI->first;
+          for (unsigned NumToRemove = PCI->second; NumToRemove; --NumToRemove)
+            PN->removeIncomingValue(Pred, false);
+        }
+      }
+    }
+    
+    // If the loops above have made these phi nodes have 0 or 1 operand,
+    // replace them with undef or the input value.  We must do this for
+    // correctness, because 0-operand phis are not valid.
+    PN = cast<PHINode>(NewBB->begin());
+    if (PN->getNumIncomingValues() == 0) {
+      BasicBlock::iterator I = NewBB->begin();
+      BasicBlock::const_iterator OldI = OldBB->begin();
+      while ((PN = dyn_cast<PHINode>(I++))) {
+        Value *NV = UndefValue::get(PN->getType());
+        PN->replaceAllUsesWith(NV);
+        assert(ValueMap[OldI] == PN && "ValueMap mismatch");
+        ValueMap[OldI] = NV;
+        PN->eraseFromParent();
+        ++OldI;
+      }
+    }
+    // NOTE: We cannot eliminate single entry phi nodes here, because of
+    // ValueMap.  Single entry phi nodes can have multiple ValueMap entries
+    // pointing at them.  Thus, deleting one would require scanning the ValueMap
+    // to update any entries in it that would require that.  This would be
+    // really slow.
+  }
+  
+  // Now that the inlined function body has been fully constructed, go through
+  // and zap unconditional fall-through branches.  This happen all the time when
+  // specializing code: code specialization turns conditional branches into
+  // uncond branches, and this code folds them.
+  Function::iterator I = cast<BasicBlock>(ValueMap[&OldFunc->getEntryBlock()]);
+  while (I != NewFunc->end()) {
+    BranchInst *BI = dyn_cast<BranchInst>(I->getTerminator());
+    if (!BI || BI->isConditional()) { ++I; continue; }
+    
+    // Note that we can't eliminate uncond branches if the destination has
+    // single-entry PHI nodes.  Eliminating the single-entry phi nodes would
+    // require scanning the ValueMap to update any entries that point to the phi
+    // node.
+    BasicBlock *Dest = BI->getSuccessor(0);
+    if (!Dest->getSinglePredecessor() || isa<PHINode>(Dest->begin())) {
+      ++I; continue;
+    }
+    
+    // We know all single-entry PHI nodes in the inlined function have been
+    // removed, so we just need to splice the blocks.
+    BI->eraseFromParent();
+    
+    // Move all the instructions in the succ to the pred.
+    I->getInstList().splice(I->end(), Dest->getInstList());
+    
+    // Make all PHI nodes that referred to Dest now refer to I as their source.
+    Dest->replaceAllUsesWith(I);
+
+    // Remove the dest block.
+    Dest->eraseFromParent();
+    
+    // Do not increment I, iteratively merge all things this block branches to.
+  }
+}
diff --git a/lib/Transforms/Utils/CloneLoop.cpp b/lib/Transforms/Utils/CloneLoop.cpp
new file mode 100644
index 0000000..7e000a1
--- /dev/null
+++ b/lib/Transforms/Utils/CloneLoop.cpp
@@ -0,0 +1,152 @@
+//===- CloneLoop.cpp - Clone loop nest ------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CloneLoop interface which makes a copy of a loop.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/ADT/DenseMap.h"
+
+
+using namespace llvm;
+
+/// CloneDominatorInfo - Clone basicblock's dominator tree and, if available,
+/// dominance info. It is expected that basic block is already cloned.
+static void CloneDominatorInfo(BasicBlock *BB, 
+                               DenseMap<const Value *, Value *> &ValueMap,
+                               DominatorTree *DT,
+                               DominanceFrontier *DF) {
+
+  assert (DT && "DominatorTree is not available");
+  DenseMap<const Value *, Value*>::iterator BI = ValueMap.find(BB);
+  assert (BI != ValueMap.end() && "BasicBlock clone is missing");
+  BasicBlock *NewBB = cast<BasicBlock>(BI->second);
+
+  // NewBB already got dominator info.
+  if (DT->getNode(NewBB))
+    return;
+
+  assert (DT->getNode(BB) && "BasicBlock does not have dominator info");
+  // Entry block is not expected here. Infinite loops are not to cloned.
+  assert (DT->getNode(BB)->getIDom() && "BasicBlock does not have immediate dominator");
+  BasicBlock *BBDom = DT->getNode(BB)->getIDom()->getBlock();
+
+  // NewBB's dominator is either BB's dominator or BB's dominator's clone.
+  BasicBlock *NewBBDom = BBDom;
+  DenseMap<const Value *, Value*>::iterator BBDomI = ValueMap.find(BBDom);
+  if (BBDomI != ValueMap.end()) {
+    NewBBDom = cast<BasicBlock>(BBDomI->second);
+    if (!DT->getNode(NewBBDom))
+      CloneDominatorInfo(BBDom, ValueMap, DT, DF);
+  }
+  DT->addNewBlock(NewBB, NewBBDom);
+
+  // Copy cloned dominance frontiner set
+  if (DF) {
+    DominanceFrontier::DomSetType NewDFSet;
+    DominanceFrontier::iterator DFI = DF->find(BB);
+    if ( DFI != DF->end()) {
+      DominanceFrontier::DomSetType S = DFI->second;
+        for (DominanceFrontier::DomSetType::iterator I = S.begin(), E = S.end();
+             I != E; ++I) {
+          BasicBlock *DB = *I;
+          DenseMap<const Value*, Value*>::iterator IDM = ValueMap.find(DB);
+          if (IDM != ValueMap.end())
+            NewDFSet.insert(cast<BasicBlock>(IDM->second));
+          else
+            NewDFSet.insert(DB);
+        }
+    }
+    DF->addBasicBlock(NewBB, NewDFSet);
+  }
+}
+
+/// CloneLoop - Clone Loop. Clone dominator info. Populate ValueMap
+/// using old blocks to new blocks mapping.
+Loop *llvm::CloneLoop(Loop *OrigL, LPPassManager  *LPM, LoopInfo *LI,
+                      DenseMap<const Value *, Value *> &ValueMap, Pass *P) {
+  
+  DominatorTree *DT = NULL;
+  DominanceFrontier *DF = NULL;
+  if (P) {
+    DT = P->getAnalysisIfAvailable<DominatorTree>();
+    DF = P->getAnalysisIfAvailable<DominanceFrontier>();
+  }
+
+  SmallVector<BasicBlock *, 16> NewBlocks;
+
+  // Populate loop nest.
+  SmallVector<Loop *, 8> LoopNest;
+  LoopNest.push_back(OrigL);
+
+
+  Loop *NewParentLoop = NULL;
+  while (!LoopNest.empty()) {
+    Loop *L = LoopNest.pop_back_val();
+    Loop *NewLoop = new Loop();
+
+    if (!NewParentLoop)
+      NewParentLoop = NewLoop;
+
+    LPM->insertLoop(NewLoop, L->getParentLoop());
+
+    // Clone Basic Blocks.
+    for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+         I != E; ++I) {
+      BasicBlock *BB = *I;
+      BasicBlock *NewBB = CloneBasicBlock(BB, ValueMap, ".clone");
+      ValueMap[BB] = NewBB;
+      if (P)
+        LPM->cloneBasicBlockSimpleAnalysis(BB, NewBB, L);
+      NewLoop->addBasicBlockToLoop(NewBB, LI->getBase());
+      NewBlocks.push_back(NewBB);
+    }
+
+    // Clone dominator info.
+    if (DT)
+      for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+           I != E; ++I) {
+        BasicBlock *BB = *I;
+        CloneDominatorInfo(BB, ValueMap, DT, DF);
+      }
+
+    // Process sub loops
+    for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+      LoopNest.push_back(*I);
+  }
+
+  // Remap instructions to reference operands from ValueMap.
+  for(SmallVector<BasicBlock *, 16>::iterator NBItr = NewBlocks.begin(), 
+        NBE = NewBlocks.end();  NBItr != NBE; ++NBItr) {
+    BasicBlock *NB = *NBItr;
+    for(BasicBlock::iterator BI = NB->begin(), BE = NB->end(); 
+        BI != BE; ++BI) {
+      Instruction *Insn = BI;
+      for (unsigned index = 0, num_ops = Insn->getNumOperands(); 
+           index != num_ops; ++index) {
+        Value *Op = Insn->getOperand(index);
+        DenseMap<const Value *, Value *>::iterator OpItr = ValueMap.find(Op);
+        if (OpItr != ValueMap.end())
+          Insn->setOperand(index, OpItr->second);
+      }
+    }
+  }
+
+  BasicBlock *Latch = OrigL->getLoopLatch();
+  Function *F = Latch->getParent();
+  F->getBasicBlockList().insert(OrigL->getHeader(), 
+                                NewBlocks.begin(), NewBlocks.end());
+
+
+  return NewParentLoop;
+}
diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp
new file mode 100644
index 0000000..337fa8a
--- /dev/null
+++ b/lib/Transforms/Utils/CloneModule.cpp
@@ -0,0 +1,126 @@
+//===- CloneModule.cpp - Clone an entire module ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CloneModule interface which makes a copy of an
+// entire module.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Module.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/TypeSymbolTable.h"
+#include "llvm/Constant.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+using namespace llvm;
+
+/// CloneModule - Return an exact copy of the specified module.  This is not as
+/// easy as it might seem because we have to worry about making copies of global
+/// variables and functions, and making their (initializers and references,
+/// respectively) refer to the right globals.
+///
+Module *llvm::CloneModule(const Module *M) {
+  // Create the value map that maps things from the old module over to the new
+  // module.
+  DenseMap<const Value*, Value*> ValueMap;
+  return CloneModule(M, ValueMap);
+}
+
+Module *llvm::CloneModule(const Module *M,
+                          DenseMap<const Value*, Value*> &ValueMap) {
+  // First off, we need to create the new module...
+  Module *New = new Module(M->getModuleIdentifier());
+  New->setDataLayout(M->getDataLayout());
+  New->setTargetTriple(M->getTargetTriple());
+  New->setModuleInlineAsm(M->getModuleInlineAsm());
+
+  // Copy all of the type symbol table entries over.
+  const TypeSymbolTable &TST = M->getTypeSymbolTable();
+  for (TypeSymbolTable::const_iterator TI = TST.begin(), TE = TST.end(); 
+       TI != TE; ++TI)
+    New->addTypeName(TI->first, TI->second);
+  
+  // Copy all of the dependent libraries over.
+  for (Module::lib_iterator I = M->lib_begin(), E = M->lib_end(); I != E; ++I)
+    New->addLibrary(*I);
+
+  // Loop over all of the global variables, making corresponding globals in the
+  // new module.  Here we add them to the ValueMap and to the new Module.  We
+  // don't worry about attributes or initializers, they will come later.
+  //
+  for (Module::const_global_iterator I = M->global_begin(), E = M->global_end();
+       I != E; ++I) {
+    GlobalVariable *GV = new GlobalVariable(I->getType()->getElementType(),
+                                            false,
+                                            GlobalValue::ExternalLinkage, 0,
+                                            I->getName(), New);
+    GV->setAlignment(I->getAlignment());
+    ValueMap[I] = GV;
+  }
+
+  // Loop over the functions in the module, making external functions as before
+  for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) {
+    Function *NF =
+      Function::Create(cast<FunctionType>(I->getType()->getElementType()),
+                       GlobalValue::ExternalLinkage, I->getName(), New);
+    NF->copyAttributesFrom(I);
+    ValueMap[I] = NF;
+  }
+
+  // Loop over the aliases in the module
+  for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();
+       I != E; ++I)
+    ValueMap[I] = new GlobalAlias(I->getType(), GlobalAlias::ExternalLinkage,
+                                  I->getName(), NULL, New);
+  
+  // Now that all of the things that global variable initializer can refer to
+  // have been created, loop through and copy the global variable referrers
+  // over...  We also set the attributes on the global now.
+  //
+  for (Module::const_global_iterator I = M->global_begin(), E = M->global_end();
+       I != E; ++I) {
+    GlobalVariable *GV = cast<GlobalVariable>(ValueMap[I]);
+    if (I->hasInitializer())
+      GV->setInitializer(cast<Constant>(MapValue(I->getInitializer(),
+                                                 ValueMap)));
+    GV->setLinkage(I->getLinkage());
+    GV->setThreadLocal(I->isThreadLocal());
+    GV->setConstant(I->isConstant());
+  }
+
+  // Similarly, copy over function bodies now...
+  //
+  for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) {
+    Function *F = cast<Function>(ValueMap[I]);
+    if (!I->isDeclaration()) {
+      Function::arg_iterator DestI = F->arg_begin();
+      for (Function::const_arg_iterator J = I->arg_begin(); J != I->arg_end();
+           ++J) {
+        DestI->setName(J->getName());
+        ValueMap[J] = DestI++;
+      }
+
+      std::vector<ReturnInst*> Returns;  // Ignore returns cloned...
+      CloneFunctionInto(F, I, ValueMap, Returns);
+    }
+
+    F->setLinkage(I->getLinkage());
+  }
+
+  // And aliases
+  for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();
+       I != E; ++I) {
+    GlobalAlias *GA = cast<GlobalAlias>(ValueMap[I]);
+    GA->setLinkage(I->getLinkage());
+    if (const Constant* C = I->getAliasee())
+      GA->setAliasee(cast<Constant>(MapValue(C, ValueMap)));
+  }
+  
+  return New;
+}
diff --git a/lib/Transforms/Utils/CloneTrace.cpp b/lib/Transforms/Utils/CloneTrace.cpp
new file mode 100644
index 0000000..0711139
--- /dev/null
+++ b/lib/Transforms/Utils/CloneTrace.cpp
@@ -0,0 +1,119 @@
+//===- CloneTrace.cpp - Clone a trace -------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CloneTrace interface, which is used when writing
+// runtime optimizations. It takes a vector of basic blocks clones the basic
+// blocks, removes internal phi nodes, adds it to the same function as the
+// original (although there is no jump to it) and returns the new vector of
+// basic blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/Trace.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Instructions.h"
+#include "llvm/Function.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+using namespace llvm;
+
+//Clones the trace (a vector of basic blocks)
+std::vector<BasicBlock *>
+llvm::CloneTrace(const std::vector<BasicBlock*> &origTrace) {
+  std::vector<BasicBlock *> clonedTrace;
+  DenseMap<const Value*, Value*> ValueMap;
+
+  //First, loop over all the Basic Blocks in the trace and copy
+  //them using CloneBasicBlock. Also fix the phi nodes during
+  //this loop. To fix the phi nodes, we delete incoming branches
+  //that are not in the trace.
+  for (std::vector<BasicBlock *>::const_iterator T = origTrace.begin(),
+    End = origTrace.end(); T != End; ++T) {
+
+    //Clone Basic Block
+    BasicBlock *clonedBlock =
+      CloneBasicBlock(*T, ValueMap, ".tr", (*T)->getParent());
+
+    //Add it to our new trace
+    clonedTrace.push_back(clonedBlock);
+
+    //Add this new mapping to our Value Map
+    ValueMap[*T] = clonedBlock;
+
+    //Loop over the phi instructions and delete operands
+    //that are from blocks not in the trace
+    //only do this if we are NOT the first block
+    if (T != origTrace.begin()) {
+      for (BasicBlock::iterator I = clonedBlock->begin();
+           isa<PHINode>(I); ++I) {
+        PHINode *PN = cast<PHINode>(I);
+        //get incoming value for the previous BB
+        Value *V = PN->getIncomingValueForBlock(*(T-1));
+        assert(V && "No incoming value from a BasicBlock in our trace!");
+
+        //remap our phi node to point to incoming value
+        ValueMap[*&I] = V;
+
+        //remove phi node
+        clonedBlock->getInstList().erase(PN);
+      }
+    }
+  }
+
+  //Second loop to do the remapping
+  for (std::vector<BasicBlock *>::const_iterator BB = clonedTrace.begin(),
+    BE = clonedTrace.end(); BB != BE; ++BB) {
+    for (BasicBlock::iterator I = (*BB)->begin(); I != (*BB)->end(); ++I) {
+      //Loop over all the operands of the instruction
+      for (unsigned op=0, E = I->getNumOperands(); op != E; ++op) {
+        const Value *Op = I->getOperand(op);
+
+        //Get it out of the value map
+        Value *V = ValueMap[Op];
+
+        //If not in the value map, then its outside our trace so ignore
+        if (V != 0)
+          I->setOperand(op,V);
+      }
+    }
+  }
+
+  //return new vector of basic blocks
+  return clonedTrace;
+}
+
+/// CloneTraceInto - Clone T into NewFunc. Original<->clone mapping is
+/// saved in ValueMap.
+///
+void llvm::CloneTraceInto(Function *NewFunc, Trace &T,
+                          DenseMap<const Value*, Value*> &ValueMap,
+                          const char *NameSuffix) {
+  assert(NameSuffix && "NameSuffix cannot be null!");
+
+  // Loop over all of the basic blocks in the trace, cloning them as
+  // appropriate.
+  //
+  for (Trace::const_iterator BI = T.begin(), BE = T.end(); BI != BE; ++BI) {
+    const BasicBlock *BB = *BI;
+
+    // Create a new basic block and copy instructions into it!
+    BasicBlock *CBB = CloneBasicBlock(BB, ValueMap, NameSuffix, NewFunc);
+    ValueMap[BB] = CBB;                       // Add basic block mapping.
+  }
+
+  // Loop over all of the instructions in the new function, fixing up operand
+  // references as we go.  This uses ValueMap to do all the hard work.
+  //
+  for (Function::iterator BB =
+         cast<BasicBlock>(ValueMap[T.getEntryBasicBlock()]),
+         BE = NewFunc->end(); BB != BE; ++BB)
+    // Loop over all instructions, fixing each one as we find it...
+    for (BasicBlock::iterator II = BB->begin(); II != BB->end(); ++II)
+      RemapInstruction(II, ValueMap);
+}
+
diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
new file mode 100644
index 0000000..6d5904e
--- /dev/null
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -0,0 +1,746 @@
+//===- CodeExtractor.cpp - Pull code region into a new function -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the interface to tear out a code region, such as an
+// individual loop or a parallel section, into a new function, replacing it with
+// a call to the new function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/FunctionUtils.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/StringExtras.h"
+#include <algorithm>
+#include <set>
+using namespace llvm;
+
+// Provide a command-line option to aggregate function arguments into a struct
+// for functions produced by the code extractor. This is useful when converting
+// extracted functions to pthread-based code, as only one argument (void*) can
+// be passed in to pthread_create().
+static cl::opt<bool>
+AggregateArgsOpt("aggregate-extracted-args", cl::Hidden,
+                 cl::desc("Aggregate arguments to code-extracted functions"));
+
+namespace {
+  class VISIBILITY_HIDDEN CodeExtractor {
+    typedef std::vector<Value*> Values;
+    std::set<BasicBlock*> BlocksToExtract;
+    DominatorTree* DT;
+    bool AggregateArgs;
+    unsigned NumExitBlocks;
+    const Type *RetTy;
+  public:
+    CodeExtractor(DominatorTree* dt = 0, bool AggArgs = false)
+      : DT(dt), AggregateArgs(AggArgs||AggregateArgsOpt), NumExitBlocks(~0U) {}
+
+    Function *ExtractCodeRegion(const std::vector<BasicBlock*> &code);
+
+    bool isEligible(const std::vector<BasicBlock*> &code);
+
+  private:
+    /// definedInRegion - Return true if the specified value is defined in the
+    /// extracted region.
+    bool definedInRegion(Value *V) const {
+      if (Instruction *I = dyn_cast<Instruction>(V))
+        if (BlocksToExtract.count(I->getParent()))
+          return true;
+      return false;
+    }
+
+    /// definedInCaller - Return true if the specified value is defined in the
+    /// function being code extracted, but not in the region being extracted.
+    /// These values must be passed in as live-ins to the function.
+    bool definedInCaller(Value *V) const {
+      if (isa<Argument>(V)) return true;
+      if (Instruction *I = dyn_cast<Instruction>(V))
+        if (!BlocksToExtract.count(I->getParent()))
+          return true;
+      return false;
+    }
+
+    void severSplitPHINodes(BasicBlock *&Header);
+    void splitReturnBlocks();
+    void findInputsOutputs(Values &inputs, Values &outputs);
+
+    Function *constructFunction(const Values &inputs,
+                                const Values &outputs,
+                                BasicBlock *header,
+                                BasicBlock *newRootNode, BasicBlock *newHeader,
+                                Function *oldFunction, Module *M);
+
+    void moveCodeToFunction(Function *newFunction);
+
+    void emitCallAndSwitchStatement(Function *newFunction,
+                                    BasicBlock *newHeader,
+                                    Values &inputs,
+                                    Values &outputs);
+
+  };
+}
+
+/// severSplitPHINodes - If a PHI node has multiple inputs from outside of the
+/// region, we need to split the entry block of the region so that the PHI node
+/// is easier to deal with.
+void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) {
+  bool HasPredsFromRegion = false;
+  unsigned NumPredsOutsideRegion = 0;
+
+  if (Header != &Header->getParent()->getEntryBlock()) {
+    PHINode *PN = dyn_cast<PHINode>(Header->begin());
+    if (!PN) return;  // No PHI nodes.
+
+    // If the header node contains any PHI nodes, check to see if there is more
+    // than one entry from outside the region.  If so, we need to sever the
+    // header block into two.
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+      if (BlocksToExtract.count(PN->getIncomingBlock(i)))
+        HasPredsFromRegion = true;
+      else
+        ++NumPredsOutsideRegion;
+
+    // If there is one (or fewer) predecessor from outside the region, we don't
+    // need to do anything special.
+    if (NumPredsOutsideRegion <= 1) return;
+  }
+
+  // Otherwise, we need to split the header block into two pieces: one
+  // containing PHI nodes merging values from outside of the region, and a
+  // second that contains all of the code for the block and merges back any
+  // incoming values from inside of the region.
+  BasicBlock::iterator AfterPHIs = Header->getFirstNonPHI();
+  BasicBlock *NewBB = Header->splitBasicBlock(AfterPHIs,
+                                              Header->getName()+".ce");
+
+  // We only want to code extract the second block now, and it becomes the new
+  // header of the region.
+  BasicBlock *OldPred = Header;
+  BlocksToExtract.erase(OldPred);
+  BlocksToExtract.insert(NewBB);
+  Header = NewBB;
+
+  // Okay, update dominator sets. The blocks that dominate the new one are the
+  // blocks that dominate TIBB plus the new block itself.
+  if (DT)
+    DT->splitBlock(NewBB);
+
+  // Okay, now we need to adjust the PHI nodes and any branches from within the
+  // region to go to the new header block instead of the old header block.
+  if (HasPredsFromRegion) {
+    PHINode *PN = cast<PHINode>(OldPred->begin());
+    // Loop over all of the predecessors of OldPred that are in the region,
+    // changing them to branch to NewBB instead.
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+      if (BlocksToExtract.count(PN->getIncomingBlock(i))) {
+        TerminatorInst *TI = PN->getIncomingBlock(i)->getTerminator();
+        TI->replaceUsesOfWith(OldPred, NewBB);
+      }
+
+    // Okay, everthing within the region is now branching to the right block, we
+    // just have to update the PHI nodes now, inserting PHI nodes into NewBB.
+    for (AfterPHIs = OldPred->begin(); isa<PHINode>(AfterPHIs); ++AfterPHIs) {
+      PHINode *PN = cast<PHINode>(AfterPHIs);
+      // Create a new PHI node in the new region, which has an incoming value
+      // from OldPred of PN.
+      PHINode *NewPN = PHINode::Create(PN->getType(), PN->getName()+".ce",
+                                       NewBB->begin());
+      NewPN->addIncoming(PN, OldPred);
+
+      // Loop over all of the incoming value in PN, moving them to NewPN if they
+      // are from the extracted region.
+      for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) {
+        if (BlocksToExtract.count(PN->getIncomingBlock(i))) {
+          NewPN->addIncoming(PN->getIncomingValue(i), PN->getIncomingBlock(i));
+          PN->removeIncomingValue(i);
+          --i;
+        }
+      }
+    }
+  }
+}
+
+void CodeExtractor::splitReturnBlocks() {
+  for (std::set<BasicBlock*>::iterator I = BlocksToExtract.begin(),
+         E = BlocksToExtract.end(); I != E; ++I)
+    if (ReturnInst *RI = dyn_cast<ReturnInst>((*I)->getTerminator()))
+      (*I)->splitBasicBlock(RI, (*I)->getName()+".ret");
+}
+
+// findInputsOutputs - Find inputs to, outputs from the code region.
+//
+void CodeExtractor::findInputsOutputs(Values &inputs, Values &outputs) {
+  std::set<BasicBlock*> ExitBlocks;
+  for (std::set<BasicBlock*>::const_iterator ci = BlocksToExtract.begin(),
+       ce = BlocksToExtract.end(); ci != ce; ++ci) {
+    BasicBlock *BB = *ci;
+
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+      // If a used value is defined outside the region, it's an input.  If an
+      // instruction is used outside the region, it's an output.
+      for (User::op_iterator O = I->op_begin(), E = I->op_end(); O != E; ++O)
+        if (definedInCaller(*O))
+          inputs.push_back(*O);
+
+      // Consider uses of this instruction (outputs).
+      for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+           UI != E; ++UI)
+        if (!definedInRegion(*UI)) {
+          outputs.push_back(I);
+          break;
+        }
+    } // for: insts
+
+    // Keep track of the exit blocks from the region.
+    TerminatorInst *TI = BB->getTerminator();
+    for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+      if (!BlocksToExtract.count(TI->getSuccessor(i)))
+        ExitBlocks.insert(TI->getSuccessor(i));
+  } // for: basic blocks
+
+  NumExitBlocks = ExitBlocks.size();
+
+  // Eliminate duplicates.
+  std::sort(inputs.begin(), inputs.end());
+  inputs.erase(std::unique(inputs.begin(), inputs.end()), inputs.end());
+  std::sort(outputs.begin(), outputs.end());
+  outputs.erase(std::unique(outputs.begin(), outputs.end()), outputs.end());
+}
+
+/// constructFunction - make a function based on inputs and outputs, as follows:
+/// f(in0, ..., inN, out0, ..., outN)
+///
+Function *CodeExtractor::constructFunction(const Values &inputs,
+                                           const Values &outputs,
+                                           BasicBlock *header,
+                                           BasicBlock *newRootNode,
+                                           BasicBlock *newHeader,
+                                           Function *oldFunction,
+                                           Module *M) {
+  DOUT << "inputs: " << inputs.size() << "\n";
+  DOUT << "outputs: " << outputs.size() << "\n";
+
+  // This function returns unsigned, outputs will go back by reference.
+  switch (NumExitBlocks) {
+  case 0:
+  case 1: RetTy = Type::VoidTy; break;
+  case 2: RetTy = Type::Int1Ty; break;
+  default: RetTy = Type::Int16Ty; break;
+  }
+
+  std::vector<const Type*> paramTy;
+
+  // Add the types of the input values to the function's argument list
+  for (Values::const_iterator i = inputs.begin(),
+         e = inputs.end(); i != e; ++i) {
+    const Value *value = *i;
+    DOUT << "value used in func: " << *value << "\n";
+    paramTy.push_back(value->getType());
+  }
+
+  // Add the types of the output values to the function's argument list.
+  for (Values::const_iterator I = outputs.begin(), E = outputs.end();
+       I != E; ++I) {
+    DOUT << "instr used in func: " << **I << "\n";
+    if (AggregateArgs)
+      paramTy.push_back((*I)->getType());
+    else
+      paramTy.push_back(PointerType::getUnqual((*I)->getType()));
+  }
+
+  DOUT << "Function type: " << *RetTy << " f(";
+  for (std::vector<const Type*>::iterator i = paramTy.begin(),
+         e = paramTy.end(); i != e; ++i)
+    DOUT << **i << ", ";
+  DOUT << ")\n";
+
+  if (AggregateArgs && (inputs.size() + outputs.size() > 0)) {
+    PointerType *StructPtr = PointerType::getUnqual(StructType::get(paramTy));
+    paramTy.clear();
+    paramTy.push_back(StructPtr);
+  }
+  const FunctionType *funcType = FunctionType::get(RetTy, paramTy, false);
+
+  // Create the new function
+  Function *newFunction = Function::Create(funcType,
+                                           GlobalValue::InternalLinkage,
+                                           oldFunction->getName() + "_" +
+                                           header->getName(), M);
+  // If the old function is no-throw, so is the new one.
+  if (oldFunction->doesNotThrow())
+    newFunction->setDoesNotThrow(true);
+  
+  newFunction->getBasicBlockList().push_back(newRootNode);
+
+  // Create an iterator to name all of the arguments we inserted.
+  Function::arg_iterator AI = newFunction->arg_begin();
+
+  // Rewrite all users of the inputs in the extracted region to use the
+  // arguments (or appropriate addressing into struct) instead.
+  for (unsigned i = 0, e = inputs.size(); i != e; ++i) {
+    Value *RewriteVal;
+    if (AggregateArgs) {
+      Value *Idx[2];
+      Idx[0] = Constant::getNullValue(Type::Int32Ty);
+      Idx[1] = ConstantInt::get(Type::Int32Ty, i);
+      std::string GEPname = "gep_" + inputs[i]->getName();
+      TerminatorInst *TI = newFunction->begin()->getTerminator();
+      GetElementPtrInst *GEP = GetElementPtrInst::Create(AI, Idx, Idx+2, 
+                                                         GEPname, TI);
+      RewriteVal = new LoadInst(GEP, "load" + GEPname, TI);
+    } else
+      RewriteVal = AI++;
+
+    std::vector<User*> Users(inputs[i]->use_begin(), inputs[i]->use_end());
+    for (std::vector<User*>::iterator use = Users.begin(), useE = Users.end();
+         use != useE; ++use)
+      if (Instruction* inst = dyn_cast<Instruction>(*use))
+        if (BlocksToExtract.count(inst->getParent()))
+          inst->replaceUsesOfWith(inputs[i], RewriteVal);
+  }
+
+  // Set names for input and output arguments.
+  if (!AggregateArgs) {
+    AI = newFunction->arg_begin();
+    for (unsigned i = 0, e = inputs.size(); i != e; ++i, ++AI)
+      AI->setName(inputs[i]->getName());
+    for (unsigned i = 0, e = outputs.size(); i != e; ++i, ++AI)
+      AI->setName(outputs[i]->getName()+".out");
+  }
+
+  // Rewrite branches to basic blocks outside of the loop to new dummy blocks
+  // within the new function. This must be done before we lose track of which
+  // blocks were originally in the code region.
+  std::vector<User*> Users(header->use_begin(), header->use_end());
+  for (unsigned i = 0, e = Users.size(); i != e; ++i)
+    // The BasicBlock which contains the branch is not in the region
+    // modify the branch target to a new block
+    if (TerminatorInst *TI = dyn_cast<TerminatorInst>(Users[i]))
+      if (!BlocksToExtract.count(TI->getParent()) &&
+          TI->getParent()->getParent() == oldFunction)
+        TI->replaceUsesOfWith(header, newHeader);
+
+  return newFunction;
+}
+
+/// emitCallAndSwitchStatement - This method sets up the caller side by adding
+/// the call instruction, splitting any PHI nodes in the header block as
+/// necessary.
+void CodeExtractor::
+emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
+                           Values &inputs, Values &outputs) {
+  // Emit a call to the new function, passing in: *pointer to struct (if
+  // aggregating parameters), or plan inputs and allocated memory for outputs
+  std::vector<Value*> params, StructValues, ReloadOutputs;
+
+  // Add inputs as params, or to be filled into the struct
+  for (Values::iterator i = inputs.begin(), e = inputs.end(); i != e; ++i)
+    if (AggregateArgs)
+      StructValues.push_back(*i);
+    else
+      params.push_back(*i);
+
+  // Create allocas for the outputs
+  for (Values::iterator i = outputs.begin(), e = outputs.end(); i != e; ++i) {
+    if (AggregateArgs) {
+      StructValues.push_back(*i);
+    } else {
+      AllocaInst *alloca =
+        new AllocaInst((*i)->getType(), 0, (*i)->getName()+".loc",
+                       codeReplacer->getParent()->begin()->begin());
+      ReloadOutputs.push_back(alloca);
+      params.push_back(alloca);
+    }
+  }
+
+  AllocaInst *Struct = 0;
+  if (AggregateArgs && (inputs.size() + outputs.size() > 0)) {
+    std::vector<const Type*> ArgTypes;
+    for (Values::iterator v = StructValues.begin(),
+           ve = StructValues.end(); v != ve; ++v)
+      ArgTypes.push_back((*v)->getType());
+
+    // Allocate a struct at the beginning of this function
+    Type *StructArgTy = StructType::get(ArgTypes);
+    Struct =
+      new AllocaInst(StructArgTy, 0, "structArg",
+                     codeReplacer->getParent()->begin()->begin());
+    params.push_back(Struct);
+
+    for (unsigned i = 0, e = inputs.size(); i != e; ++i) {
+      Value *Idx[2];
+      Idx[0] = Constant::getNullValue(Type::Int32Ty);
+      Idx[1] = ConstantInt::get(Type::Int32Ty, i);
+      GetElementPtrInst *GEP =
+        GetElementPtrInst::Create(Struct, Idx, Idx + 2,
+                                  "gep_" + StructValues[i]->getName());
+      codeReplacer->getInstList().push_back(GEP);
+      StoreInst *SI = new StoreInst(StructValues[i], GEP);
+      codeReplacer->getInstList().push_back(SI);
+    }
+  }
+
+  // Emit the call to the function
+  CallInst *call = CallInst::Create(newFunction, params.begin(), params.end(),
+                                    NumExitBlocks > 1 ? "targetBlock" : "");
+  codeReplacer->getInstList().push_back(call);
+
+  Function::arg_iterator OutputArgBegin = newFunction->arg_begin();
+  unsigned FirstOut = inputs.size();
+  if (!AggregateArgs)
+    std::advance(OutputArgBegin, inputs.size());
+
+  // Reload the outputs passed in by reference
+  for (unsigned i = 0, e = outputs.size(); i != e; ++i) {
+    Value *Output = 0;
+    if (AggregateArgs) {
+      Value *Idx[2];
+      Idx[0] = Constant::getNullValue(Type::Int32Ty);
+      Idx[1] = ConstantInt::get(Type::Int32Ty, FirstOut + i);
+      GetElementPtrInst *GEP
+        = GetElementPtrInst::Create(Struct, Idx, Idx + 2,
+                                    "gep_reload_" + outputs[i]->getName());
+      codeReplacer->getInstList().push_back(GEP);
+      Output = GEP;
+    } else {
+      Output = ReloadOutputs[i];
+    }
+    LoadInst *load = new LoadInst(Output, outputs[i]->getName()+".reload");
+    codeReplacer->getInstList().push_back(load);
+    std::vector<User*> Users(outputs[i]->use_begin(), outputs[i]->use_end());
+    for (unsigned u = 0, e = Users.size(); u != e; ++u) {
+      Instruction *inst = cast<Instruction>(Users[u]);
+      if (!BlocksToExtract.count(inst->getParent()))
+        inst->replaceUsesOfWith(outputs[i], load);
+    }
+  }
+
+  // Now we can emit a switch statement using the call as a value.
+  SwitchInst *TheSwitch =
+      SwitchInst::Create(ConstantInt::getNullValue(Type::Int16Ty),
+                         codeReplacer, 0, codeReplacer);
+
+  // Since there may be multiple exits from the original region, make the new
+  // function return an unsigned, switch on that number.  This loop iterates
+  // over all of the blocks in the extracted region, updating any terminator
+  // instructions in the to-be-extracted region that branch to blocks that are
+  // not in the region to be extracted.
+  std::map<BasicBlock*, BasicBlock*> ExitBlockMap;
+
+  unsigned switchVal = 0;
+  for (std::set<BasicBlock*>::const_iterator i = BlocksToExtract.begin(),
+         e = BlocksToExtract.end(); i != e; ++i) {
+    TerminatorInst *TI = (*i)->getTerminator();
+    for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+      if (!BlocksToExtract.count(TI->getSuccessor(i))) {
+        BasicBlock *OldTarget = TI->getSuccessor(i);
+        // add a new basic block which returns the appropriate value
+        BasicBlock *&NewTarget = ExitBlockMap[OldTarget];
+        if (!NewTarget) {
+          // If we don't already have an exit stub for this non-extracted
+          // destination, create one now!
+          NewTarget = BasicBlock::Create(OldTarget->getName() + ".exitStub",
+                                         newFunction);
+          unsigned SuccNum = switchVal++;
+
+          Value *brVal = 0;
+          switch (NumExitBlocks) {
+          case 0:
+          case 1: break;  // No value needed.
+          case 2:         // Conditional branch, return a bool
+            brVal = ConstantInt::get(Type::Int1Ty, !SuccNum);
+            break;
+          default:
+            brVal = ConstantInt::get(Type::Int16Ty, SuccNum);
+            break;
+          }
+
+          ReturnInst *NTRet = ReturnInst::Create(brVal, NewTarget);
+
+          // Update the switch instruction.
+          TheSwitch->addCase(ConstantInt::get(Type::Int16Ty, SuccNum),
+                             OldTarget);
+
+          // Restore values just before we exit
+          Function::arg_iterator OAI = OutputArgBegin;
+          for (unsigned out = 0, e = outputs.size(); out != e; ++out) {
+            // For an invoke, the normal destination is the only one that is
+            // dominated by the result of the invocation
+            BasicBlock *DefBlock = cast<Instruction>(outputs[out])->getParent();
+
+            bool DominatesDef = true;
+
+            if (InvokeInst *Invoke = dyn_cast<InvokeInst>(outputs[out])) {
+              DefBlock = Invoke->getNormalDest();
+
+              // Make sure we are looking at the original successor block, not
+              // at a newly inserted exit block, which won't be in the dominator
+              // info.
+              for (std::map<BasicBlock*, BasicBlock*>::iterator I =
+                     ExitBlockMap.begin(), E = ExitBlockMap.end(); I != E; ++I)
+                if (DefBlock == I->second) {
+                  DefBlock = I->first;
+                  break;
+                }
+
+              // In the extract block case, if the block we are extracting ends
+              // with an invoke instruction, make sure that we don't emit a
+              // store of the invoke value for the unwind block.
+              if (!DT && DefBlock != OldTarget)
+                DominatesDef = false;
+            }
+
+            if (DT)
+              DominatesDef = DT->dominates(DefBlock, OldTarget);
+
+            if (DominatesDef) {
+              if (AggregateArgs) {
+                Value *Idx[2];
+                Idx[0] = Constant::getNullValue(Type::Int32Ty);
+                Idx[1] = ConstantInt::get(Type::Int32Ty,FirstOut+out);
+                GetElementPtrInst *GEP =
+                  GetElementPtrInst::Create(OAI, Idx, Idx + 2,
+                                            "gep_" + outputs[out]->getName(),
+                                            NTRet);
+                new StoreInst(outputs[out], GEP, NTRet);
+              } else {
+                new StoreInst(outputs[out], OAI, NTRet);
+              }
+            }
+            // Advance output iterator even if we don't emit a store
+            if (!AggregateArgs) ++OAI;
+          }
+        }
+
+        // rewrite the original branch instruction with this new target
+        TI->setSuccessor(i, NewTarget);
+      }
+  }
+
+  // Now that we've done the deed, simplify the switch instruction.
+  const Type *OldFnRetTy = TheSwitch->getParent()->getParent()->getReturnType();
+  switch (NumExitBlocks) {
+  case 0:
+    // There are no successors (the block containing the switch itself), which
+    // means that previously this was the last part of the function, and hence
+    // this should be rewritten as a `ret'
+
+    // Check if the function should return a value
+    if (OldFnRetTy == Type::VoidTy) {
+      ReturnInst::Create(0, TheSwitch);  // Return void
+    } else if (OldFnRetTy == TheSwitch->getCondition()->getType()) {
+      // return what we have
+      ReturnInst::Create(TheSwitch->getCondition(), TheSwitch);
+    } else {
+      // Otherwise we must have code extracted an unwind or something, just
+      // return whatever we want.
+      ReturnInst::Create(Constant::getNullValue(OldFnRetTy), TheSwitch);
+    }
+
+    TheSwitch->eraseFromParent();
+    break;
+  case 1:
+    // Only a single destination, change the switch into an unconditional
+    // branch.
+    BranchInst::Create(TheSwitch->getSuccessor(1), TheSwitch);
+    TheSwitch->eraseFromParent();
+    break;
+  case 2:
+    BranchInst::Create(TheSwitch->getSuccessor(1), TheSwitch->getSuccessor(2),
+                       call, TheSwitch);
+    TheSwitch->eraseFromParent();
+    break;
+  default:
+    // Otherwise, make the default destination of the switch instruction be one
+    // of the other successors.
+    TheSwitch->setOperand(0, call);
+    TheSwitch->setSuccessor(0, TheSwitch->getSuccessor(NumExitBlocks));
+    TheSwitch->removeCase(NumExitBlocks);  // Remove redundant case
+    break;
+  }
+}
+
+void CodeExtractor::moveCodeToFunction(Function *newFunction) {
+  Function *oldFunc = (*BlocksToExtract.begin())->getParent();
+  Function::BasicBlockListType &oldBlocks = oldFunc->getBasicBlockList();
+  Function::BasicBlockListType &newBlocks = newFunction->getBasicBlockList();
+
+  for (std::set<BasicBlock*>::const_iterator i = BlocksToExtract.begin(),
+         e = BlocksToExtract.end(); i != e; ++i) {
+    // Delete the basic block from the old function, and the list of blocks
+    oldBlocks.remove(*i);
+
+    // Insert this basic block into the new function
+    newBlocks.push_back(*i);
+  }
+}
+
+/// ExtractRegion - Removes a loop from a function, replaces it with a call to
+/// new function. Returns pointer to the new function.
+///
+/// algorithm:
+///
+/// find inputs and outputs for the region
+///
+/// for inputs: add to function as args, map input instr* to arg#
+/// for outputs: add allocas for scalars,
+///             add to func as args, map output instr* to arg#
+///
+/// rewrite func to use argument #s instead of instr*
+///
+/// for each scalar output in the function: at every exit, store intermediate
+/// computed result back into memory.
+///
+Function *CodeExtractor::
+ExtractCodeRegion(const std::vector<BasicBlock*> &code) {
+  if (!isEligible(code))
+    return 0;
+
+  // 1) Find inputs, outputs
+  // 2) Construct new function
+  //  * Add allocas for defs, pass as args by reference
+  //  * Pass in uses as args
+  // 3) Move code region, add call instr to func
+  //
+  BlocksToExtract.insert(code.begin(), code.end());
+
+  Values inputs, outputs;
+
+  // Assumption: this is a single-entry code region, and the header is the first
+  // block in the region.
+  BasicBlock *header = code[0];
+
+  for (unsigned i = 1, e = code.size(); i != e; ++i)
+    for (pred_iterator PI = pred_begin(code[i]), E = pred_end(code[i]);
+         PI != E; ++PI)
+      assert(BlocksToExtract.count(*PI) &&
+             "No blocks in this region may have entries from outside the region"
+             " except for the first block!");
+
+  // If we have to split PHI nodes or the entry block, do so now.
+  severSplitPHINodes(header);
+
+  // If we have any return instructions in the region, split those blocks so
+  // that the return is not in the region.
+  splitReturnBlocks();
+
+  Function *oldFunction = header->getParent();
+
+  // This takes place of the original loop
+  BasicBlock *codeReplacer = BasicBlock::Create("codeRepl", oldFunction,
+                                                header);
+
+  // The new function needs a root node because other nodes can branch to the
+  // head of the region, but the entry node of a function cannot have preds.
+  BasicBlock *newFuncRoot = BasicBlock::Create("newFuncRoot");
+  newFuncRoot->getInstList().push_back(BranchInst::Create(header));
+
+  // Find inputs to, outputs from the code region.
+  findInputsOutputs(inputs, outputs);
+
+  // Construct new function based on inputs/outputs & add allocas for all defs.
+  Function *newFunction = constructFunction(inputs, outputs, header,
+                                            newFuncRoot,
+                                            codeReplacer, oldFunction,
+                                            oldFunction->getParent());
+
+  emitCallAndSwitchStatement(newFunction, codeReplacer, inputs, outputs);
+
+  moveCodeToFunction(newFunction);
+
+  // Loop over all of the PHI nodes in the header block, and change any
+  // references to the old incoming edge to be the new incoming edge.
+  for (BasicBlock::iterator I = header->begin(); isa<PHINode>(I); ++I) {
+    PHINode *PN = cast<PHINode>(I);
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+      if (!BlocksToExtract.count(PN->getIncomingBlock(i)))
+        PN->setIncomingBlock(i, newFuncRoot);
+  }
+
+  // Look at all successors of the codeReplacer block.  If any of these blocks
+  // had PHI nodes in them, we need to update the "from" block to be the code
+  // replacer, not the original block in the extracted region.
+  std::vector<BasicBlock*> Succs(succ_begin(codeReplacer),
+                                 succ_end(codeReplacer));
+  for (unsigned i = 0, e = Succs.size(); i != e; ++i)
+    for (BasicBlock::iterator I = Succs[i]->begin(); isa<PHINode>(I); ++I) {
+      PHINode *PN = cast<PHINode>(I);
+      std::set<BasicBlock*> ProcessedPreds;
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+        if (BlocksToExtract.count(PN->getIncomingBlock(i))) {
+          if (ProcessedPreds.insert(PN->getIncomingBlock(i)).second)
+            PN->setIncomingBlock(i, codeReplacer);
+          else {
+            // There were multiple entries in the PHI for this block, now there
+            // is only one, so remove the duplicated entries.
+            PN->removeIncomingValue(i, false);
+            --i; --e;
+          }
+        }
+    }
+
+  //cerr << "NEW FUNCTION: " << *newFunction;
+  //  verifyFunction(*newFunction);
+
+  //  cerr << "OLD FUNCTION: " << *oldFunction;
+  //  verifyFunction(*oldFunction);
+
+  DEBUG(if (verifyFunction(*newFunction)) abort());
+  return newFunction;
+}
+
+bool CodeExtractor::isEligible(const std::vector<BasicBlock*> &code) {
+  // Deny code region if it contains allocas or vastarts.
+  for (std::vector<BasicBlock*>::const_iterator BB = code.begin(), e=code.end();
+       BB != e; ++BB)
+    for (BasicBlock::const_iterator I = (*BB)->begin(), Ie = (*BB)->end();
+         I != Ie; ++I)
+      if (isa<AllocaInst>(*I))
+        return false;
+      else if (const CallInst *CI = dyn_cast<CallInst>(I))
+        if (const Function *F = CI->getCalledFunction())
+          if (F->getIntrinsicID() == Intrinsic::vastart)
+            return false;
+  return true;
+}
+
+
+/// ExtractCodeRegion - slurp a sequence of basic blocks into a brand new
+/// function
+///
+Function* llvm::ExtractCodeRegion(DominatorTree &DT,
+                                  const std::vector<BasicBlock*> &code,
+                                  bool AggregateArgs) {
+  return CodeExtractor(&DT, AggregateArgs).ExtractCodeRegion(code);
+}
+
+/// ExtractBasicBlock - slurp a natural loop into a brand new function
+///
+Function* llvm::ExtractLoop(DominatorTree &DT, Loop *L, bool AggregateArgs) {
+  return CodeExtractor(&DT, AggregateArgs).ExtractCodeRegion(L->getBlocks());
+}
+
+/// ExtractBasicBlock - slurp a basic block into a brand new function
+///
+Function* llvm::ExtractBasicBlock(BasicBlock *BB, bool AggregateArgs) {
+  std::vector<BasicBlock*> Blocks;
+  Blocks.push_back(BB);
+  return CodeExtractor(0, AggregateArgs).ExtractCodeRegion(Blocks);
+}
diff --git a/lib/Transforms/Utils/DemoteRegToStack.cpp b/lib/Transforms/Utils/DemoteRegToStack.cpp
new file mode 100644
index 0000000..b8dd754
--- /dev/null
+++ b/lib/Transforms/Utils/DemoteRegToStack.cpp
@@ -0,0 +1,144 @@
+//===- DemoteRegToStack.cpp - Move a virtual register to the stack --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provide the function DemoteRegToStack().  This function takes a
+// virtual register computed by an Instruction and replaces it with a slot in
+// the stack frame, allocated via alloca. It returns the pointer to the
+// AllocaInst inserted.  After this function is called on an instruction, we are
+// guaranteed that the only user of the instruction is a store that is
+// immediately after it.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Type.h"
+#include <map>
+using namespace llvm;
+
+/// DemoteRegToStack - This function takes a virtual register computed by an
+/// Instruction and replaces it with a slot in the stack frame, allocated via
+/// alloca.  This allows the CFG to be changed around without fear of
+/// invalidating the SSA information for the value.  It returns the pointer to
+/// the alloca inserted to create a stack slot for I.
+///
+AllocaInst* llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads,
+                                   Instruction *AllocaPoint) {
+  if (I.use_empty()) {
+    I.eraseFromParent();
+    return 0;
+  }
+  
+  // Create a stack slot to hold the value.
+  AllocaInst *Slot;
+  if (AllocaPoint) {
+    Slot = new AllocaInst(I.getType(), 0, I.getName()+".reg2mem", AllocaPoint);
+  } else {
+    Function *F = I.getParent()->getParent();
+    Slot = new AllocaInst(I.getType(), 0, I.getName()+".reg2mem",
+                          F->getEntryBlock().begin());
+  }
+  
+  // Change all of the users of the instruction to read from the stack slot
+  // instead.
+  while (!I.use_empty()) {
+    Instruction *U = cast<Instruction>(I.use_back());
+    if (PHINode *PN = dyn_cast<PHINode>(U)) {
+      // If this is a PHI node, we can't insert a load of the value before the
+      // use.  Instead, insert the load in the predecessor block corresponding
+      // to the incoming value.
+      //
+      // Note that if there are multiple edges from a basic block to this PHI
+      // node that we cannot multiple loads.  The problem is that the resultant
+      // PHI node will have multiple values (from each load) coming in from the
+      // same block, which is illegal SSA form.  For this reason, we keep track
+      // and reuse loads we insert.
+      std::map<BasicBlock*, Value*> Loads;
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+        if (PN->getIncomingValue(i) == &I) {
+          Value *&V = Loads[PN->getIncomingBlock(i)];
+          if (V == 0) {
+            // Insert the load into the predecessor block
+            V = new LoadInst(Slot, I.getName()+".reload", VolatileLoads, 
+                             PN->getIncomingBlock(i)->getTerminator());
+          }
+          PN->setIncomingValue(i, V);
+        }
+
+    } else {
+      // If this is a normal instruction, just insert a load.
+      Value *V = new LoadInst(Slot, I.getName()+".reload", VolatileLoads, U);
+      U->replaceUsesOfWith(&I, V);
+    }
+  }
+
+
+  // Insert stores of the computed value into the stack slot.  We have to be
+  // careful is I is an invoke instruction though, because we can't insert the
+  // store AFTER the terminator instruction.
+  BasicBlock::iterator InsertPt;
+  if (!isa<TerminatorInst>(I)) {
+    InsertPt = &I;
+    ++InsertPt;
+  } else {
+    // We cannot demote invoke instructions to the stack if their normal edge
+    // is critical.
+    InvokeInst &II = cast<InvokeInst>(I);
+    assert(II.getNormalDest()->getSinglePredecessor() &&
+           "Cannot demote invoke with a critical successor!");
+    InsertPt = II.getNormalDest()->begin();
+  }
+
+  for (; isa<PHINode>(InsertPt); ++InsertPt)
+  /* empty */;   // Don't insert before any PHI nodes.
+  new StoreInst(&I, Slot, InsertPt);
+
+  return Slot;
+}
+
+
+/// DemotePHIToStack - This function takes a virtual register computed by a phi
+/// node and replaces it with a slot in the stack frame, allocated via alloca.
+/// The phi node is deleted and it returns the pointer to the alloca inserted.
+AllocaInst* llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) {
+  if (P->use_empty()) {
+    P->eraseFromParent();    
+    return 0;                
+  }
+
+  // Create a stack slot to hold the value.
+  AllocaInst *Slot;
+  if (AllocaPoint) {
+    Slot = new AllocaInst(P->getType(), 0, P->getName()+".reg2mem", AllocaPoint);
+  } else {
+    Function *F = P->getParent()->getParent();
+    Slot = new AllocaInst(P->getType(), 0, P->getName()+".reg2mem",
+                          F->getEntryBlock().begin());
+  }
+  
+  // Iterate over each operand, insert store in each predecessor.
+  for (unsigned i = 0, e = P->getNumIncomingValues(); i < e; ++i) {
+    if (InvokeInst *II = dyn_cast<InvokeInst>(P->getIncomingValue(i))) {
+      assert(II->getParent() != P->getIncomingBlock(i) && 
+             "Invoke edge not supported yet"); II=II;
+    }
+    new StoreInst(P->getIncomingValue(i), Slot, 
+                  P->getIncomingBlock(i)->getTerminator());
+  }
+  
+  // Insert load in place of the phi and replace all uses.
+  Value *V = new LoadInst(Slot, P->getName()+".reload", P);
+  P->replaceAllUsesWith(V);
+  
+  // Delete phi.
+  P->eraseFromParent();
+  
+  return Slot;
+}
diff --git a/lib/Transforms/Utils/InlineCost.cpp b/lib/Transforms/Utils/InlineCost.cpp
new file mode 100644
index 0000000..87aff01
--- /dev/null
+++ b/lib/Transforms/Utils/InlineCost.cpp
@@ -0,0 +1,315 @@
+//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements inline cost analysis.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "llvm/Transforms/Utils/InlineCost.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/CallingConv.h"
+#include "llvm/IntrinsicInst.h"
+
+using namespace llvm;
+
+// CountCodeReductionForConstant - Figure out an approximation for how many
+// instructions will be constant folded if the specified value is constant.
+//
+unsigned InlineCostAnalyzer::FunctionInfo::
+         CountCodeReductionForConstant(Value *V) {
+  unsigned Reduction = 0;
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; ++UI)
+    if (isa<BranchInst>(*UI))
+      Reduction += 40;          // Eliminating a conditional branch is a big win
+    else if (SwitchInst *SI = dyn_cast<SwitchInst>(*UI))
+      // Eliminating a switch is a big win, proportional to the number of edges
+      // deleted.
+      Reduction += (SI->getNumSuccessors()-1) * 40;
+    else if (CallInst *CI = dyn_cast<CallInst>(*UI)) {
+      // Turning an indirect call into a direct call is a BIG win
+      Reduction += CI->getCalledValue() == V ? 500 : 0;
+    } else if (InvokeInst *II = dyn_cast<InvokeInst>(*UI)) {
+      // Turning an indirect call into a direct call is a BIG win
+      Reduction += II->getCalledValue() == V ? 500 : 0;
+    } else {
+      // Figure out if this instruction will be removed due to simple constant
+      // propagation.
+      Instruction &Inst = cast<Instruction>(**UI);
+      bool AllOperandsConstant = true;
+      for (unsigned i = 0, e = Inst.getNumOperands(); i != e; ++i)
+        if (!isa<Constant>(Inst.getOperand(i)) && Inst.getOperand(i) != V) {
+          AllOperandsConstant = false;
+          break;
+        }
+
+      if (AllOperandsConstant) {
+        // We will get to remove this instruction...
+        Reduction += 7;
+
+        // And any other instructions that use it which become constants
+        // themselves.
+        Reduction += CountCodeReductionForConstant(&Inst);
+      }
+    }
+
+  return Reduction;
+}
+
+// CountCodeReductionForAlloca - Figure out an approximation of how much smaller
+// the function will be if it is inlined into a context where an argument
+// becomes an alloca.
+//
+unsigned InlineCostAnalyzer::FunctionInfo::
+         CountCodeReductionForAlloca(Value *V) {
+  if (!isa<PointerType>(V->getType())) return 0;  // Not a pointer
+  unsigned Reduction = 0;
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;++UI){
+    Instruction *I = cast<Instruction>(*UI);
+    if (isa<LoadInst>(I) || isa<StoreInst>(I))
+      Reduction += 10;
+    else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
+      // If the GEP has variable indices, we won't be able to do much with it.
+      if (!GEP->hasAllConstantIndices())
+        Reduction += CountCodeReductionForAlloca(GEP)+15;
+    } else {
+      // If there is some other strange instruction, we're not going to be able
+      // to do much if we inline this.
+      return 0;
+    }
+  }
+
+  return Reduction;
+}
+
+/// analyzeFunction - Fill in the current structure with information gleaned
+/// from the specified function.
+void InlineCostAnalyzer::FunctionInfo::analyzeFunction(Function *F) {
+  unsigned NumInsts = 0, NumBlocks = 0, NumVectorInsts = 0;
+
+  // Look at the size of the callee.  Each basic block counts as 20 units, and
+  // each instruction counts as 5.
+  for (Function::const_iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+    for (BasicBlock::const_iterator II = BB->begin(), E = BB->end();
+         II != E; ++II) {
+      if (isa<PHINode>(II)) continue;           // PHI nodes don't count.
+
+      // Special handling for calls.
+      if (isa<CallInst>(II) || isa<InvokeInst>(II)) {
+        if (isa<DbgInfoIntrinsic>(II))
+          continue;  // Debug intrinsics don't count as size.
+        
+        CallSite CS = CallSite::get(const_cast<Instruction*>(&*II));
+        
+        // If this function contains a call to setjmp or _setjmp, never inline
+        // it.  This is a hack because we depend on the user marking their local
+        // variables as volatile if they are live across a setjmp call, and they
+        // probably won't do this in callers.
+        if (Function *F = CS.getCalledFunction())
+          if (F->isDeclaration() && 
+              (F->isName("setjmp") || F->isName("_setjmp"))) {
+            NeverInline = true;
+            return;
+          }
+
+        // Calls often compile into many machine instructions.  Bump up their
+        // cost to reflect this.
+        if (!isa<IntrinsicInst>(II))
+          NumInsts += 5;
+      }
+      
+      if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) {
+        if (!AI->isStaticAlloca())
+          this->usesDynamicAlloca = true;
+      }
+
+      if (isa<ExtractElementInst>(II) || isa<VectorType>(II->getType()))
+        ++NumVectorInsts; 
+      
+      // Noop casts, including ptr <-> int,  don't count.
+      if (const CastInst *CI = dyn_cast<CastInst>(II)) {
+        if (CI->isLosslessCast() || isa<IntToPtrInst>(CI) || 
+            isa<PtrToIntInst>(CI))
+          continue;
+      } else if (const GetElementPtrInst *GEPI =
+                 dyn_cast<GetElementPtrInst>(II)) {
+        // If a GEP has all constant indices, it will probably be folded with
+        // a load/store.
+        if (GEPI->hasAllConstantIndices())
+          continue;
+      }
+      
+      ++NumInsts;
+    }
+
+    ++NumBlocks;
+  }
+
+  this->NumBlocks      = NumBlocks;
+  this->NumInsts       = NumInsts;
+  this->NumVectorInsts = NumVectorInsts;
+
+  // Check out all of the arguments to the function, figuring out how much
+  // code can be eliminated if one of the arguments is a constant.
+  for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I)
+    ArgumentWeights.push_back(ArgInfo(CountCodeReductionForConstant(I),
+                                      CountCodeReductionForAlloca(I)));
+}
+
+
+
+// getInlineCost - The heuristic used to determine if we should inline the
+// function call or not.
+//
+InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS,
+                               SmallPtrSet<const Function *, 16> &NeverInline) {
+  Instruction *TheCall = CS.getInstruction();
+  Function *Callee = CS.getCalledFunction();
+  Function *Caller = TheCall->getParent()->getParent();
+
+      // Don't inline functions which can be redefined at link-time to mean
+      // something else.
+   if (Callee->mayBeOverridden() ||
+       // Don't inline functions marked noinline.
+       Callee->hasFnAttr(Attribute::NoInline) || NeverInline.count(Callee))
+    return llvm::InlineCost::getNever();
+
+  // InlineCost - This value measures how good of an inline candidate this call
+  // site is to inline.  A lower inline cost make is more likely for the call to
+  // be inlined.  This value may go negative.
+  //
+  int InlineCost = 0;
+  
+  // If there is only one call of the function, and it has internal linkage,
+  // make it almost guaranteed to be inlined.
+  //
+  if ((Callee->hasLocalLinkage() || Callee->hasAvailableExternallyLinkage()) && 
+      Callee->hasOneUse())
+    InlineCost -= 15000;
+  
+  // If this function uses the coldcc calling convention, prefer not to inline
+  // it.
+  if (Callee->getCallingConv() == CallingConv::Cold)
+    InlineCost += 2000;
+  
+  // If the instruction after the call, or if the normal destination of the
+  // invoke is an unreachable instruction, the function is noreturn.  As such,
+  // there is little point in inlining this.
+  if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall)) {
+    if (isa<UnreachableInst>(II->getNormalDest()->begin()))
+      InlineCost += 10000;
+  } else if (isa<UnreachableInst>(++BasicBlock::iterator(TheCall)))
+    InlineCost += 10000;
+  
+  // Get information about the callee...
+  FunctionInfo &CalleeFI = CachedFunctionInfo[Callee];
+  
+  // If we haven't calculated this information yet, do so now.
+  if (CalleeFI.NumBlocks == 0)
+    CalleeFI.analyzeFunction(Callee);
+
+  // If we should never inline this, return a huge cost.
+  if (CalleeFI.NeverInline)
+    return InlineCost::getNever();
+
+  // FIXME: It would be nice to kill off CalleeFI.NeverInline. Then we
+  // could move this up and avoid computing the FunctionInfo for
+  // things we are going to just return always inline for. This
+  // requires handling setjmp somewhere else, however.
+  if (!Callee->isDeclaration() && Callee->hasFnAttr(Attribute::AlwaysInline))
+    return InlineCost::getAlways();
+    
+  if (CalleeFI.usesDynamicAlloca) {
+    // Get infomation about the caller...
+    FunctionInfo &CallerFI = CachedFunctionInfo[Caller];
+
+    // If we haven't calculated this information yet, do so now.
+    if (CallerFI.NumBlocks == 0)
+      CallerFI.analyzeFunction(Caller);
+
+    // Don't inline a callee with dynamic alloca into a caller without them.
+    // Functions containing dynamic alloca's are inefficient in various ways;
+    // don't create more inefficiency.
+    if (!CallerFI.usesDynamicAlloca)
+      return InlineCost::getNever();
+  }
+
+  // Add to the inline quality for properties that make the call valuable to
+  // inline.  This includes factors that indicate that the result of inlining
+  // the function will be optimizable.  Currently this just looks at arguments
+  // passed into the function.
+  //
+  unsigned ArgNo = 0;
+  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
+       I != E; ++I, ++ArgNo) {
+    // Each argument passed in has a cost at both the caller and the callee
+    // sides.  This favors functions that take many arguments over functions
+    // that take few arguments.
+    InlineCost -= 20;
+    
+    // If this is a function being passed in, it is very likely that we will be
+    // able to turn an indirect function call into a direct function call.
+    if (isa<Function>(I))
+      InlineCost -= 100;
+    
+    // If an alloca is passed in, inlining this function is likely to allow
+    // significant future optimization possibilities (like scalar promotion, and
+    // scalarization), so encourage the inlining of the function.
+    //
+    else if (isa<AllocaInst>(I)) {
+      if (ArgNo < CalleeFI.ArgumentWeights.size())
+        InlineCost -= CalleeFI.ArgumentWeights[ArgNo].AllocaWeight;
+      
+      // If this is a constant being passed into the function, use the argument
+      // weights calculated for the callee to determine how much will be folded
+      // away with this information.
+    } else if (isa<Constant>(I)) {
+      if (ArgNo < CalleeFI.ArgumentWeights.size())
+        InlineCost -= CalleeFI.ArgumentWeights[ArgNo].ConstantWeight;
+    }
+  }
+  
+  // Now that we have considered all of the factors that make the call site more
+  // likely to be inlined, look at factors that make us not want to inline it.
+  
+  // Don't inline into something too big, which would make it bigger.
+  //
+  InlineCost += Caller->size()/15;
+  
+  // Look at the size of the callee. Each instruction counts as 5.
+  InlineCost += CalleeFI.NumInsts*5;
+
+  return llvm::InlineCost::get(InlineCost);
+}
+
+// getInlineFudgeFactor - Return a > 1.0 factor if the inliner should use a
+// higher threshold to determine if the function call should be inlined.
+float InlineCostAnalyzer::getInlineFudgeFactor(CallSite CS) {
+  Function *Callee = CS.getCalledFunction();
+  
+  // Get information about the callee...
+  FunctionInfo &CalleeFI = CachedFunctionInfo[Callee];
+  
+  // If we haven't calculated this information yet, do so now.
+  if (CalleeFI.NumBlocks == 0)
+    CalleeFI.analyzeFunction(Callee);
+
+  float Factor = 1.0f;
+  // Single BB functions are often written to be inlined.
+  if (CalleeFI.NumBlocks == 1)
+    Factor += 0.5f;
+
+  // Be more aggressive if the function contains a good chunk (if it mades up
+  // at least 10% of the instructions) of vector instructions.
+  if (CalleeFI.NumVectorInsts > CalleeFI.NumInsts/2)
+    Factor += 2.0f;
+  else if (CalleeFI.NumVectorInsts > CalleeFI.NumInsts/10)
+    Factor += 1.5f;
+  return Factor;
+}
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
new file mode 100644
index 0000000..4989c00
--- /dev/null
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -0,0 +1,656 @@
+//===- InlineFunction.cpp - Code to perform function inlining -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements inlining of a function into a call site, resolving
+// parameters and the return value as appropriate.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Attributes.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/CallSite.h"
+using namespace llvm;
+
+bool llvm::InlineFunction(CallInst *CI, CallGraph *CG, const TargetData *TD) {
+  return InlineFunction(CallSite(CI), CG, TD);
+}
+bool llvm::InlineFunction(InvokeInst *II, CallGraph *CG, const TargetData *TD) {
+  return InlineFunction(CallSite(II), CG, TD);
+}
+
+/// HandleInlinedInvoke - If we inlined an invoke site, we need to convert calls
+/// in the body of the inlined function into invokes and turn unwind
+/// instructions into branches to the invoke unwind dest.
+///
+/// II is the invoke instruction being inlined.  FirstNewBlock is the first
+/// block of the inlined code (the last block is the end of the function),
+/// and InlineCodeInfo is information about the code that got inlined.
+static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock,
+                                ClonedCodeInfo &InlinedCodeInfo,
+                                CallGraph *CG) {
+  BasicBlock *InvokeDest = II->getUnwindDest();
+  std::vector<Value*> InvokeDestPHIValues;
+
+  // If there are PHI nodes in the unwind destination block, we need to
+  // keep track of which values came into them from this invoke, then remove
+  // the entry for this block.
+  BasicBlock *InvokeBlock = II->getParent();
+  for (BasicBlock::iterator I = InvokeDest->begin(); isa<PHINode>(I); ++I) {
+    PHINode *PN = cast<PHINode>(I);
+    // Save the value to use for this edge.
+    InvokeDestPHIValues.push_back(PN->getIncomingValueForBlock(InvokeBlock));
+  }
+
+  Function *Caller = FirstNewBlock->getParent();
+
+  // The inlined code is currently at the end of the function, scan from the
+  // start of the inlined code to its end, checking for stuff we need to
+  // rewrite.
+  if (InlinedCodeInfo.ContainsCalls || InlinedCodeInfo.ContainsUnwinds) {
+    for (Function::iterator BB = FirstNewBlock, E = Caller->end();
+         BB != E; ++BB) {
+      if (InlinedCodeInfo.ContainsCalls) {
+        for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ){
+          Instruction *I = BBI++;
+
+          // We only need to check for function calls: inlined invoke
+          // instructions require no special handling.
+          if (!isa<CallInst>(I)) continue;
+          CallInst *CI = cast<CallInst>(I);
+
+          // If this call cannot unwind, don't convert it to an invoke.
+          if (CI->doesNotThrow())
+            continue;
+
+          // Convert this function call into an invoke instruction.
+          // First, split the basic block.
+          BasicBlock *Split = BB->splitBasicBlock(CI, CI->getName()+".noexc");
+
+          // Next, create the new invoke instruction, inserting it at the end
+          // of the old basic block.
+          SmallVector<Value*, 8> InvokeArgs(CI->op_begin()+1, CI->op_end());
+          InvokeInst *II =
+            InvokeInst::Create(CI->getCalledValue(), Split, InvokeDest,
+                               InvokeArgs.begin(), InvokeArgs.end(),
+                               CI->getName(), BB->getTerminator());
+          II->setCallingConv(CI->getCallingConv());
+          II->setAttributes(CI->getAttributes());
+
+          // Make sure that anything using the call now uses the invoke!
+          CI->replaceAllUsesWith(II);
+
+          // Update the callgraph.
+          if (CG) {
+            // We should be able to do this:
+            //   (*CG)[Caller]->replaceCallSite(CI, II);
+            // but that fails if the old call site isn't in the call graph,
+            // which, because of LLVM bug 3601, it sometimes isn't.
+            CallGraphNode *CGN = (*CG)[Caller];
+            for (CallGraphNode::iterator NI = CGN->begin(), NE = CGN->end();
+                 NI != NE; ++NI) {
+              if (NI->first == CI) {
+                NI->first = II;
+                break;
+              }
+            }
+          }
+
+          // Delete the unconditional branch inserted by splitBasicBlock
+          BB->getInstList().pop_back();
+          Split->getInstList().pop_front();  // Delete the original call
+
+          // Update any PHI nodes in the exceptional block to indicate that
+          // there is now a new entry in them.
+          unsigned i = 0;
+          for (BasicBlock::iterator I = InvokeDest->begin();
+               isa<PHINode>(I); ++I, ++i) {
+            PHINode *PN = cast<PHINode>(I);
+            PN->addIncoming(InvokeDestPHIValues[i], BB);
+          }
+
+          // This basic block is now complete, start scanning the next one.
+          break;
+        }
+      }
+
+      if (UnwindInst *UI = dyn_cast<UnwindInst>(BB->getTerminator())) {
+        // An UnwindInst requires special handling when it gets inlined into an
+        // invoke site.  Once this happens, we know that the unwind would cause
+        // a control transfer to the invoke exception destination, so we can
+        // transform it into a direct branch to the exception destination.
+        BranchInst::Create(InvokeDest, UI);
+
+        // Delete the unwind instruction!
+        UI->eraseFromParent();
+
+        // Update any PHI nodes in the exceptional block to indicate that
+        // there is now a new entry in them.
+        unsigned i = 0;
+        for (BasicBlock::iterator I = InvokeDest->begin();
+             isa<PHINode>(I); ++I, ++i) {
+          PHINode *PN = cast<PHINode>(I);
+          PN->addIncoming(InvokeDestPHIValues[i], BB);
+        }
+      }
+    }
+  }
+
+  // Now that everything is happy, we have one final detail.  The PHI nodes in
+  // the exception destination block still have entries due to the original
+  // invoke instruction.  Eliminate these entries (which might even delete the
+  // PHI node) now.
+  InvokeDest->removePredecessor(II->getParent());
+}
+
+/// UpdateCallGraphAfterInlining - Once we have cloned code over from a callee
+/// into the caller, update the specified callgraph to reflect the changes we
+/// made.  Note that it's possible that not all code was copied over, so only
+/// some edges of the callgraph may remain.
+static void UpdateCallGraphAfterInlining(CallSite CS,
+                                         Function::iterator FirstNewBlock,
+                                       DenseMap<const Value*, Value*> &ValueMap,
+                                         CallGraph &CG) {
+  const Function *Caller = CS.getInstruction()->getParent()->getParent();
+  const Function *Callee = CS.getCalledFunction();
+  CallGraphNode *CalleeNode = CG[Callee];
+  CallGraphNode *CallerNode = CG[Caller];
+
+  // Since we inlined some uninlined call sites in the callee into the caller,
+  // add edges from the caller to all of the callees of the callee.
+  CallGraphNode::iterator I = CalleeNode->begin(), E = CalleeNode->end();
+
+  // Consider the case where CalleeNode == CallerNode.
+  CallGraphNode::CalledFunctionsVector CallCache;
+  if (CalleeNode == CallerNode) {
+    CallCache.assign(I, E);
+    I = CallCache.begin();
+    E = CallCache.end();
+  }
+
+  for (; I != E; ++I) {
+    const Instruction *OrigCall = I->first.getInstruction();
+
+    DenseMap<const Value*, Value*>::iterator VMI = ValueMap.find(OrigCall);
+    // Only copy the edge if the call was inlined!
+    if (VMI != ValueMap.end() && VMI->second) {
+      // If the call was inlined, but then constant folded, there is no edge to
+      // add.  Check for this case.
+      if (Instruction *NewCall = dyn_cast<Instruction>(VMI->second))
+        CallerNode->addCalledFunction(CallSite::get(NewCall), I->second);
+    }
+  }
+  // Update the call graph by deleting the edge from Callee to Caller.  We must
+  // do this after the loop above in case Caller and Callee are the same.
+  CallerNode->removeCallEdgeFor(CS);
+}
+
+/// findFnRegionEndMarker - This is a utility routine that is used by
+/// InlineFunction. Return llvm.dbg.region.end intrinsic that corresponds
+/// to the llvm.dbg.func.start of the function F. Otherwise return NULL.
+static const DbgRegionEndInst *findFnRegionEndMarker(const Function *F) {
+
+  GlobalVariable *FnStart = NULL;
+  const DbgRegionEndInst *FnEnd = NULL;
+  for (Function::const_iterator FI = F->begin(), FE =F->end(); FI != FE; ++FI) 
+    for (BasicBlock::const_iterator BI = FI->begin(), BE = FI->end(); BI != BE;
+         ++BI) {
+      if (FnStart == NULL)  {
+        if (const DbgFuncStartInst *FSI = dyn_cast<DbgFuncStartInst>(BI)) {
+          DISubprogram SP(cast<GlobalVariable>(FSI->getSubprogram()));
+          assert (SP.isNull() == false && "Invalid llvm.dbg.func.start");
+          if (SP.describes(F))
+            FnStart = SP.getGV();
+        }
+      } else {
+        if (const DbgRegionEndInst *REI = dyn_cast<DbgRegionEndInst>(BI))
+          if (REI->getContext() == FnStart)
+            FnEnd = REI;
+      }
+    }
+  return FnEnd;
+}
+
+// InlineFunction - This function inlines the called function into the basic
+// block of the caller.  This returns false if it is not possible to inline this
+// call.  The program is still in a well defined state if this occurs though.
+//
+// Note that this only does one level of inlining.  For example, if the
+// instruction 'call B' is inlined, and 'B' calls 'C', then the call to 'C' now
+// exists in the instruction stream.  Similiarly this will inline a recursive
+// function by one level.
+//
+bool llvm::InlineFunction(CallSite CS, CallGraph *CG, const TargetData *TD) {
+  Instruction *TheCall = CS.getInstruction();
+  assert(TheCall->getParent() && TheCall->getParent()->getParent() &&
+         "Instruction not in function!");
+
+  const Function *CalledFunc = CS.getCalledFunction();
+  if (CalledFunc == 0 ||          // Can't inline external function or indirect
+      CalledFunc->isDeclaration() || // call, or call to a vararg function!
+      CalledFunc->getFunctionType()->isVarArg()) return false;
+
+
+  // If the call to the callee is not a tail call, we must clear the 'tail'
+  // flags on any calls that we inline.
+  bool MustClearTailCallFlags =
+    !(isa<CallInst>(TheCall) && cast<CallInst>(TheCall)->isTailCall());
+
+  // If the call to the callee cannot throw, set the 'nounwind' flag on any
+  // calls that we inline.
+  bool MarkNoUnwind = CS.doesNotThrow();
+
+  BasicBlock *OrigBB = TheCall->getParent();
+  Function *Caller = OrigBB->getParent();
+
+  // GC poses two hazards to inlining, which only occur when the callee has GC:
+  //  1. If the caller has no GC, then the callee's GC must be propagated to the
+  //     caller.
+  //  2. If the caller has a differing GC, it is invalid to inline.
+  if (CalledFunc->hasGC()) {
+    if (!Caller->hasGC())
+      Caller->setGC(CalledFunc->getGC());
+    else if (CalledFunc->getGC() != Caller->getGC())
+      return false;
+  }
+
+  // Get an iterator to the last basic block in the function, which will have
+  // the new function inlined after it.
+  //
+  Function::iterator LastBlock = &Caller->back();
+
+  // Make sure to capture all of the return instructions from the cloned
+  // function.
+  std::vector<ReturnInst*> Returns;
+  ClonedCodeInfo InlinedFunctionInfo;
+  Function::iterator FirstNewBlock;
+
+  { // Scope to destroy ValueMap after cloning.
+    DenseMap<const Value*, Value*> ValueMap;
+
+    assert(CalledFunc->arg_size() == CS.arg_size() &&
+           "No varargs calls can be inlined!");
+
+    // Calculate the vector of arguments to pass into the function cloner, which
+    // matches up the formal to the actual argument values.
+    CallSite::arg_iterator AI = CS.arg_begin();
+    unsigned ArgNo = 0;
+    for (Function::const_arg_iterator I = CalledFunc->arg_begin(),
+         E = CalledFunc->arg_end(); I != E; ++I, ++AI, ++ArgNo) {
+      Value *ActualArg = *AI;
+
+      // When byval arguments actually inlined, we need to make the copy implied
+      // by them explicit.  However, we don't do this if the callee is readonly
+      // or readnone, because the copy would be unneeded: the callee doesn't
+      // modify the struct.
+      if (CalledFunc->paramHasAttr(ArgNo+1, Attribute::ByVal) &&
+          !CalledFunc->onlyReadsMemory()) {
+        const Type *AggTy = cast<PointerType>(I->getType())->getElementType();
+        const Type *VoidPtrTy = PointerType::getUnqual(Type::Int8Ty);
+
+        // Create the alloca.  If we have TargetData, use nice alignment.
+        unsigned Align = 1;
+        if (TD) Align = TD->getPrefTypeAlignment(AggTy);
+        Value *NewAlloca = new AllocaInst(AggTy, 0, Align, I->getName(),
+                                          Caller->begin()->begin());
+        // Emit a memcpy.
+        const Type *Tys[] = { Type::Int64Ty };
+        Function *MemCpyFn = Intrinsic::getDeclaration(Caller->getParent(),
+                                                       Intrinsic::memcpy, 
+                                                       Tys, 1);
+        Value *DestCast = new BitCastInst(NewAlloca, VoidPtrTy, "tmp", TheCall);
+        Value *SrcCast = new BitCastInst(*AI, VoidPtrTy, "tmp", TheCall);
+
+        Value *Size;
+        if (TD == 0)
+          Size = ConstantExpr::getSizeOf(AggTy);
+        else
+          Size = ConstantInt::get(Type::Int64Ty, TD->getTypeStoreSize(AggTy));
+
+        // Always generate a memcpy of alignment 1 here because we don't know
+        // the alignment of the src pointer.  Other optimizations can infer
+        // better alignment.
+        Value *CallArgs[] = {
+          DestCast, SrcCast, Size, ConstantInt::get(Type::Int32Ty, 1)
+        };
+        CallInst *TheMemCpy =
+          CallInst::Create(MemCpyFn, CallArgs, CallArgs+4, "", TheCall);
+
+        // If we have a call graph, update it.
+        if (CG) {
+          CallGraphNode *MemCpyCGN = CG->getOrInsertFunction(MemCpyFn);
+          CallGraphNode *CallerNode = (*CG)[Caller];
+          CallerNode->addCalledFunction(TheMemCpy, MemCpyCGN);
+        }
+
+        // Uses of the argument in the function should use our new alloca
+        // instead.
+        ActualArg = NewAlloca;
+      }
+
+      ValueMap[I] = ActualArg;
+    }
+
+    // Adjust llvm.dbg.region.end. If the CalledFunc has region end
+    // marker then clone that marker after next stop point at the 
+    // call site. The function body cloner does not clone original
+    // region end marker from the CalledFunc. This will ensure that
+    // inlined function's scope ends at the right place. 
+    const DbgRegionEndInst *DREI = findFnRegionEndMarker(CalledFunc);
+    if (DREI) {
+      for (BasicBlock::iterator BI = TheCall, 
+             BE = TheCall->getParent()->end(); BI != BE; ++BI) {
+        if (DbgStopPointInst *DSPI = dyn_cast<DbgStopPointInst>(BI)) {
+          if (DbgRegionEndInst *NewDREI = 
+              dyn_cast<DbgRegionEndInst>(DREI->clone()))
+            NewDREI->insertAfter(DSPI);
+          break;
+        }
+      }
+    }
+
+    // We want the inliner to prune the code as it copies.  We would LOVE to
+    // have no dead or constant instructions leftover after inlining occurs
+    // (which can happen, e.g., because an argument was constant), but we'll be
+    // happy with whatever the cloner can do.
+    CloneAndPruneFunctionInto(Caller, CalledFunc, ValueMap, Returns, ".i",
+                              &InlinedFunctionInfo, TD);
+
+    // Remember the first block that is newly cloned over.
+    FirstNewBlock = LastBlock; ++FirstNewBlock;
+
+    // Update the callgraph if requested.
+    if (CG)
+      UpdateCallGraphAfterInlining(CS, FirstNewBlock, ValueMap, *CG);
+  }
+
+  // If there are any alloca instructions in the block that used to be the entry
+  // block for the callee, move them to the entry block of the caller.  First
+  // calculate which instruction they should be inserted before.  We insert the
+  // instructions at the end of the current alloca list.
+  //
+  {
+    BasicBlock::iterator InsertPoint = Caller->begin()->begin();
+    for (BasicBlock::iterator I = FirstNewBlock->begin(),
+           E = FirstNewBlock->end(); I != E; )
+      if (AllocaInst *AI = dyn_cast<AllocaInst>(I++)) {
+        // If the alloca is now dead, remove it.  This often occurs due to code
+        // specialization.
+        if (AI->use_empty()) {
+          AI->eraseFromParent();
+          continue;
+        }
+
+        if (isa<Constant>(AI->getArraySize())) {
+          // Scan for the block of allocas that we can move over, and move them
+          // all at once.
+          while (isa<AllocaInst>(I) &&
+                 isa<Constant>(cast<AllocaInst>(I)->getArraySize()))
+            ++I;
+
+          // Transfer all of the allocas over in a block.  Using splice means
+          // that the instructions aren't removed from the symbol table, then
+          // reinserted.
+          Caller->getEntryBlock().getInstList().splice(
+              InsertPoint,
+              FirstNewBlock->getInstList(),
+              AI, I);
+        }
+      }
+  }
+
+  // If the inlined code contained dynamic alloca instructions, wrap the inlined
+  // code with llvm.stacksave/llvm.stackrestore intrinsics.
+  if (InlinedFunctionInfo.ContainsDynamicAllocas) {
+    Module *M = Caller->getParent();
+    // Get the two intrinsics we care about.
+    Constant *StackSave, *StackRestore;
+    StackSave    = Intrinsic::getDeclaration(M, Intrinsic::stacksave);
+    StackRestore = Intrinsic::getDeclaration(M, Intrinsic::stackrestore);
+
+    // If we are preserving the callgraph, add edges to the stacksave/restore
+    // functions for the calls we insert.
+    CallGraphNode *StackSaveCGN = 0, *StackRestoreCGN = 0, *CallerNode = 0;
+    if (CG) {
+      // We know that StackSave/StackRestore are Function*'s, because they are
+      // intrinsics which must have the right types.
+      StackSaveCGN    = CG->getOrInsertFunction(cast<Function>(StackSave));
+      StackRestoreCGN = CG->getOrInsertFunction(cast<Function>(StackRestore));
+      CallerNode = (*CG)[Caller];
+    }
+
+    // Insert the llvm.stacksave.
+    CallInst *SavedPtr = CallInst::Create(StackSave, "savedstack",
+                                          FirstNewBlock->begin());
+    if (CG) CallerNode->addCalledFunction(SavedPtr, StackSaveCGN);
+
+    // Insert a call to llvm.stackrestore before any return instructions in the
+    // inlined function.
+    for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
+      CallInst *CI = CallInst::Create(StackRestore, SavedPtr, "", Returns[i]);
+      if (CG) CallerNode->addCalledFunction(CI, StackRestoreCGN);
+    }
+
+    // Count the number of StackRestore calls we insert.
+    unsigned NumStackRestores = Returns.size();
+
+    // If we are inlining an invoke instruction, insert restores before each
+    // unwind.  These unwinds will be rewritten into branches later.
+    if (InlinedFunctionInfo.ContainsUnwinds && isa<InvokeInst>(TheCall)) {
+      for (Function::iterator BB = FirstNewBlock, E = Caller->end();
+           BB != E; ++BB)
+        if (UnwindInst *UI = dyn_cast<UnwindInst>(BB->getTerminator())) {
+          CallInst::Create(StackRestore, SavedPtr, "", UI);
+          ++NumStackRestores;
+        }
+    }
+  }
+
+  // If we are inlining tail call instruction through a call site that isn't
+  // marked 'tail', we must remove the tail marker for any calls in the inlined
+  // code.  Also, calls inlined through a 'nounwind' call site should be marked
+  // 'nounwind'.
+  if (InlinedFunctionInfo.ContainsCalls &&
+      (MustClearTailCallFlags || MarkNoUnwind)) {
+    for (Function::iterator BB = FirstNewBlock, E = Caller->end();
+         BB != E; ++BB)
+      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+        if (CallInst *CI = dyn_cast<CallInst>(I)) {
+          if (MustClearTailCallFlags)
+            CI->setTailCall(false);
+          if (MarkNoUnwind)
+            CI->setDoesNotThrow();
+        }
+  }
+
+  // If we are inlining through a 'nounwind' call site then any inlined 'unwind'
+  // instructions are unreachable.
+  if (InlinedFunctionInfo.ContainsUnwinds && MarkNoUnwind)
+    for (Function::iterator BB = FirstNewBlock, E = Caller->end();
+         BB != E; ++BB) {
+      TerminatorInst *Term = BB->getTerminator();
+      if (isa<UnwindInst>(Term)) {
+        new UnreachableInst(Term);
+        BB->getInstList().erase(Term);
+      }
+    }
+
+  // If we are inlining for an invoke instruction, we must make sure to rewrite
+  // any inlined 'unwind' instructions into branches to the invoke exception
+  // destination, and call instructions into invoke instructions.
+  if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall))
+    HandleInlinedInvoke(II, FirstNewBlock, InlinedFunctionInfo, CG);
+
+  // If we cloned in _exactly one_ basic block, and if that block ends in a
+  // return instruction, we splice the body of the inlined callee directly into
+  // the calling basic block.
+  if (Returns.size() == 1 && std::distance(FirstNewBlock, Caller->end()) == 1) {
+    // Move all of the instructions right before the call.
+    OrigBB->getInstList().splice(TheCall, FirstNewBlock->getInstList(),
+                                 FirstNewBlock->begin(), FirstNewBlock->end());
+    // Remove the cloned basic block.
+    Caller->getBasicBlockList().pop_back();
+
+    // If the call site was an invoke instruction, add a branch to the normal
+    // destination.
+    if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall))
+      BranchInst::Create(II->getNormalDest(), TheCall);
+
+    // If the return instruction returned a value, replace uses of the call with
+    // uses of the returned value.
+    if (!TheCall->use_empty()) {
+      ReturnInst *R = Returns[0];
+      if (TheCall == R->getReturnValue())
+        TheCall->replaceAllUsesWith(UndefValue::get(TheCall->getType()));
+      else
+        TheCall->replaceAllUsesWith(R->getReturnValue());
+    }
+    // Since we are now done with the Call/Invoke, we can delete it.
+    TheCall->eraseFromParent();
+
+    // Since we are now done with the return instruction, delete it also.
+    Returns[0]->eraseFromParent();
+
+    // We are now done with the inlining.
+    return true;
+  }
+
+  // Otherwise, we have the normal case, of more than one block to inline or
+  // multiple return sites.
+
+  // We want to clone the entire callee function into the hole between the
+  // "starter" and "ender" blocks.  How we accomplish this depends on whether
+  // this is an invoke instruction or a call instruction.
+  BasicBlock *AfterCallBB;
+  if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall)) {
+
+    // Add an unconditional branch to make this look like the CallInst case...
+    BranchInst *NewBr = BranchInst::Create(II->getNormalDest(), TheCall);
+
+    // Split the basic block.  This guarantees that no PHI nodes will have to be
+    // updated due to new incoming edges, and make the invoke case more
+    // symmetric to the call case.
+    AfterCallBB = OrigBB->splitBasicBlock(NewBr,
+                                          CalledFunc->getName()+".exit");
+
+  } else {  // It's a call
+    // If this is a call instruction, we need to split the basic block that
+    // the call lives in.
+    //
+    AfterCallBB = OrigBB->splitBasicBlock(TheCall,
+                                          CalledFunc->getName()+".exit");
+  }
+
+  // Change the branch that used to go to AfterCallBB to branch to the first
+  // basic block of the inlined function.
+  //
+  TerminatorInst *Br = OrigBB->getTerminator();
+  assert(Br && Br->getOpcode() == Instruction::Br &&
+         "splitBasicBlock broken!");
+  Br->setOperand(0, FirstNewBlock);
+
+
+  // Now that the function is correct, make it a little bit nicer.  In
+  // particular, move the basic blocks inserted from the end of the function
+  // into the space made by splitting the source basic block.
+  Caller->getBasicBlockList().splice(AfterCallBB, Caller->getBasicBlockList(),
+                                     FirstNewBlock, Caller->end());
+
+  // Handle all of the return instructions that we just cloned in, and eliminate
+  // any users of the original call/invoke instruction.
+  const Type *RTy = CalledFunc->getReturnType();
+
+  if (Returns.size() > 1) {
+    // The PHI node should go at the front of the new basic block to merge all
+    // possible incoming values.
+    PHINode *PHI = 0;
+    if (!TheCall->use_empty()) {
+      PHI = PHINode::Create(RTy, TheCall->getName(),
+                            AfterCallBB->begin());
+      // Anything that used the result of the function call should now use the
+      // PHI node as their operand.
+      TheCall->replaceAllUsesWith(PHI);
+    }
+
+    // Loop over all of the return instructions adding entries to the PHI node
+    // as appropriate.
+    if (PHI) {
+      for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
+        ReturnInst *RI = Returns[i];
+        assert(RI->getReturnValue()->getType() == PHI->getType() &&
+               "Ret value not consistent in function!");
+        PHI->addIncoming(RI->getReturnValue(), RI->getParent());
+      }
+    }
+
+    // Add a branch to the merge points and remove return instructions.
+    for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
+      ReturnInst *RI = Returns[i];
+      BranchInst::Create(AfterCallBB, RI);
+      RI->eraseFromParent();
+    }
+  } else if (!Returns.empty()) {
+    // Otherwise, if there is exactly one return value, just replace anything
+    // using the return value of the call with the computed value.
+    if (!TheCall->use_empty()) {
+      if (TheCall == Returns[0]->getReturnValue())
+        TheCall->replaceAllUsesWith(UndefValue::get(TheCall->getType()));
+      else
+        TheCall->replaceAllUsesWith(Returns[0]->getReturnValue());
+    }
+
+    // Splice the code from the return block into the block that it will return
+    // to, which contains the code that was after the call.
+    BasicBlock *ReturnBB = Returns[0]->getParent();
+    AfterCallBB->getInstList().splice(AfterCallBB->begin(),
+                                      ReturnBB->getInstList());
+
+    // Update PHI nodes that use the ReturnBB to use the AfterCallBB.
+    ReturnBB->replaceAllUsesWith(AfterCallBB);
+
+    // Delete the return instruction now and empty ReturnBB now.
+    Returns[0]->eraseFromParent();
+    ReturnBB->eraseFromParent();
+  } else if (!TheCall->use_empty()) {
+    // No returns, but something is using the return value of the call.  Just
+    // nuke the result.
+    TheCall->replaceAllUsesWith(UndefValue::get(TheCall->getType()));
+  }
+
+  // Since we are now done with the Call/Invoke, we can delete it.
+  TheCall->eraseFromParent();
+
+  // We should always be able to fold the entry block of the function into the
+  // single predecessor of the block...
+  assert(cast<BranchInst>(Br)->isUnconditional() && "splitBasicBlock broken!");
+  BasicBlock *CalleeEntry = cast<BranchInst>(Br)->getSuccessor(0);
+
+  // Splice the code entry block into calling block, right before the
+  // unconditional branch.
+  OrigBB->getInstList().splice(Br, CalleeEntry->getInstList());
+  CalleeEntry->replaceAllUsesWith(OrigBB);  // Update PHI nodes
+
+  // Remove the unconditional branch.
+  OrigBB->getInstList().erase(Br);
+
+  // Now we can remove the CalleeEntry block, which is now empty.
+  Caller->getBasicBlockList().erase(CalleeEntry);
+
+  return true;
+}
diff --git a/lib/Transforms/Utils/InstructionNamer.cpp b/lib/Transforms/Utils/InstructionNamer.cpp
new file mode 100644
index 0000000..4f8a160
--- /dev/null
+++ b/lib/Transforms/Utils/InstructionNamer.cpp
@@ -0,0 +1,63 @@
+//===- InstructionNamer.cpp - Give anonymous instructions names -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a little utility pass that gives instructions names, this is mostly
+// useful when diffing the effect of an optimization because deleting an
+// unnamed instruction can change all other instruction numbering, making the
+// diff very noisy.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Function.h"
+#include "llvm/Pass.h"
+#include "llvm/Type.h"
+using namespace llvm;
+
+namespace {
+  struct InstNamer : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    InstNamer() : FunctionPass(&ID) {}
+    
+    void getAnalysisUsage(AnalysisUsage &Info) const {
+      Info.setPreservesAll();
+    }
+
+    bool runOnFunction(Function &F) {
+      for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end();
+           AI != AE; ++AI)
+        if (!AI->hasName() && AI->getType() != Type::VoidTy)
+          AI->setName("tmp");
+
+      for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+        if (!BB->hasName())
+          BB->setName("BB");
+        
+        for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+          if (!I->hasName() && I->getType() != Type::VoidTy)
+            I->setName("tmp");
+      }
+      return true;
+    }
+  };
+  
+  char InstNamer::ID = 0;
+  static RegisterPass<InstNamer> X("instnamer",
+                                   "Assign names to anonymous instructions");
+}
+
+
+const PassInfo *const llvm::InstructionNamerID = &X;
+//===----------------------------------------------------------------------===//
+//
+// InstructionNamer - Give any unnamed non-void instructions "tmp" names.
+//
+FunctionPass *llvm::createInstructionNamerPass() {
+  return new InstNamer();
+}
diff --git a/lib/Transforms/Utils/LCSSA.cpp b/lib/Transforms/Utils/LCSSA.cpp
new file mode 100644
index 0000000..7d4f3a3
--- /dev/null
+++ b/lib/Transforms/Utils/LCSSA.cpp
@@ -0,0 +1,276 @@
+//===-- LCSSA.cpp - Convert loops into loop-closed SSA form ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass transforms loops by placing phi nodes at the end of the loops for
+// all values that are live across the loop boundary.  For example, it turns
+// the left into the right code:
+// 
+// for (...)                for (...)
+//   if (c)                   if (c)
+//     X1 = ...                 X1 = ...
+//   else                     else
+//     X2 = ...                 X2 = ...
+//   X3 = phi(X1, X2)         X3 = phi(X1, X2)
+// ... = X3 + 4             X4 = phi(X3)
+//                          ... = X4 + 4
+//
+// This is still valid LLVM; the extra phi nodes are purely redundant, and will
+// be trivially eliminated by InstCombine.  The major benefit of this 
+// transformation is that it makes many other loop optimizations, such as 
+// LoopUnswitching, simpler.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "lcssa"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/Pass.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/PredIteratorCache.h"
+#include <algorithm>
+#include <map>
+using namespace llvm;
+
+STATISTIC(NumLCSSA, "Number of live out of a loop variables");
+
+namespace {
+  struct VISIBILITY_HIDDEN LCSSA : public LoopPass {
+    static char ID; // Pass identification, replacement for typeid
+    LCSSA() : LoopPass(&ID) {}
+
+    // Cached analysis information for the current function.
+    LoopInfo *LI;
+    DominatorTree *DT;
+    std::vector<BasicBlock*> LoopBlocks;
+    PredIteratorCache PredCache;
+    
+    virtual bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+    void ProcessInstruction(Instruction* Instr,
+                            const SmallVector<BasicBlock*, 8>& exitBlocks);
+    
+    /// This transformation requires natural loop information & requires that
+    /// loop preheaders be inserted into the CFG.  It maintains both of these,
+    /// as well as the CFG.  It also requires dominator information.
+    ///
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequiredID(LoopSimplifyID);
+      AU.addPreservedID(LoopSimplifyID);
+      AU.addRequired<LoopInfo>();
+      AU.addPreserved<LoopInfo>();
+      AU.addRequired<DominatorTree>();
+      AU.addPreserved<ScalarEvolution>();
+      AU.addPreserved<DominatorTree>();
+
+      // Request DominanceFrontier now, even though LCSSA does
+      // not use it. This allows Pass Manager to schedule Dominance
+      // Frontier early enough such that one LPPassManager can handle
+      // multiple loop transformation passes.
+      AU.addRequired<DominanceFrontier>(); 
+      AU.addPreserved<DominanceFrontier>();
+    }
+  private:
+    void getLoopValuesUsedOutsideLoop(Loop *L,
+                                      SetVector<Instruction*> &AffectedValues,
+                                 const SmallVector<BasicBlock*, 8>& exitBlocks);
+
+    Value *GetValueForBlock(DomTreeNode *BB, Instruction *OrigInst,
+                            DenseMap<DomTreeNode*, Value*> &Phis);
+
+    /// inLoop - returns true if the given block is within the current loop
+    bool inLoop(BasicBlock* B) {
+      return std::binary_search(LoopBlocks.begin(), LoopBlocks.end(), B);
+    }
+  };
+}
+  
+char LCSSA::ID = 0;
+static RegisterPass<LCSSA> X("lcssa", "Loop-Closed SSA Form Pass");
+
+Pass *llvm::createLCSSAPass() { return new LCSSA(); }
+const PassInfo *const llvm::LCSSAID = &X;
+
+/// runOnFunction - Process all loops in the function, inner-most out.
+bool LCSSA::runOnLoop(Loop *L, LPPassManager &LPM) {
+  PredCache.clear();
+  
+  LI = &LPM.getAnalysis<LoopInfo>();
+  DT = &getAnalysis<DominatorTree>();
+
+  // Speed up queries by creating a sorted list of blocks
+  LoopBlocks.clear();
+  LoopBlocks.insert(LoopBlocks.end(), L->block_begin(), L->block_end());
+  std::sort(LoopBlocks.begin(), LoopBlocks.end());
+  
+  SmallVector<BasicBlock*, 8> exitBlocks;
+  L->getExitBlocks(exitBlocks);
+  
+  SetVector<Instruction*> AffectedValues;
+  getLoopValuesUsedOutsideLoop(L, AffectedValues, exitBlocks);
+  
+  // If no values are affected, we can save a lot of work, since we know that
+  // nothing will be changed.
+  if (AffectedValues.empty())
+    return false;
+  
+  // Iterate over all affected values for this loop and insert Phi nodes
+  // for them in the appropriate exit blocks
+  
+  for (SetVector<Instruction*>::iterator I = AffectedValues.begin(),
+       E = AffectedValues.end(); I != E; ++I)
+    ProcessInstruction(*I, exitBlocks);
+  
+  assert(L->isLCSSAForm());
+  
+  return true;
+}
+
+/// processInstruction - Given a live-out instruction, insert LCSSA Phi nodes,
+/// eliminate all out-of-loop uses.
+void LCSSA::ProcessInstruction(Instruction *Instr,
+                               const SmallVector<BasicBlock*, 8>& exitBlocks) {
+  ++NumLCSSA; // We are applying the transformation
+
+  // Keep track of the blocks that have the value available already.
+  DenseMap<DomTreeNode*, Value*> Phis;
+
+  DomTreeNode *InstrNode = DT->getNode(Instr->getParent());
+
+  // Insert the LCSSA phi's into the exit blocks (dominated by the value), and
+  // add them to the Phi's map.
+  for (SmallVector<BasicBlock*, 8>::const_iterator BBI = exitBlocks.begin(),
+      BBE = exitBlocks.end(); BBI != BBE; ++BBI) {
+    BasicBlock *BB = *BBI;
+    DomTreeNode *ExitBBNode = DT->getNode(BB);
+    Value *&Phi = Phis[ExitBBNode];
+    if (!Phi && DT->dominates(InstrNode, ExitBBNode)) {
+      PHINode *PN = PHINode::Create(Instr->getType(), Instr->getName()+".lcssa",
+                                    BB->begin());
+      PN->reserveOperandSpace(PredCache.GetNumPreds(BB));
+
+      // Remember that this phi makes the value alive in this block.
+      Phi = PN;
+
+      // Add inputs from inside the loop for this PHI.
+      for (BasicBlock** PI = PredCache.GetPreds(BB); *PI; ++PI)
+        PN->addIncoming(Instr, *PI);
+    }
+  }
+  
+  
+  // Record all uses of Instr outside the loop.  We need to rewrite these.  The
+  // LCSSA phis won't be included because they use the value in the loop.
+  for (Value::use_iterator UI = Instr->use_begin(), E = Instr->use_end();
+       UI != E;) {
+    BasicBlock *UserBB = cast<Instruction>(*UI)->getParent();
+    if (PHINode *P = dyn_cast<PHINode>(*UI)) {
+      UserBB = P->getIncomingBlock(UI);
+    }
+    
+    // If the user is in the loop, don't rewrite it!
+    if (UserBB == Instr->getParent() || inLoop(UserBB)) {
+      ++UI;
+      continue;
+    }
+    
+    // Otherwise, patch up uses of the value with the appropriate LCSSA Phi,
+    // inserting PHI nodes into join points where needed.
+    Value *Val = GetValueForBlock(DT->getNode(UserBB), Instr, Phis);
+    
+    // Preincrement the iterator to avoid invalidating it when we change the
+    // value.
+    Use &U = UI.getUse();
+    ++UI;
+    U.set(Val);
+  }
+}
+
+/// getLoopValuesUsedOutsideLoop - Return any values defined in the loop that
+/// are used by instructions outside of it.
+void LCSSA::getLoopValuesUsedOutsideLoop(Loop *L,
+                                      SetVector<Instruction*> &AffectedValues,
+                                const SmallVector<BasicBlock*, 8>& exitBlocks) {
+  // FIXME: For large loops, we may be able to avoid a lot of use-scanning
+  // by using dominance information.  In particular, if a block does not
+  // dominate any of the loop exits, then none of the values defined in the
+  // block could be used outside the loop.
+  for (Loop::block_iterator BB = L->block_begin(), BE = L->block_end();
+       BB != BE; ++BB) {
+    for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E; ++I)
+      for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE;
+           ++UI) {
+        BasicBlock *UserBB = cast<Instruction>(*UI)->getParent();
+        if (PHINode* p = dyn_cast<PHINode>(*UI)) {
+          UserBB = p->getIncomingBlock(UI);
+        }
+        
+        if (*BB != UserBB && !inLoop(UserBB)) {
+          AffectedValues.insert(I);
+          break;
+        }
+      }
+  }
+}
+
+/// GetValueForBlock - Get the value to use within the specified basic block.
+/// available values are in Phis.
+Value *LCSSA::GetValueForBlock(DomTreeNode *BB, Instruction *OrigInst,
+                               DenseMap<DomTreeNode*, Value*> &Phis) {
+  // If there is no dominator info for this BB, it is unreachable.
+  if (BB == 0)
+    return UndefValue::get(OrigInst->getType());
+                                 
+  // If we have already computed this value, return the previously computed val.
+  if (Phis.count(BB)) return Phis[BB];
+
+  DomTreeNode *IDom = BB->getIDom();
+
+  // Otherwise, there are two cases: we either have to insert a PHI node or we
+  // don't.  We need to insert a PHI node if this block is not dominated by one
+  // of the exit nodes from the loop (the loop could have multiple exits, and
+  // though the value defined *inside* the loop dominated all its uses, each
+  // exit by itself may not dominate all the uses).
+  //
+  // The simplest way to check for this condition is by checking to see if the
+  // idom is in the loop.  If so, we *know* that none of the exit blocks
+  // dominate this block.  Note that we *know* that the block defining the
+  // original instruction is in the idom chain, because if it weren't, then the
+  // original value didn't dominate this use.
+  if (!inLoop(IDom->getBlock())) {
+    // Idom is not in the loop, we must still be "below" the exit block and must
+    // be fully dominated by the value live in the idom.
+    Value* val = GetValueForBlock(IDom, OrigInst, Phis);
+    Phis.insert(std::make_pair(BB, val));
+    return val;
+  }
+  
+  BasicBlock *BBN = BB->getBlock();
+  
+  // Otherwise, the idom is the loop, so we need to insert a PHI node.  Do so
+  // now, then get values to fill in the incoming values for the PHI.
+  PHINode *PN = PHINode::Create(OrigInst->getType(),
+                                OrigInst->getName() + ".lcssa", BBN->begin());
+  PN->reserveOperandSpace(PredCache.GetNumPreds(BBN));
+  Phis.insert(std::make_pair(BB, PN));
+                                 
+  // Fill in the incoming values for the block.
+  for (BasicBlock** PI = PredCache.GetPreds(BBN); *PI; ++PI)
+    PN->addIncoming(GetValueForBlock(DT->getNode(*PI), OrigInst, Phis), *PI);
+  return PN;
+}
+
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
new file mode 100644
index 0000000..94483b8
--- /dev/null
+++ b/lib/Transforms/Utils/Local.cpp
@@ -0,0 +1,338 @@
+//===-- Local.cpp - Functions to perform local transformations ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This family of functions perform various local transformations to the
+// program.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Constants.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/MathExtras.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//  Local constant propagation.
+//
+
+// ConstantFoldTerminator - If a terminator instruction is predicated on a
+// constant value, convert it into an unconditional branch to the constant
+// destination.
+//
+bool llvm::ConstantFoldTerminator(BasicBlock *BB) {
+  TerminatorInst *T = BB->getTerminator();
+
+  // Branch - See if we are conditional jumping on constant
+  if (BranchInst *BI = dyn_cast<BranchInst>(T)) {
+    if (BI->isUnconditional()) return false;  // Can't optimize uncond branch
+    BasicBlock *Dest1 = BI->getSuccessor(0);
+    BasicBlock *Dest2 = BI->getSuccessor(1);
+
+    if (ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition())) {
+      // Are we branching on constant?
+      // YES.  Change to unconditional branch...
+      BasicBlock *Destination = Cond->getZExtValue() ? Dest1 : Dest2;
+      BasicBlock *OldDest     = Cond->getZExtValue() ? Dest2 : Dest1;
+
+      //cerr << "Function: " << T->getParent()->getParent()
+      //     << "\nRemoving branch from " << T->getParent()
+      //     << "\n\nTo: " << OldDest << endl;
+
+      // Let the basic block know that we are letting go of it.  Based on this,
+      // it will adjust it's PHI nodes.
+      assert(BI->getParent() && "Terminator not inserted in block!");
+      OldDest->removePredecessor(BI->getParent());
+
+      // Set the unconditional destination, and change the insn to be an
+      // unconditional branch.
+      BI->setUnconditionalDest(Destination);
+      return true;
+    } else if (Dest2 == Dest1) {       // Conditional branch to same location?
+      // This branch matches something like this:
+      //     br bool %cond, label %Dest, label %Dest
+      // and changes it into:  br label %Dest
+
+      // Let the basic block know that we are letting go of one copy of it.
+      assert(BI->getParent() && "Terminator not inserted in block!");
+      Dest1->removePredecessor(BI->getParent());
+
+      // Change a conditional branch to unconditional.
+      BI->setUnconditionalDest(Dest1);
+      return true;
+    }
+  } else if (SwitchInst *SI = dyn_cast<SwitchInst>(T)) {
+    // If we are switching on a constant, we can convert the switch into a
+    // single branch instruction!
+    ConstantInt *CI = dyn_cast<ConstantInt>(SI->getCondition());
+    BasicBlock *TheOnlyDest = SI->getSuccessor(0);  // The default dest
+    BasicBlock *DefaultDest = TheOnlyDest;
+    assert(TheOnlyDest == SI->getDefaultDest() &&
+           "Default destination is not successor #0?");
+
+    // Figure out which case it goes to...
+    for (unsigned i = 1, e = SI->getNumSuccessors(); i != e; ++i) {
+      // Found case matching a constant operand?
+      if (SI->getSuccessorValue(i) == CI) {
+        TheOnlyDest = SI->getSuccessor(i);
+        break;
+      }
+
+      // Check to see if this branch is going to the same place as the default
+      // dest.  If so, eliminate it as an explicit compare.
+      if (SI->getSuccessor(i) == DefaultDest) {
+        // Remove this entry...
+        DefaultDest->removePredecessor(SI->getParent());
+        SI->removeCase(i);
+        --i; --e;  // Don't skip an entry...
+        continue;
+      }
+
+      // Otherwise, check to see if the switch only branches to one destination.
+      // We do this by reseting "TheOnlyDest" to null when we find two non-equal
+      // destinations.
+      if (SI->getSuccessor(i) != TheOnlyDest) TheOnlyDest = 0;
+    }
+
+    if (CI && !TheOnlyDest) {
+      // Branching on a constant, but not any of the cases, go to the default
+      // successor.
+      TheOnlyDest = SI->getDefaultDest();
+    }
+
+    // If we found a single destination that we can fold the switch into, do so
+    // now.
+    if (TheOnlyDest) {
+      // Insert the new branch..
+      BranchInst::Create(TheOnlyDest, SI);
+      BasicBlock *BB = SI->getParent();
+
+      // Remove entries from PHI nodes which we no longer branch to...
+      for (unsigned i = 0, e = SI->getNumSuccessors(); i != e; ++i) {
+        // Found case matching a constant operand?
+        BasicBlock *Succ = SI->getSuccessor(i);
+        if (Succ == TheOnlyDest)
+          TheOnlyDest = 0;  // Don't modify the first branch to TheOnlyDest
+        else
+          Succ->removePredecessor(BB);
+      }
+
+      // Delete the old switch...
+      BB->getInstList().erase(SI);
+      return true;
+    } else if (SI->getNumSuccessors() == 2) {
+      // Otherwise, we can fold this switch into a conditional branch
+      // instruction if it has only one non-default destination.
+      Value *Cond = new ICmpInst(ICmpInst::ICMP_EQ, SI->getCondition(),
+                                 SI->getSuccessorValue(1), "cond", SI);
+      // Insert the new branch...
+      BranchInst::Create(SI->getSuccessor(1), SI->getSuccessor(0), Cond, SI);
+
+      // Delete the old switch...
+      SI->eraseFromParent();
+      return true;
+    }
+  }
+  return false;
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Local dead code elimination...
+//
+
+/// isInstructionTriviallyDead - Return true if the result produced by the
+/// instruction is not used, and the instruction has no side effects.
+///
+bool llvm::isInstructionTriviallyDead(Instruction *I) {
+  if (!I->use_empty() || isa<TerminatorInst>(I)) return false;
+
+  // We don't want debug info removed by anything this general.
+  if (isa<DbgInfoIntrinsic>(I)) return false;
+
+  if (!I->mayHaveSideEffects()) return true;
+
+  // Special case intrinsics that "may have side effects" but can be deleted
+  // when dead.
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+    // Safe to delete llvm.stacksave if dead.
+    if (II->getIntrinsicID() == Intrinsic::stacksave)
+      return true;
+  return false;
+}
+
+/// RecursivelyDeleteTriviallyDeadInstructions - If the specified value is a
+/// trivially dead instruction, delete it.  If that makes any of its operands
+/// trivially dead, delete them too, recursively.
+void llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V) {
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I || !I->use_empty() || !isInstructionTriviallyDead(I))
+    return;
+  
+  SmallVector<Instruction*, 16> DeadInsts;
+  DeadInsts.push_back(I);
+  
+  while (!DeadInsts.empty()) {
+    I = DeadInsts.pop_back_val();
+
+    // Null out all of the instruction's operands to see if any operand becomes
+    // dead as we go.
+    for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+      Value *OpV = I->getOperand(i);
+      I->setOperand(i, 0);
+      
+      if (!OpV->use_empty()) continue;
+    
+      // If the operand is an instruction that became dead as we nulled out the
+      // operand, and if it is 'trivially' dead, delete it in a future loop
+      // iteration.
+      if (Instruction *OpI = dyn_cast<Instruction>(OpV))
+        if (isInstructionTriviallyDead(OpI))
+          DeadInsts.push_back(OpI);
+    }
+    
+    I->eraseFromParent();
+  }
+}
+
+/// RecursivelyDeleteDeadPHINode - If the specified value is an effectively
+/// dead PHI node, due to being a def-use chain of single-use nodes that
+/// either forms a cycle or is terminated by a trivially dead instruction,
+/// delete it.  If that makes any of its operands trivially dead, delete them
+/// too, recursively.
+void
+llvm::RecursivelyDeleteDeadPHINode(PHINode *PN) {
+
+  // We can remove a PHI if it is on a cycle in the def-use graph
+  // where each node in the cycle has degree one, i.e. only one use,
+  // and is an instruction with no side effects.
+  if (!PN->hasOneUse())
+    return;
+
+  SmallPtrSet<PHINode *, 4> PHIs;
+  PHIs.insert(PN);
+  for (Instruction *J = cast<Instruction>(*PN->use_begin());
+       J->hasOneUse() && !J->mayHaveSideEffects();
+       J = cast<Instruction>(*J->use_begin()))
+    // If we find a PHI more than once, we're on a cycle that
+    // won't prove fruitful.
+    if (PHINode *JP = dyn_cast<PHINode>(J))
+      if (!PHIs.insert(cast<PHINode>(JP))) {
+        // Break the cycle and delete the PHI and its operands.
+        JP->replaceAllUsesWith(UndefValue::get(JP->getType()));
+        RecursivelyDeleteTriviallyDeadInstructions(JP);
+        break;
+      }
+}
+
+//===----------------------------------------------------------------------===//
+//  Control Flow Graph Restructuring...
+//
+
+/// MergeBasicBlockIntoOnlyPred - DestBB is a block with one predecessor and its
+/// predecessor is known to have one successor (DestBB!).  Eliminate the edge
+/// between them, moving the instructions in the predecessor into DestBB and
+/// deleting the predecessor block.
+///
+void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB) {
+  // If BB has single-entry PHI nodes, fold them.
+  while (PHINode *PN = dyn_cast<PHINode>(DestBB->begin())) {
+    Value *NewVal = PN->getIncomingValue(0);
+    // Replace self referencing PHI with undef, it must be dead.
+    if (NewVal == PN) NewVal = UndefValue::get(PN->getType());
+    PN->replaceAllUsesWith(NewVal);
+    PN->eraseFromParent();
+  }
+  
+  BasicBlock *PredBB = DestBB->getSinglePredecessor();
+  assert(PredBB && "Block doesn't have a single predecessor!");
+  
+  // Splice all the instructions from PredBB to DestBB.
+  PredBB->getTerminator()->eraseFromParent();
+  DestBB->getInstList().splice(DestBB->begin(), PredBB->getInstList());
+  
+  // Anything that branched to PredBB now branches to DestBB.
+  PredBB->replaceAllUsesWith(DestBB);
+  
+  // Nuke BB.
+  PredBB->eraseFromParent();
+}
+
+/// OnlyUsedByDbgIntrinsics - Return true if the instruction I is only used
+/// by DbgIntrinsics. If DbgInUses is specified then the vector is filled 
+/// with the DbgInfoIntrinsic that use the instruction I.
+bool llvm::OnlyUsedByDbgInfoIntrinsics(Instruction *I, 
+                               SmallVectorImpl<DbgInfoIntrinsic *> *DbgInUses) {
+  if (DbgInUses)
+    DbgInUses->clear();
+
+  for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE; 
+       ++UI) {
+    if (DbgInfoIntrinsic *DI = dyn_cast<DbgInfoIntrinsic>(*UI)) {
+      if (DbgInUses)
+        DbgInUses->push_back(DI);
+    } else {
+      if (DbgInUses)
+        DbgInUses->clear();
+      return false;
+    }
+  }
+  return true;
+}
+
+/// UserIsDebugInfo - Return true if U is a constant expr used by 
+/// llvm.dbg.variable or llvm.dbg.global_variable
+bool llvm::UserIsDebugInfo(User *U) {
+  ConstantExpr *CE = dyn_cast<ConstantExpr>(U);
+
+  if (!CE || CE->getNumUses() != 1)
+    return false;
+
+  Constant *Init = dyn_cast<Constant>(CE->use_back());
+  if (!Init || Init->getNumUses() != 1)
+    return false;
+
+  GlobalVariable *GV = dyn_cast<GlobalVariable>(Init->use_back());
+  if (!GV || !GV->hasInitializer() || GV->getInitializer() != Init)
+    return false;
+
+  DIVariable DV(GV);
+  if (!DV.isNull()) 
+    return true; // User is llvm.dbg.variable
+
+  DIGlobalVariable DGV(GV);
+  if (!DGV.isNull())
+    return true; // User is llvm.dbg.global_variable
+
+  return false;
+}
+
+/// RemoveDbgInfoUser - Remove an User which is representing debug info.
+void llvm::RemoveDbgInfoUser(User *U) {
+  assert (UserIsDebugInfo(U) && "Unexpected User!");
+  ConstantExpr *CE = cast<ConstantExpr>(U);
+  while (!CE->use_empty()) {
+    Constant *C = cast<Constant>(CE->use_back());
+    while (!C->use_empty()) {
+      GlobalVariable *GV = cast<GlobalVariable>(C->use_back());
+      GV->eraseFromParent();
+    }
+    C->destroyConstant();
+  }
+  CE->destroyConstant();
+}
diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp
new file mode 100644
index 0000000..03d273d
--- /dev/null
+++ b/lib/Transforms/Utils/LoopSimplify.cpp
@@ -0,0 +1,600 @@
+//===- LoopSimplify.cpp - Loop Canonicalization Pass ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs several transformations to transform natural loops into a
+// simpler form, which makes subsequent analyses and transformations simpler and
+// more effective.
+//
+// Loop pre-header insertion guarantees that there is a single, non-critical
+// entry edge from outside of the loop to the loop header.  This simplifies a
+// number of analyses and transformations, such as LICM.
+//
+// Loop exit-block insertion guarantees that all exit blocks from the loop
+// (blocks which are outside of the loop that have predecessors inside of the
+// loop) only have predecessors from inside of the loop (and are thus dominated
+// by the loop header).  This simplifies transformations such as store-sinking
+// that are built into LICM.
+//
+// This pass also guarantees that loops will have exactly one backedge.
+//
+// Note that the simplifycfg pass will clean up blocks which are split out but
+// end up being unnecessary, so usage of this pass should not pessimize
+// generated code.
+//
+// This pass obviously modifies the CFG, but updates loop information and
+// dominator information.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loopsimplify"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/Function.h"
+#include "llvm/Type.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+using namespace llvm;
+
+STATISTIC(NumInserted, "Number of pre-header or exit blocks inserted");
+STATISTIC(NumNested  , "Number of nested loops split out");
+
+namespace {
+  struct VISIBILITY_HIDDEN LoopSimplify : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    LoopSimplify() : FunctionPass(&ID) {}
+
+    // AA - If we have an alias analysis object to update, this is it, otherwise
+    // this is null.
+    AliasAnalysis *AA;
+    LoopInfo *LI;
+    DominatorTree *DT;
+    virtual bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      // We need loop information to identify the loops...
+      AU.addRequired<LoopInfo>();
+      AU.addRequired<DominatorTree>();
+
+      AU.addPreserved<LoopInfo>();
+      AU.addPreserved<DominatorTree>();
+      AU.addPreserved<DominanceFrontier>();
+      AU.addPreserved<AliasAnalysis>();
+      AU.addPreservedID(BreakCriticalEdgesID);  // No critical edges added.
+    }
+
+    /// verifyAnalysis() - Verify loop nest.
+    void verifyAnalysis() const {
+#ifndef NDEBUG
+      LoopInfo *NLI = &getAnalysis<LoopInfo>();
+      for (LoopInfo::iterator I = NLI->begin(), E = NLI->end(); I != E; ++I) 
+        (*I)->verifyLoop();
+#endif  
+    }
+
+  private:
+    bool ProcessLoop(Loop *L);
+    BasicBlock *RewriteLoopExitBlock(Loop *L, BasicBlock *Exit);
+    void InsertPreheaderForLoop(Loop *L);
+    Loop *SeparateNestedLoop(Loop *L);
+    void InsertUniqueBackedgeBlock(Loop *L);
+    void PlaceSplitBlockCarefully(BasicBlock *NewBB,
+                                  SmallVectorImpl<BasicBlock*> &SplitPreds,
+                                  Loop *L);
+  };
+}
+
+char LoopSimplify::ID = 0;
+static RegisterPass<LoopSimplify>
+X("loopsimplify", "Canonicalize natural loops", true);
+
+// Publically exposed interface to pass...
+const PassInfo *const llvm::LoopSimplifyID = &X;
+FunctionPass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); }
+
+/// runOnFunction - Run down all loops in the CFG (recursively, but we could do
+/// it in any convenient order) inserting preheaders...
+///
+bool LoopSimplify::runOnFunction(Function &F) {
+  bool Changed = false;
+  LI = &getAnalysis<LoopInfo>();
+  AA = getAnalysisIfAvailable<AliasAnalysis>();
+  DT = &getAnalysis<DominatorTree>();
+
+  // Check to see that no blocks (other than the header) in loops have
+  // predecessors that are not in loops.  This is not valid for natural loops,
+  // but can occur if the blocks are unreachable.  Since they are unreachable we
+  // can just shamelessly destroy their terminators to make them not branch into
+  // the loop!
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+    // This case can only occur for unreachable blocks.  Blocks that are
+    // unreachable can't be in loops, so filter those blocks out.
+    if (LI->getLoopFor(BB)) continue;
+    
+    bool BlockUnreachable = false;
+    TerminatorInst *TI = BB->getTerminator();
+
+    // Check to see if any successors of this block are non-loop-header loops
+    // that are not the header.
+    for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) {
+      // If this successor is not in a loop, BB is clearly ok.
+      Loop *L = LI->getLoopFor(TI->getSuccessor(i));
+      if (!L) continue;
+      
+      // If the succ is the loop header, and if L is a top-level loop, then this
+      // is an entrance into a loop through the header, which is also ok.
+      if (L->getHeader() == TI->getSuccessor(i) && L->getParentLoop() == 0)
+        continue;
+      
+      // Otherwise, this is an entrance into a loop from some place invalid.
+      // Either the loop structure is invalid and this is not a natural loop (in
+      // which case the compiler is buggy somewhere else) or BB is unreachable.
+      BlockUnreachable = true;
+      break;
+    }
+    
+    // If this block is ok, check the next one.
+    if (!BlockUnreachable) continue;
+    
+    // Otherwise, this block is dead.  To clean up the CFG and to allow later
+    // loop transformations to ignore this case, we delete the edges into the
+    // loop by replacing the terminator.
+    
+    // Remove PHI entries from the successors.
+    for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+      TI->getSuccessor(i)->removePredecessor(BB);
+   
+    // Add a new unreachable instruction before the old terminator.
+    new UnreachableInst(TI);
+    
+    // Delete the dead terminator.
+    if (AA) AA->deleteValue(TI);
+    if (!TI->use_empty())
+      TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
+    TI->eraseFromParent();
+    Changed |= true;
+  }
+  
+  for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
+    Changed |= ProcessLoop(*I);
+
+  return Changed;
+}
+
+/// ProcessLoop - Walk the loop structure in depth first order, ensuring that
+/// all loops have preheaders.
+///
+bool LoopSimplify::ProcessLoop(Loop *L) {
+  bool Changed = false;
+ReprocessLoop:
+  
+  // Canonicalize inner loops before outer loops.  Inner loop canonicalization
+  // can provide work for the outer loop to canonicalize.
+  for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+    Changed |= ProcessLoop(*I);
+  
+  assert(L->getBlocks()[0] == L->getHeader() &&
+         "Header isn't first block in loop?");
+
+  // Does the loop already have a preheader?  If so, don't insert one.
+  if (L->getLoopPreheader() == 0) {
+    InsertPreheaderForLoop(L);
+    NumInserted++;
+    Changed = true;
+  }
+
+  // Next, check to make sure that all exit nodes of the loop only have
+  // predecessors that are inside of the loop.  This check guarantees that the
+  // loop preheader/header will dominate the exit blocks.  If the exit block has
+  // predecessors from outside of the loop, split the edge now.
+  SmallVector<BasicBlock*, 8> ExitBlocks;
+  L->getExitBlocks(ExitBlocks);
+    
+  SetVector<BasicBlock*> ExitBlockSet(ExitBlocks.begin(), ExitBlocks.end());
+  for (SetVector<BasicBlock*>::iterator I = ExitBlockSet.begin(),
+         E = ExitBlockSet.end(); I != E; ++I) {
+    BasicBlock *ExitBlock = *I;
+    for (pred_iterator PI = pred_begin(ExitBlock), PE = pred_end(ExitBlock);
+         PI != PE; ++PI)
+      // Must be exactly this loop: no subloops, parent loops, or non-loop preds
+      // allowed.
+      if (!L->contains(*PI)) {
+        RewriteLoopExitBlock(L, ExitBlock);
+        NumInserted++;
+        Changed = true;
+        break;
+      }
+  }
+
+  // If the header has more than two predecessors at this point (from the
+  // preheader and from multiple backedges), we must adjust the loop.
+  unsigned NumBackedges = L->getNumBackEdges();
+  if (NumBackedges != 1) {
+    // If this is really a nested loop, rip it out into a child loop.  Don't do
+    // this for loops with a giant number of backedges, just factor them into a
+    // common backedge instead.
+    if (NumBackedges < 8) {
+      if (Loop *NL = SeparateNestedLoop(L)) {
+        ++NumNested;
+        // This is a big restructuring change, reprocess the whole loop.
+        ProcessLoop(NL);
+        Changed = true;
+        // GCC doesn't tail recursion eliminate this.
+        goto ReprocessLoop;
+      }
+    }
+
+    // If we either couldn't, or didn't want to, identify nesting of the loops,
+    // insert a new block that all backedges target, then make it jump to the
+    // loop header.
+    InsertUniqueBackedgeBlock(L);
+    NumInserted++;
+    Changed = true;
+  }
+
+  // Scan over the PHI nodes in the loop header.  Since they now have only two
+  // incoming values (the loop is canonicalized), we may have simplified the PHI
+  // down to 'X = phi [X, Y]', which should be replaced with 'Y'.
+  PHINode *PN;
+  for (BasicBlock::iterator I = L->getHeader()->begin();
+       (PN = dyn_cast<PHINode>(I++)); )
+    if (Value *V = PN->hasConstantValue()) {
+      if (AA) AA->deleteValue(PN);
+      PN->replaceAllUsesWith(V);
+      PN->eraseFromParent();
+    }
+
+  return Changed;
+}
+
+/// InsertPreheaderForLoop - Once we discover that a loop doesn't have a
+/// preheader, this method is called to insert one.  This method has two phases:
+/// preheader insertion and analysis updating.
+///
+void LoopSimplify::InsertPreheaderForLoop(Loop *L) {
+  BasicBlock *Header = L->getHeader();
+
+  // Compute the set of predecessors of the loop that are not in the loop.
+  SmallVector<BasicBlock*, 8> OutsideBlocks;
+  for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header);
+       PI != PE; ++PI)
+    if (!L->contains(*PI))           // Coming in from outside the loop?
+      OutsideBlocks.push_back(*PI);  // Keep track of it...
+
+  // Split out the loop pre-header.
+  BasicBlock *NewBB =
+    SplitBlockPredecessors(Header, &OutsideBlocks[0], OutsideBlocks.size(),
+                           ".preheader", this);
+  
+
+  //===--------------------------------------------------------------------===//
+  //  Update analysis results now that we have performed the transformation
+  //
+
+  // We know that we have loop information to update... update it now.
+  if (Loop *Parent = L->getParentLoop())
+    Parent->addBasicBlockToLoop(NewBB, LI->getBase());
+
+  // Make sure that NewBB is put someplace intelligent, which doesn't mess up
+  // code layout too horribly.
+  PlaceSplitBlockCarefully(NewBB, OutsideBlocks, L);
+}
+
+/// RewriteLoopExitBlock - Ensure that the loop preheader dominates all exit
+/// blocks.  This method is used to split exit blocks that have predecessors
+/// outside of the loop.
+BasicBlock *LoopSimplify::RewriteLoopExitBlock(Loop *L, BasicBlock *Exit) {
+  SmallVector<BasicBlock*, 8> LoopBlocks;
+  for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit); I != E; ++I)
+    if (L->contains(*I))
+      LoopBlocks.push_back(*I);
+
+  assert(!LoopBlocks.empty() && "No edges coming in from outside the loop?");
+  BasicBlock *NewBB = SplitBlockPredecessors(Exit, &LoopBlocks[0], 
+                                             LoopBlocks.size(), ".loopexit",
+                                             this);
+
+  // Update Loop Information - we know that the new block will be in whichever
+  // loop the Exit block is in.  Note that it may not be in that immediate loop,
+  // if the successor is some other loop header.  In that case, we continue 
+  // walking up the loop tree to find a loop that contains both the successor
+  // block and the predecessor block.
+  Loop *SuccLoop = LI->getLoopFor(Exit);
+  while (SuccLoop && !SuccLoop->contains(L->getHeader()))
+    SuccLoop = SuccLoop->getParentLoop();
+  if (SuccLoop)
+    SuccLoop->addBasicBlockToLoop(NewBB, LI->getBase());
+
+  return NewBB;
+}
+
+/// AddBlockAndPredsToSet - Add the specified block, and all of its
+/// predecessors, to the specified set, if it's not already in there.  Stop
+/// predecessor traversal when we reach StopBlock.
+static void AddBlockAndPredsToSet(BasicBlock *InputBB, BasicBlock *StopBlock,
+                                  std::set<BasicBlock*> &Blocks) {
+  std::vector<BasicBlock *> WorkList;
+  WorkList.push_back(InputBB);
+  do {
+    BasicBlock *BB = WorkList.back(); WorkList.pop_back();
+    if (Blocks.insert(BB).second && BB != StopBlock)
+      // If BB is not already processed and it is not a stop block then
+      // insert its predecessor in the work list
+      for (pred_iterator I = pred_begin(BB), E = pred_end(BB); I != E; ++I) {
+        BasicBlock *WBB = *I;
+        WorkList.push_back(WBB);
+      }
+  } while(!WorkList.empty());
+}
+
+/// FindPHIToPartitionLoops - The first part of loop-nestification is to find a
+/// PHI node that tells us how to partition the loops.
+static PHINode *FindPHIToPartitionLoops(Loop *L, DominatorTree *DT,
+                                        AliasAnalysis *AA) {
+  for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ) {
+    PHINode *PN = cast<PHINode>(I);
+    ++I;
+    if (Value *V = PN->hasConstantValue())
+      if (!isa<Instruction>(V) || DT->dominates(cast<Instruction>(V), PN)) {
+        // This is a degenerate PHI already, don't modify it!
+        PN->replaceAllUsesWith(V);
+        if (AA) AA->deleteValue(PN);
+        PN->eraseFromParent();
+        continue;
+      }
+
+    // Scan this PHI node looking for a use of the PHI node by itself.
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+      if (PN->getIncomingValue(i) == PN &&
+          L->contains(PN->getIncomingBlock(i)))
+        // We found something tasty to remove.
+        return PN;
+  }
+  return 0;
+}
+
+// PlaceSplitBlockCarefully - If the block isn't already, move the new block to
+// right after some 'outside block' block.  This prevents the preheader from
+// being placed inside the loop body, e.g. when the loop hasn't been rotated.
+void LoopSimplify::PlaceSplitBlockCarefully(BasicBlock *NewBB,
+                                       SmallVectorImpl<BasicBlock*> &SplitPreds,
+                                            Loop *L) {
+  // Check to see if NewBB is already well placed.
+  Function::iterator BBI = NewBB; --BBI;
+  for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) {
+    if (&*BBI == SplitPreds[i])
+      return;
+  }
+  
+  // If it isn't already after an outside block, move it after one.  This is
+  // always good as it makes the uncond branch from the outside block into a
+  // fall-through.
+  
+  // Figure out *which* outside block to put this after.  Prefer an outside
+  // block that neighbors a BB actually in the loop.
+  BasicBlock *FoundBB = 0;
+  for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) {
+    Function::iterator BBI = SplitPreds[i];
+    if (++BBI != NewBB->getParent()->end() && 
+        L->contains(BBI)) {
+      FoundBB = SplitPreds[i];
+      break;
+    }
+  }
+  
+  // If our heuristic for a *good* bb to place this after doesn't find
+  // anything, just pick something.  It's likely better than leaving it within
+  // the loop.
+  if (!FoundBB)
+    FoundBB = SplitPreds[0];
+  NewBB->moveAfter(FoundBB);
+}
+
+
+/// SeparateNestedLoop - If this loop has multiple backedges, try to pull one of
+/// them out into a nested loop.  This is important for code that looks like
+/// this:
+///
+///  Loop:
+///     ...
+///     br cond, Loop, Next
+///     ...
+///     br cond2, Loop, Out
+///
+/// To identify this common case, we look at the PHI nodes in the header of the
+/// loop.  PHI nodes with unchanging values on one backedge correspond to values
+/// that change in the "outer" loop, but not in the "inner" loop.
+///
+/// If we are able to separate out a loop, return the new outer loop that was
+/// created.
+///
+Loop *LoopSimplify::SeparateNestedLoop(Loop *L) {
+  PHINode *PN = FindPHIToPartitionLoops(L, DT, AA);
+  if (PN == 0) return 0;  // No known way to partition.
+
+  // Pull out all predecessors that have varying values in the loop.  This
+  // handles the case when a PHI node has multiple instances of itself as
+  // arguments.
+  SmallVector<BasicBlock*, 8> OuterLoopPreds;
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+    if (PN->getIncomingValue(i) != PN ||
+        !L->contains(PN->getIncomingBlock(i)))
+      OuterLoopPreds.push_back(PN->getIncomingBlock(i));
+
+  BasicBlock *Header = L->getHeader();
+  BasicBlock *NewBB = SplitBlockPredecessors(Header, &OuterLoopPreds[0],
+                                             OuterLoopPreds.size(),
+                                             ".outer", this);
+
+  // Make sure that NewBB is put someplace intelligent, which doesn't mess up
+  // code layout too horribly.
+  PlaceSplitBlockCarefully(NewBB, OuterLoopPreds, L);
+  
+  // Create the new outer loop.
+  Loop *NewOuter = new Loop();
+
+  // Change the parent loop to use the outer loop as its child now.
+  if (Loop *Parent = L->getParentLoop())
+    Parent->replaceChildLoopWith(L, NewOuter);
+  else
+    LI->changeTopLevelLoop(L, NewOuter);
+
+  // This block is going to be our new header block: add it to this loop and all
+  // parent loops.
+  NewOuter->addBasicBlockToLoop(NewBB, LI->getBase());
+
+  // L is now a subloop of our outer loop.
+  NewOuter->addChildLoop(L);
+
+  for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+       I != E; ++I)
+    NewOuter->addBlockEntry(*I);
+
+  // Determine which blocks should stay in L and which should be moved out to
+  // the Outer loop now.
+  std::set<BasicBlock*> BlocksInL;
+  for (pred_iterator PI = pred_begin(Header), E = pred_end(Header); PI!=E; ++PI)
+    if (DT->dominates(Header, *PI))
+      AddBlockAndPredsToSet(*PI, Header, BlocksInL);
+
+
+  // Scan all of the loop children of L, moving them to OuterLoop if they are
+  // not part of the inner loop.
+  const std::vector<Loop*> &SubLoops = L->getSubLoops();
+  for (size_t I = 0; I != SubLoops.size(); )
+    if (BlocksInL.count(SubLoops[I]->getHeader()))
+      ++I;   // Loop remains in L
+    else
+      NewOuter->addChildLoop(L->removeChildLoop(SubLoops.begin() + I));
+
+  // Now that we know which blocks are in L and which need to be moved to
+  // OuterLoop, move any blocks that need it.
+  for (unsigned i = 0; i != L->getBlocks().size(); ++i) {
+    BasicBlock *BB = L->getBlocks()[i];
+    if (!BlocksInL.count(BB)) {
+      // Move this block to the parent, updating the exit blocks sets
+      L->removeBlockFromLoop(BB);
+      if ((*LI)[BB] == L)
+        LI->changeLoopFor(BB, NewOuter);
+      --i;
+    }
+  }
+
+  return NewOuter;
+}
+
+
+
+/// InsertUniqueBackedgeBlock - This method is called when the specified loop
+/// has more than one backedge in it.  If this occurs, revector all of these
+/// backedges to target a new basic block and have that block branch to the loop
+/// header.  This ensures that loops have exactly one backedge.
+///
+void LoopSimplify::InsertUniqueBackedgeBlock(Loop *L) {
+  assert(L->getNumBackEdges() > 1 && "Must have > 1 backedge!");
+
+  // Get information about the loop
+  BasicBlock *Preheader = L->getLoopPreheader();
+  BasicBlock *Header = L->getHeader();
+  Function *F = Header->getParent();
+
+  // Figure out which basic blocks contain back-edges to the loop header.
+  std::vector<BasicBlock*> BackedgeBlocks;
+  for (pred_iterator I = pred_begin(Header), E = pred_end(Header); I != E; ++I)
+    if (*I != Preheader) BackedgeBlocks.push_back(*I);
+
+  // Create and insert the new backedge block...
+  BasicBlock *BEBlock = BasicBlock::Create(Header->getName()+".backedge", F);
+  BranchInst *BETerminator = BranchInst::Create(Header, BEBlock);
+
+  // Move the new backedge block to right after the last backedge block.
+  Function::iterator InsertPos = BackedgeBlocks.back(); ++InsertPos;
+  F->getBasicBlockList().splice(InsertPos, F->getBasicBlockList(), BEBlock);
+
+  // Now that the block has been inserted into the function, create PHI nodes in
+  // the backedge block which correspond to any PHI nodes in the header block.
+  for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
+    PHINode *PN = cast<PHINode>(I);
+    PHINode *NewPN = PHINode::Create(PN->getType(), PN->getName()+".be",
+                                     BETerminator);
+    NewPN->reserveOperandSpace(BackedgeBlocks.size());
+    if (AA) AA->copyValue(PN, NewPN);
+
+    // Loop over the PHI node, moving all entries except the one for the
+    // preheader over to the new PHI node.
+    unsigned PreheaderIdx = ~0U;
+    bool HasUniqueIncomingValue = true;
+    Value *UniqueValue = 0;
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+      BasicBlock *IBB = PN->getIncomingBlock(i);
+      Value *IV = PN->getIncomingValue(i);
+      if (IBB == Preheader) {
+        PreheaderIdx = i;
+      } else {
+        NewPN->addIncoming(IV, IBB);
+        if (HasUniqueIncomingValue) {
+          if (UniqueValue == 0)
+            UniqueValue = IV;
+          else if (UniqueValue != IV)
+            HasUniqueIncomingValue = false;
+        }
+      }
+    }
+
+    // Delete all of the incoming values from the old PN except the preheader's
+    assert(PreheaderIdx != ~0U && "PHI has no preheader entry??");
+    if (PreheaderIdx != 0) {
+      PN->setIncomingValue(0, PN->getIncomingValue(PreheaderIdx));
+      PN->setIncomingBlock(0, PN->getIncomingBlock(PreheaderIdx));
+    }
+    // Nuke all entries except the zero'th.
+    for (unsigned i = 0, e = PN->getNumIncomingValues()-1; i != e; ++i)
+      PN->removeIncomingValue(e-i, false);
+
+    // Finally, add the newly constructed PHI node as the entry for the BEBlock.
+    PN->addIncoming(NewPN, BEBlock);
+
+    // As an optimization, if all incoming values in the new PhiNode (which is a
+    // subset of the incoming values of the old PHI node) have the same value,
+    // eliminate the PHI Node.
+    if (HasUniqueIncomingValue) {
+      NewPN->replaceAllUsesWith(UniqueValue);
+      if (AA) AA->deleteValue(NewPN);
+      BEBlock->getInstList().erase(NewPN);
+    }
+  }
+
+  // Now that all of the PHI nodes have been inserted and adjusted, modify the
+  // backedge blocks to just to the BEBlock instead of the header.
+  for (unsigned i = 0, e = BackedgeBlocks.size(); i != e; ++i) {
+    TerminatorInst *TI = BackedgeBlocks[i]->getTerminator();
+    for (unsigned Op = 0, e = TI->getNumSuccessors(); Op != e; ++Op)
+      if (TI->getSuccessor(Op) == Header)
+        TI->setSuccessor(Op, BEBlock);
+  }
+
+  //===--- Update all analyses which we must preserve now -----------------===//
+
+  // Update Loop Information - we know that this block is now in the current
+  // loop and all parent loops.
+  L->addBasicBlockToLoop(BEBlock, LI->getBase());
+
+  // Update dominator information
+  DT->splitBlock(BEBlock);
+  if (DominanceFrontier *DF = getAnalysisIfAvailable<DominanceFrontier>())
+    DF->splitBlock(BEBlock);
+}
diff --git a/lib/Transforms/Utils/LowerAllocations.cpp b/lib/Transforms/Utils/LowerAllocations.cpp
new file mode 100644
index 0000000..3249895
--- /dev/null
+++ b/lib/Transforms/Utils/LowerAllocations.cpp
@@ -0,0 +1,177 @@
+//===- LowerAllocations.cpp - Reduce malloc & free insts to calls ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The LowerAllocations transformation is a target-dependent tranformation
+// because it depends on the size of data types and alignment constraints.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "lowerallocs"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
+#include "llvm/Module.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Constants.h"
+#include "llvm/Pass.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Support/Compiler.h"
+using namespace llvm;
+
+STATISTIC(NumLowered, "Number of allocations lowered");
+
+namespace {
+  /// LowerAllocations - Turn malloc and free instructions into %malloc and
+  /// %free calls.
+  ///
+  class VISIBILITY_HIDDEN LowerAllocations : public BasicBlockPass {
+    Constant *MallocFunc;   // Functions in the module we are processing
+    Constant *FreeFunc;     // Initialized by doInitialization
+    bool LowerMallocArgToInteger;
+  public:
+    static char ID; // Pass ID, replacement for typeid
+    explicit LowerAllocations(bool LowerToInt = false)
+      : BasicBlockPass(&ID), MallocFunc(0), FreeFunc(0), 
+        LowerMallocArgToInteger(LowerToInt) {}
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<TargetData>();
+      AU.setPreservesCFG();
+
+      // This is a cluster of orthogonal Transforms:
+      AU.addPreserved<UnifyFunctionExitNodes>();
+      AU.addPreservedID(PromoteMemoryToRegisterID);
+      AU.addPreservedID(LowerSwitchID);
+      AU.addPreservedID(LowerInvokePassID);
+    }
+
+    /// doPassInitialization - For the lower allocations pass, this ensures that
+    /// a module contains a declaration for a malloc and a free function.
+    ///
+    bool doInitialization(Module &M);
+
+    virtual bool doInitialization(Function &F) {
+      return doInitialization(*F.getParent());
+    }
+
+    /// runOnBasicBlock - This method does the actual work of converting
+    /// instructions over, assuming that the pass has already been initialized.
+    ///
+    bool runOnBasicBlock(BasicBlock &BB);
+  };
+}
+
+char LowerAllocations::ID = 0;
+static RegisterPass<LowerAllocations>
+X("lowerallocs", "Lower allocations from instructions to calls");
+
+// Publically exposed interface to pass...
+const PassInfo *const llvm::LowerAllocationsID = &X;
+// createLowerAllocationsPass - Interface to this file...
+Pass *llvm::createLowerAllocationsPass(bool LowerMallocArgToInteger) {
+  return new LowerAllocations(LowerMallocArgToInteger);
+}
+
+
+// doInitialization - For the lower allocations pass, this ensures that a
+// module contains a declaration for a malloc and a free function.
+//
+// This function is always successful.
+//
+bool LowerAllocations::doInitialization(Module &M) {
+  const Type *BPTy = PointerType::getUnqual(Type::Int8Ty);
+  // Prototype malloc as "char* malloc(...)", because we don't know in
+  // doInitialization whether size_t is int or long.
+  FunctionType *FT = FunctionType::get(BPTy, std::vector<const Type*>(), true);
+  MallocFunc = M.getOrInsertFunction("malloc", FT);
+  FreeFunc = M.getOrInsertFunction("free"  , Type::VoidTy, BPTy, (Type *)0);
+  return true;
+}
+
+// runOnBasicBlock - This method does the actual work of converting
+// instructions over, assuming that the pass has already been initialized.
+//
+bool LowerAllocations::runOnBasicBlock(BasicBlock &BB) {
+  bool Changed = false;
+  assert(MallocFunc && FreeFunc && "Pass not initialized!");
+
+  BasicBlock::InstListType &BBIL = BB.getInstList();
+
+  const TargetData &TD = getAnalysis<TargetData>();
+  const Type *IntPtrTy = TD.getIntPtrType();
+
+  // Loop over all of the instructions, looking for malloc or free instructions
+  for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I) {
+    if (MallocInst *MI = dyn_cast<MallocInst>(I)) {
+      const Type *AllocTy = MI->getType()->getElementType();
+
+      // malloc(type) becomes sbyte *malloc(size)
+      Value *MallocArg;
+      if (LowerMallocArgToInteger)
+        MallocArg = ConstantInt::get(Type::Int64Ty,
+                                     TD.getTypeAllocSize(AllocTy));
+      else
+        MallocArg = ConstantExpr::getSizeOf(AllocTy);
+      MallocArg = ConstantExpr::getTruncOrBitCast(cast<Constant>(MallocArg), 
+                                                  IntPtrTy);
+
+      if (MI->isArrayAllocation()) {
+        if (isa<ConstantInt>(MallocArg) &&
+            cast<ConstantInt>(MallocArg)->isOne()) {
+          MallocArg = MI->getOperand(0);         // Operand * 1 = Operand
+        } else if (Constant *CO = dyn_cast<Constant>(MI->getOperand(0))) {
+          CO = ConstantExpr::getIntegerCast(CO, IntPtrTy, false /*ZExt*/);
+          MallocArg = ConstantExpr::getMul(CO, cast<Constant>(MallocArg));
+        } else {
+          Value *Scale = MI->getOperand(0);
+          if (Scale->getType() != IntPtrTy)
+            Scale = CastInst::CreateIntegerCast(Scale, IntPtrTy, false /*ZExt*/,
+                                                "", I);
+
+          // Multiply it by the array size if necessary...
+          MallocArg = BinaryOperator::Create(Instruction::Mul, Scale,
+                                             MallocArg, "", I);
+        }
+      }
+
+      // Create the call to Malloc.
+      CallInst *MCall = CallInst::Create(MallocFunc, MallocArg, "", I);
+      MCall->setTailCall();
+
+      // Create a cast instruction to convert to the right type...
+      Value *MCast;
+      if (MCall->getType() != Type::VoidTy)
+        MCast = new BitCastInst(MCall, MI->getType(), "", I);
+      else
+        MCast = Constant::getNullValue(MI->getType());
+
+      // Replace all uses of the old malloc inst with the cast inst
+      MI->replaceAllUsesWith(MCast);
+      I = --BBIL.erase(I);         // remove and delete the malloc instr...
+      Changed = true;
+      ++NumLowered;
+    } else if (FreeInst *FI = dyn_cast<FreeInst>(I)) {
+      Value *PtrCast = 
+        new BitCastInst(FI->getOperand(0),
+                        PointerType::getUnqual(Type::Int8Ty), "", I);
+
+      // Insert a call to the free function...
+      CallInst::Create(FreeFunc, PtrCast, "", I)->setTailCall();
+
+      // Delete the old free instruction
+      I = --BBIL.erase(I);
+      Changed = true;
+      ++NumLowered;
+    }
+  }
+
+  return Changed;
+}
+
diff --git a/lib/Transforms/Utils/LowerInvoke.cpp b/lib/Transforms/Utils/LowerInvoke.cpp
new file mode 100644
index 0000000..1f6b1a2
--- /dev/null
+++ b/lib/Transforms/Utils/LowerInvoke.cpp
@@ -0,0 +1,614 @@
+//===- LowerInvoke.cpp - Eliminate Invoke & Unwind instructions -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation is designed for use by code generators which do not yet
+// support stack unwinding.  This pass supports two models of exception handling
+// lowering, the 'cheap' support and the 'expensive' support.
+//
+// 'Cheap' exception handling support gives the program the ability to execute
+// any program which does not "throw an exception", by turning 'invoke'
+// instructions into calls and by turning 'unwind' instructions into calls to
+// abort().  If the program does dynamically use the unwind instruction, the
+// program will print a message then abort.
+//
+// 'Expensive' exception handling support gives the full exception handling
+// support to the program at the cost of making the 'invoke' instruction
+// really expensive.  It basically inserts setjmp/longjmp calls to emulate the
+// exception handling as necessary.
+//
+// Because the 'expensive' support slows down programs a lot, and EH is only
+// used for a subset of the programs, it must be specifically enabled by an
+// option.
+//
+// Note that after this pass runs the CFG is not entirely accurate (exceptional
+// control flow edges are not correct anymore) so only very simple things should
+// be done after the lowerinvoke pass has run (like generation of native code).
+// This should not be used as a general purpose "my LLVM-to-LLVM pass doesn't
+// support the invoke instruction yet" lowering pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "lowerinvoke"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetLowering.h"
+#include <csetjmp>
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumInvokes, "Number of invokes replaced");
+STATISTIC(NumUnwinds, "Number of unwinds replaced");
+STATISTIC(NumSpilled, "Number of registers live across unwind edges");
+
+static cl::opt<bool> ExpensiveEHSupport("enable-correct-eh-support",
+ cl::desc("Make the -lowerinvoke pass insert expensive, but correct, EH code"));
+
+namespace {
+  class VISIBILITY_HIDDEN LowerInvoke : public FunctionPass {
+    // Used for both models.
+    Constant *WriteFn;
+    Constant *AbortFn;
+    Value *AbortMessage;
+    unsigned AbortMessageLength;
+
+    // Used for expensive EH support.
+    const Type *JBLinkTy;
+    GlobalVariable *JBListHead;
+    Constant *SetJmpFn, *LongJmpFn;
+
+    // We peek in TLI to grab the target's jmp_buf size and alignment
+    const TargetLowering *TLI;
+
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    explicit LowerInvoke(const TargetLowering *tli = NULL)
+      : FunctionPass(&ID), TLI(tli) { }
+    bool doInitialization(Module &M);
+    bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      // This is a cluster of orthogonal Transforms
+      AU.addPreservedID(PromoteMemoryToRegisterID);
+      AU.addPreservedID(LowerSwitchID);
+      AU.addPreservedID(LowerAllocationsID);
+    }
+
+  private:
+    void createAbortMessage(Module *M);
+    void writeAbortMessage(Instruction *IB);
+    bool insertCheapEHSupport(Function &F);
+    void splitLiveRangesLiveAcrossInvokes(std::vector<InvokeInst*> &Invokes);
+    void rewriteExpensiveInvoke(InvokeInst *II, unsigned InvokeNo,
+                                AllocaInst *InvokeNum, SwitchInst *CatchSwitch);
+    bool insertExpensiveEHSupport(Function &F);
+  };
+}
+
+char LowerInvoke::ID = 0;
+static RegisterPass<LowerInvoke>
+X("lowerinvoke", "Lower invoke and unwind, for unwindless code generators");
+
+const PassInfo *const llvm::LowerInvokePassID = &X;
+
+// Public Interface To the LowerInvoke pass.
+FunctionPass *llvm::createLowerInvokePass(const TargetLowering *TLI) {
+  return new LowerInvoke(TLI);
+}
+
+// doInitialization - Make sure that there is a prototype for abort in the
+// current module.
+bool LowerInvoke::doInitialization(Module &M) {
+  const Type *VoidPtrTy = PointerType::getUnqual(Type::Int8Ty);
+  AbortMessage = 0;
+  if (ExpensiveEHSupport) {
+    // Insert a type for the linked list of jump buffers.
+    unsigned JBSize = TLI ? TLI->getJumpBufSize() : 0;
+    JBSize = JBSize ? JBSize : 200;
+    const Type *JmpBufTy = ArrayType::get(VoidPtrTy, JBSize);
+
+    { // The type is recursive, so use a type holder.
+      std::vector<const Type*> Elements;
+      Elements.push_back(JmpBufTy);
+      OpaqueType *OT = OpaqueType::get();
+      Elements.push_back(PointerType::getUnqual(OT));
+      PATypeHolder JBLType(StructType::get(Elements));
+      OT->refineAbstractTypeTo(JBLType.get());  // Complete the cycle.
+      JBLinkTy = JBLType.get();
+      M.addTypeName("llvm.sjljeh.jmpbufty", JBLinkTy);
+    }
+
+    const Type *PtrJBList = PointerType::getUnqual(JBLinkTy);
+
+    // Now that we've done that, insert the jmpbuf list head global, unless it
+    // already exists.
+    if (!(JBListHead = M.getGlobalVariable("llvm.sjljeh.jblist", PtrJBList))) {
+      JBListHead = new GlobalVariable(PtrJBList, false,
+                                      GlobalValue::LinkOnceAnyLinkage,
+                                      Constant::getNullValue(PtrJBList),
+                                      "llvm.sjljeh.jblist", &M);
+    }
+
+// VisualStudio defines setjmp as _setjmp via #include <csetjmp> / <setjmp.h>,
+// so it looks like Intrinsic::_setjmp
+#if defined(_MSC_VER) && defined(setjmp)
+#define setjmp_undefined_for_visual_studio
+#undef setjmp
+#endif
+
+    SetJmpFn = Intrinsic::getDeclaration(&M, Intrinsic::setjmp);
+
+#if defined(_MSC_VER) && defined(setjmp_undefined_for_visual_studio)
+// let's return it to _setjmp state in case anyone ever needs it after this
+// point under VisualStudio
+#define setjmp _setjmp
+#endif
+
+    LongJmpFn = Intrinsic::getDeclaration(&M, Intrinsic::longjmp);
+  }
+
+  // We need the 'write' and 'abort' functions for both models.
+  AbortFn = M.getOrInsertFunction("abort", Type::VoidTy, (Type *)0);
+#if 0 // "write" is Unix-specific.. code is going away soon anyway.
+  WriteFn = M.getOrInsertFunction("write", Type::VoidTy, Type::Int32Ty,
+                                  VoidPtrTy, Type::Int32Ty, (Type *)0);
+#else
+  WriteFn = 0;
+#endif
+  return true;
+}
+
+void LowerInvoke::createAbortMessage(Module *M) {
+  if (ExpensiveEHSupport) {
+    // The abort message for expensive EH support tells the user that the
+    // program 'unwound' without an 'invoke' instruction.
+    Constant *Msg =
+      ConstantArray::get("ERROR: Exception thrown, but not caught!\n");
+    AbortMessageLength = Msg->getNumOperands()-1;  // don't include \0
+
+    GlobalVariable *MsgGV = new GlobalVariable(Msg->getType(), true,
+                                               GlobalValue::InternalLinkage,
+                                               Msg, "abortmsg", M);
+    std::vector<Constant*> GEPIdx(2, Constant::getNullValue(Type::Int32Ty));
+    AbortMessage = ConstantExpr::getGetElementPtr(MsgGV, &GEPIdx[0], 2);
+  } else {
+    // The abort message for cheap EH support tells the user that EH is not
+    // enabled.
+    Constant *Msg =
+      ConstantArray::get("Exception handler needed, but not enabled.  Recompile"
+                         " program with -enable-correct-eh-support.\n");
+    AbortMessageLength = Msg->getNumOperands()-1;  // don't include \0
+
+    GlobalVariable *MsgGV = new GlobalVariable(Msg->getType(), true,
+                                               GlobalValue::InternalLinkage,
+                                               Msg, "abortmsg", M);
+    std::vector<Constant*> GEPIdx(2, Constant::getNullValue(Type::Int32Ty));
+    AbortMessage = ConstantExpr::getGetElementPtr(MsgGV, &GEPIdx[0], 2);
+  }
+}
+
+
+void LowerInvoke::writeAbortMessage(Instruction *IB) {
+#if 0
+  if (AbortMessage == 0)
+    createAbortMessage(IB->getParent()->getParent()->getParent());
+
+  // These are the arguments we WANT...
+  Value* Args[3];
+  Args[0] = ConstantInt::get(Type::Int32Ty, 2);
+  Args[1] = AbortMessage;
+  Args[2] = ConstantInt::get(Type::Int32Ty, AbortMessageLength);
+  (new CallInst(WriteFn, Args, 3, "", IB))->setTailCall();
+#endif
+}
+
+bool LowerInvoke::insertCheapEHSupport(Function &F) {
+  bool Changed = false;
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+    if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) {
+      std::vector<Value*> CallArgs(II->op_begin()+3, II->op_end());
+      // Insert a normal call instruction...
+      CallInst *NewCall = CallInst::Create(II->getCalledValue(),
+                                           CallArgs.begin(), CallArgs.end(), "",II);
+      NewCall->takeName(II);
+      NewCall->setCallingConv(II->getCallingConv());
+      NewCall->setAttributes(II->getAttributes());
+      II->replaceAllUsesWith(NewCall);
+
+      // Insert an unconditional branch to the normal destination.
+      BranchInst::Create(II->getNormalDest(), II);
+
+      // Remove any PHI node entries from the exception destination.
+      II->getUnwindDest()->removePredecessor(BB);
+
+      // Remove the invoke instruction now.
+      BB->getInstList().erase(II);
+
+      ++NumInvokes; Changed = true;
+    } else if (UnwindInst *UI = dyn_cast<UnwindInst>(BB->getTerminator())) {
+      // Insert a new call to write(2, AbortMessage, AbortMessageLength);
+      writeAbortMessage(UI);
+
+      // Insert a call to abort()
+      CallInst::Create(AbortFn, "", UI)->setTailCall();
+
+      // Insert a return instruction.  This really should be a "barrier", as it
+      // is unreachable.
+      ReturnInst::Create(F.getReturnType() == Type::VoidTy ? 0 :
+                         Constant::getNullValue(F.getReturnType()), UI);
+
+      // Remove the unwind instruction now.
+      BB->getInstList().erase(UI);
+
+      ++NumUnwinds; Changed = true;
+    }
+  return Changed;
+}
+
+/// rewriteExpensiveInvoke - Insert code and hack the function to replace the
+/// specified invoke instruction with a call.
+void LowerInvoke::rewriteExpensiveInvoke(InvokeInst *II, unsigned InvokeNo,
+                                         AllocaInst *InvokeNum,
+                                         SwitchInst *CatchSwitch) {
+  ConstantInt *InvokeNoC = ConstantInt::get(Type::Int32Ty, InvokeNo);
+
+  // If the unwind edge has phi nodes, split the edge.
+  if (isa<PHINode>(II->getUnwindDest()->begin())) {
+    SplitCriticalEdge(II, 1, this);
+
+    // If there are any phi nodes left, they must have a single predecessor.
+    while (PHINode *PN = dyn_cast<PHINode>(II->getUnwindDest()->begin())) {
+      PN->replaceAllUsesWith(PN->getIncomingValue(0));
+      PN->eraseFromParent();
+    }
+  }
+
+  // Insert a store of the invoke num before the invoke and store zero into the
+  // location afterward.
+  new StoreInst(InvokeNoC, InvokeNum, true, II);  // volatile
+
+  BasicBlock::iterator NI = II->getNormalDest()->getFirstNonPHI();
+  // nonvolatile.
+  new StoreInst(Constant::getNullValue(Type::Int32Ty), InvokeNum, false, NI);
+
+  // Add a switch case to our unwind block.
+  CatchSwitch->addCase(InvokeNoC, II->getUnwindDest());
+
+  // Insert a normal call instruction.
+  std::vector<Value*> CallArgs(II->op_begin()+3, II->op_end());
+  CallInst *NewCall = CallInst::Create(II->getCalledValue(),
+                                       CallArgs.begin(), CallArgs.end(), "",
+                                       II);
+  NewCall->takeName(II);
+  NewCall->setCallingConv(II->getCallingConv());
+  NewCall->setAttributes(II->getAttributes());
+  II->replaceAllUsesWith(NewCall);
+
+  // Replace the invoke with an uncond branch.
+  BranchInst::Create(II->getNormalDest(), NewCall->getParent());
+  II->eraseFromParent();
+}
+
+/// MarkBlocksLiveIn - Insert BB and all of its predescessors into LiveBBs until
+/// we reach blocks we've already seen.
+static void MarkBlocksLiveIn(BasicBlock *BB, std::set<BasicBlock*> &LiveBBs) {
+  if (!LiveBBs.insert(BB).second) return; // already been here.
+
+  for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+    MarkBlocksLiveIn(*PI, LiveBBs);
+}
+
+// First thing we need to do is scan the whole function for values that are
+// live across unwind edges.  Each value that is live across an unwind edge
+// we spill into a stack location, guaranteeing that there is nothing live
+// across the unwind edge.  This process also splits all critical edges
+// coming out of invoke's.
+void LowerInvoke::
+splitLiveRangesLiveAcrossInvokes(std::vector<InvokeInst*> &Invokes) {
+  // First step, split all critical edges from invoke instructions.
+  for (unsigned i = 0, e = Invokes.size(); i != e; ++i) {
+    InvokeInst *II = Invokes[i];
+    SplitCriticalEdge(II, 0, this);
+    SplitCriticalEdge(II, 1, this);
+    assert(!isa<PHINode>(II->getNormalDest()) &&
+           !isa<PHINode>(II->getUnwindDest()) &&
+           "critical edge splitting left single entry phi nodes?");
+  }
+
+  Function *F = Invokes.back()->getParent()->getParent();
+
+  // To avoid having to handle incoming arguments specially, we lower each arg
+  // to a copy instruction in the entry block.  This ensures that the argument
+  // value itself cannot be live across the entry block.
+  BasicBlock::iterator AfterAllocaInsertPt = F->begin()->begin();
+  while (isa<AllocaInst>(AfterAllocaInsertPt) &&
+        isa<ConstantInt>(cast<AllocaInst>(AfterAllocaInsertPt)->getArraySize()))
+    ++AfterAllocaInsertPt;
+  for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
+       AI != E; ++AI) {
+    // This is always a no-op cast because we're casting AI to AI->getType() so
+    // src and destination types are identical. BitCast is the only possibility.
+    CastInst *NC = new BitCastInst(
+      AI, AI->getType(), AI->getName()+".tmp", AfterAllocaInsertPt);
+    AI->replaceAllUsesWith(NC);
+    // Normally its is forbidden to replace a CastInst's operand because it
+    // could cause the opcode to reflect an illegal conversion. However, we're
+    // replacing it here with the same value it was constructed with to simply
+    // make NC its user.
+    NC->setOperand(0, AI);
+  }
+
+  // Finally, scan the code looking for instructions with bad live ranges.
+  for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+    for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ++II) {
+      // Ignore obvious cases we don't have to handle.  In particular, most
+      // instructions either have no uses or only have a single use inside the
+      // current block.  Ignore them quickly.
+      Instruction *Inst = II;
+      if (Inst->use_empty()) continue;
+      if (Inst->hasOneUse() &&
+          cast<Instruction>(Inst->use_back())->getParent() == BB &&
+          !isa<PHINode>(Inst->use_back())) continue;
+
+      // If this is an alloca in the entry block, it's not a real register
+      // value.
+      if (AllocaInst *AI = dyn_cast<AllocaInst>(Inst))
+        if (isa<ConstantInt>(AI->getArraySize()) && BB == F->begin())
+          continue;
+
+      // Avoid iterator invalidation by copying users to a temporary vector.
+      std::vector<Instruction*> Users;
+      for (Value::use_iterator UI = Inst->use_begin(), E = Inst->use_end();
+           UI != E; ++UI) {
+        Instruction *User = cast<Instruction>(*UI);
+        if (User->getParent() != BB || isa<PHINode>(User))
+          Users.push_back(User);
+      }
+
+      // Scan all of the uses and see if the live range is live across an unwind
+      // edge.  If we find a use live across an invoke edge, create an alloca
+      // and spill the value.
+      std::set<InvokeInst*> InvokesWithStoreInserted;
+
+      // Find all of the blocks that this value is live in.
+      std::set<BasicBlock*> LiveBBs;
+      LiveBBs.insert(Inst->getParent());
+      while (!Users.empty()) {
+        Instruction *U = Users.back();
+        Users.pop_back();
+
+        if (!isa<PHINode>(U)) {
+          MarkBlocksLiveIn(U->getParent(), LiveBBs);
+        } else {
+          // Uses for a PHI node occur in their predecessor block.
+          PHINode *PN = cast<PHINode>(U);
+          for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+            if (PN->getIncomingValue(i) == Inst)
+              MarkBlocksLiveIn(PN->getIncomingBlock(i), LiveBBs);
+        }
+      }
+
+      // Now that we know all of the blocks that this thing is live in, see if
+      // it includes any of the unwind locations.
+      bool NeedsSpill = false;
+      for (unsigned i = 0, e = Invokes.size(); i != e; ++i) {
+        BasicBlock *UnwindBlock = Invokes[i]->getUnwindDest();
+        if (UnwindBlock != BB && LiveBBs.count(UnwindBlock)) {
+          NeedsSpill = true;
+        }
+      }
+
+      // If we decided we need a spill, do it.
+      if (NeedsSpill) {
+        ++NumSpilled;
+        DemoteRegToStack(*Inst, true);
+      }
+    }
+}
+
+bool LowerInvoke::insertExpensiveEHSupport(Function &F) {
+  std::vector<ReturnInst*> Returns;
+  std::vector<UnwindInst*> Unwinds;
+  std::vector<InvokeInst*> Invokes;
+
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+    if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
+      // Remember all return instructions in case we insert an invoke into this
+      // function.
+      Returns.push_back(RI);
+    } else if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) {
+      Invokes.push_back(II);
+    } else if (UnwindInst *UI = dyn_cast<UnwindInst>(BB->getTerminator())) {
+      Unwinds.push_back(UI);
+    }
+
+  if (Unwinds.empty() && Invokes.empty()) return false;
+
+  NumInvokes += Invokes.size();
+  NumUnwinds += Unwinds.size();
+
+  // TODO: This is not an optimal way to do this.  In particular, this always
+  // inserts setjmp calls into the entries of functions with invoke instructions
+  // even though there are possibly paths through the function that do not
+  // execute any invokes.  In particular, for functions with early exits, e.g.
+  // the 'addMove' method in hexxagon, it would be nice to not have to do the
+  // setjmp stuff on the early exit path.  This requires a bit of dataflow, but
+  // would not be too hard to do.
+
+  // If we have an invoke instruction, insert a setjmp that dominates all
+  // invokes.  After the setjmp, use a cond branch that goes to the original
+  // code path on zero, and to a designated 'catch' block of nonzero.
+  Value *OldJmpBufPtr = 0;
+  if (!Invokes.empty()) {
+    // First thing we need to do is scan the whole function for values that are
+    // live across unwind edges.  Each value that is live across an unwind edge
+    // we spill into a stack location, guaranteeing that there is nothing live
+    // across the unwind edge.  This process also splits all critical edges
+    // coming out of invoke's.
+    splitLiveRangesLiveAcrossInvokes(Invokes);
+
+    BasicBlock *EntryBB = F.begin();
+
+    // Create an alloca for the incoming jump buffer ptr and the new jump buffer
+    // that needs to be restored on all exits from the function.  This is an
+    // alloca because the value needs to be live across invokes.
+    unsigned Align = TLI ? TLI->getJumpBufAlignment() : 0;
+    AllocaInst *JmpBuf =
+      new AllocaInst(JBLinkTy, 0, Align, "jblink", F.begin()->begin());
+
+    std::vector<Value*> Idx;
+    Idx.push_back(Constant::getNullValue(Type::Int32Ty));
+    Idx.push_back(ConstantInt::get(Type::Int32Ty, 1));
+    OldJmpBufPtr = GetElementPtrInst::Create(JmpBuf, Idx.begin(), Idx.end(),
+                                             "OldBuf", EntryBB->getTerminator());
+
+    // Copy the JBListHead to the alloca.
+    Value *OldBuf = new LoadInst(JBListHead, "oldjmpbufptr", true,
+                                 EntryBB->getTerminator());
+    new StoreInst(OldBuf, OldJmpBufPtr, true, EntryBB->getTerminator());
+
+    // Add the new jumpbuf to the list.
+    new StoreInst(JmpBuf, JBListHead, true, EntryBB->getTerminator());
+
+    // Create the catch block.  The catch block is basically a big switch
+    // statement that goes to all of the invoke catch blocks.
+    BasicBlock *CatchBB = BasicBlock::Create("setjmp.catch", &F);
+
+    // Create an alloca which keeps track of which invoke is currently
+    // executing.  For normal calls it contains zero.
+    AllocaInst *InvokeNum = new AllocaInst(Type::Int32Ty, 0, "invokenum",
+                                           EntryBB->begin());
+    new StoreInst(ConstantInt::get(Type::Int32Ty, 0), InvokeNum, true,
+                  EntryBB->getTerminator());
+
+    // Insert a load in the Catch block, and a switch on its value.  By default,
+    // we go to a block that just does an unwind (which is the correct action
+    // for a standard call).
+    BasicBlock *UnwindBB = BasicBlock::Create("unwindbb", &F);
+    Unwinds.push_back(new UnwindInst(UnwindBB));
+
+    Value *CatchLoad = new LoadInst(InvokeNum, "invoke.num", true, CatchBB);
+    SwitchInst *CatchSwitch =
+      SwitchInst::Create(CatchLoad, UnwindBB, Invokes.size(), CatchBB);
+
+    // Now that things are set up, insert the setjmp call itself.
+
+    // Split the entry block to insert the conditional branch for the setjmp.
+    BasicBlock *ContBlock = EntryBB->splitBasicBlock(EntryBB->getTerminator(),
+                                                     "setjmp.cont");
+
+    Idx[1] = ConstantInt::get(Type::Int32Ty, 0);
+    Value *JmpBufPtr = GetElementPtrInst::Create(JmpBuf, Idx.begin(), Idx.end(),
+                                                 "TheJmpBuf",
+                                                 EntryBB->getTerminator());
+    JmpBufPtr = new BitCastInst(JmpBufPtr, PointerType::getUnqual(Type::Int8Ty),
+                                "tmp", EntryBB->getTerminator());
+    Value *SJRet = CallInst::Create(SetJmpFn, JmpBufPtr, "sjret",
+                                    EntryBB->getTerminator());
+
+    // Compare the return value to zero.
+    Value *IsNormal = new ICmpInst(ICmpInst::ICMP_EQ, SJRet,
+                                   Constant::getNullValue(SJRet->getType()),
+      "notunwind", EntryBB->getTerminator());
+    // Nuke the uncond branch.
+    EntryBB->getTerminator()->eraseFromParent();
+
+    // Put in a new condbranch in its place.
+    BranchInst::Create(ContBlock, CatchBB, IsNormal, EntryBB);
+
+    // At this point, we are all set up, rewrite each invoke instruction.
+    for (unsigned i = 0, e = Invokes.size(); i != e; ++i)
+      rewriteExpensiveInvoke(Invokes[i], i+1, InvokeNum, CatchSwitch);
+  }
+
+  // We know that there is at least one unwind.
+
+  // Create three new blocks, the block to load the jmpbuf ptr and compare
+  // against null, the block to do the longjmp, and the error block for if it
+  // is null.  Add them at the end of the function because they are not hot.
+  BasicBlock *UnwindHandler = BasicBlock::Create("dounwind", &F);
+  BasicBlock *UnwindBlock = BasicBlock::Create("unwind", &F);
+  BasicBlock *TermBlock = BasicBlock::Create("unwinderror", &F);
+
+  // If this function contains an invoke, restore the old jumpbuf ptr.
+  Value *BufPtr;
+  if (OldJmpBufPtr) {
+    // Before the return, insert a copy from the saved value to the new value.
+    BufPtr = new LoadInst(OldJmpBufPtr, "oldjmpbufptr", UnwindHandler);
+    new StoreInst(BufPtr, JBListHead, UnwindHandler);
+  } else {
+    BufPtr = new LoadInst(JBListHead, "ehlist", UnwindHandler);
+  }
+
+  // Load the JBList, if it's null, then there was no catch!
+  Value *NotNull = new ICmpInst(ICmpInst::ICMP_NE, BufPtr,
+                                Constant::getNullValue(BufPtr->getType()),
+    "notnull", UnwindHandler);
+  BranchInst::Create(UnwindBlock, TermBlock, NotNull, UnwindHandler);
+
+  // Create the block to do the longjmp.
+  // Get a pointer to the jmpbuf and longjmp.
+  std::vector<Value*> Idx;
+  Idx.push_back(Constant::getNullValue(Type::Int32Ty));
+  Idx.push_back(ConstantInt::get(Type::Int32Ty, 0));
+  Idx[0] = GetElementPtrInst::Create(BufPtr, Idx.begin(), Idx.end(), "JmpBuf",
+                                     UnwindBlock);
+  Idx[0] = new BitCastInst(Idx[0], PointerType::getUnqual(Type::Int8Ty),
+                           "tmp", UnwindBlock);
+  Idx[1] = ConstantInt::get(Type::Int32Ty, 1);
+  CallInst::Create(LongJmpFn, Idx.begin(), Idx.end(), "", UnwindBlock);
+  new UnreachableInst(UnwindBlock);
+
+  // Set up the term block ("throw without a catch").
+  new UnreachableInst(TermBlock);
+
+  // Insert a new call to write(2, AbortMessage, AbortMessageLength);
+  writeAbortMessage(TermBlock->getTerminator());
+
+  // Insert a call to abort()
+  CallInst::Create(AbortFn, "",
+                   TermBlock->getTerminator())->setTailCall();
+
+
+  // Replace all unwinds with a branch to the unwind handler.
+  for (unsigned i = 0, e = Unwinds.size(); i != e; ++i) {
+    BranchInst::Create(UnwindHandler, Unwinds[i]);
+    Unwinds[i]->eraseFromParent();
+  }
+
+  // Finally, for any returns from this function, if this function contains an
+  // invoke, restore the old jmpbuf pointer to its input value.
+  if (OldJmpBufPtr) {
+    for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
+      ReturnInst *R = Returns[i];
+
+      // Before the return, insert a copy from the saved value to the new value.
+      Value *OldBuf = new LoadInst(OldJmpBufPtr, "oldjmpbufptr", true, R);
+      new StoreInst(OldBuf, JBListHead, true, R);
+    }
+  }
+
+  return true;
+}
+
+bool LowerInvoke::runOnFunction(Function &F) {
+  if (ExpensiveEHSupport)
+    return insertExpensiveEHSupport(F);
+  else
+    return insertCheapEHSupport(F);
+}
diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp
new file mode 100644
index 0000000..1da5936
--- /dev/null
+++ b/lib/Transforms/Utils/LowerSwitch.cpp
@@ -0,0 +1,323 @@
+//===- LowerSwitch.cpp - Eliminate Switch instructions --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The LowerSwitch transformation rewrites switch instructions with a sequence
+// of branches, which allows targets to get away with not implementing the
+// switch instruction until it is convenient.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+using namespace llvm;
+
+namespace {
+  /// LowerSwitch Pass - Replace all SwitchInst instructions with chained branch
+  /// instructions.  Note that this cannot be a BasicBlock pass because it
+  /// modifies the CFG!
+  class VISIBILITY_HIDDEN LowerSwitch : public FunctionPass {
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    LowerSwitch() : FunctionPass(&ID) {} 
+
+    virtual bool runOnFunction(Function &F);
+    
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      // This is a cluster of orthogonal Transforms
+      AU.addPreserved<UnifyFunctionExitNodes>();
+      AU.addPreservedID(PromoteMemoryToRegisterID);
+      AU.addPreservedID(LowerInvokePassID);
+      AU.addPreservedID(LowerAllocationsID);
+    }
+
+    struct CaseRange {
+      Constant* Low;
+      Constant* High;
+      BasicBlock* BB;
+
+      CaseRange() : Low(0), High(0), BB(0) { }
+      CaseRange(Constant* low, Constant* high, BasicBlock* bb) :
+        Low(low), High(high), BB(bb) { }
+    };
+
+    typedef std::vector<CaseRange>           CaseVector;
+    typedef std::vector<CaseRange>::iterator CaseItr;
+  private:
+    void processSwitchInst(SwitchInst *SI);
+
+    BasicBlock* switchConvert(CaseItr Begin, CaseItr End, Value* Val,
+                              BasicBlock* OrigBlock, BasicBlock* Default);
+    BasicBlock* newLeafBlock(CaseRange& Leaf, Value* Val,
+                             BasicBlock* OrigBlock, BasicBlock* Default);
+    unsigned Clusterify(CaseVector& Cases, SwitchInst *SI);
+  };
+
+  /// The comparison function for sorting the switch case values in the vector.
+  /// WARNING: Case ranges should be disjoint!
+  struct CaseCmp {
+    bool operator () (const LowerSwitch::CaseRange& C1,
+                      const LowerSwitch::CaseRange& C2) {
+
+      const ConstantInt* CI1 = cast<const ConstantInt>(C1.Low);
+      const ConstantInt* CI2 = cast<const ConstantInt>(C2.High);
+      return CI1->getValue().slt(CI2->getValue());
+    }
+  };
+}
+
+char LowerSwitch::ID = 0;
+static RegisterPass<LowerSwitch>
+X("lowerswitch", "Lower SwitchInst's to branches");
+
+// Publically exposed interface to pass...
+const PassInfo *const llvm::LowerSwitchID = &X;
+// createLowerSwitchPass - Interface to this file...
+FunctionPass *llvm::createLowerSwitchPass() {
+  return new LowerSwitch();
+}
+
+bool LowerSwitch::runOnFunction(Function &F) {
+  bool Changed = false;
+
+  for (Function::iterator I = F.begin(), E = F.end(); I != E; ) {
+    BasicBlock *Cur = I++; // Advance over block so we don't traverse new blocks
+
+    if (SwitchInst *SI = dyn_cast<SwitchInst>(Cur->getTerminator())) {
+      Changed = true;
+      processSwitchInst(SI);
+    }
+  }
+
+  return Changed;
+}
+
+// operator<< - Used for debugging purposes.
+//
+static std::ostream& operator<<(std::ostream &O,
+                                const LowerSwitch::CaseVector &C) {
+  O << "[";
+
+  for (LowerSwitch::CaseVector::const_iterator B = C.begin(),
+         E = C.end(); B != E; ) {
+    O << *B->Low << " -" << *B->High;
+    if (++B != E) O << ", ";
+  }
+
+  return O << "]";
+}
+
+static OStream& operator<<(OStream &O, const LowerSwitch::CaseVector &C) {
+  if (O.stream()) *O.stream() << C;
+  return O;
+}
+
+// switchConvert - Convert the switch statement into a binary lookup of
+// the case values. The function recursively builds this tree.
+//
+BasicBlock* LowerSwitch::switchConvert(CaseItr Begin, CaseItr End,
+                                       Value* Val, BasicBlock* OrigBlock,
+                                       BasicBlock* Default)
+{
+  unsigned Size = End - Begin;
+
+  if (Size == 1)
+    return newLeafBlock(*Begin, Val, OrigBlock, Default);
+
+  unsigned Mid = Size / 2;
+  std::vector<CaseRange> LHS(Begin, Begin + Mid);
+  DOUT << "LHS: " << LHS << "\n";
+  std::vector<CaseRange> RHS(Begin + Mid, End);
+  DOUT << "RHS: " << RHS << "\n";
+
+  CaseRange& Pivot = *(Begin + Mid);
+  DEBUG(errs() << "Pivot ==> " 
+               << cast<ConstantInt>(Pivot.Low)->getValue() << " -"
+               << cast<ConstantInt>(Pivot.High)->getValue() << "\n");
+
+  BasicBlock* LBranch = switchConvert(LHS.begin(), LHS.end(), Val,
+                                      OrigBlock, Default);
+  BasicBlock* RBranch = switchConvert(RHS.begin(), RHS.end(), Val,
+                                      OrigBlock, Default);
+
+  // Create a new node that checks if the value is < pivot. Go to the
+  // left branch if it is and right branch if not.
+  Function* F = OrigBlock->getParent();
+  BasicBlock* NewNode = BasicBlock::Create("NodeBlock");
+  Function::iterator FI = OrigBlock;
+  F->getBasicBlockList().insert(++FI, NewNode);
+
+  ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_SLT, Val, Pivot.Low, "Pivot");
+  NewNode->getInstList().push_back(Comp);
+  BranchInst::Create(LBranch, RBranch, Comp, NewNode);
+  return NewNode;
+}
+
+// newLeafBlock - Create a new leaf block for the binary lookup tree. It
+// checks if the switch's value == the case's value. If not, then it
+// jumps to the default branch. At this point in the tree, the value
+// can't be another valid case value, so the jump to the "default" branch
+// is warranted.
+//
+BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val,
+                                      BasicBlock* OrigBlock,
+                                      BasicBlock* Default)
+{
+  Function* F = OrigBlock->getParent();
+  BasicBlock* NewLeaf = BasicBlock::Create("LeafBlock");
+  Function::iterator FI = OrigBlock;
+  F->getBasicBlockList().insert(++FI, NewLeaf);
+
+  // Emit comparison
+  ICmpInst* Comp = NULL;
+  if (Leaf.Low == Leaf.High) {
+    // Make the seteq instruction...
+    Comp = new ICmpInst(ICmpInst::ICMP_EQ, Val, Leaf.Low,
+                        "SwitchLeaf", NewLeaf);
+  } else {
+    // Make range comparison
+    if (cast<ConstantInt>(Leaf.Low)->isMinValue(true /*isSigned*/)) {
+      // Val >= Min && Val <= Hi --> Val <= Hi
+      Comp = new ICmpInst(ICmpInst::ICMP_SLE, Val, Leaf.High,
+                          "SwitchLeaf", NewLeaf);
+    } else if (cast<ConstantInt>(Leaf.Low)->isZero()) {
+      // Val >= 0 && Val <= Hi --> Val <=u Hi
+      Comp = new ICmpInst(ICmpInst::ICMP_ULE, Val, Leaf.High,
+                          "SwitchLeaf", NewLeaf);      
+    } else {
+      // Emit V-Lo <=u Hi-Lo
+      Constant* NegLo = ConstantExpr::getNeg(Leaf.Low);
+      Instruction* Add = BinaryOperator::CreateAdd(Val, NegLo,
+                                                   Val->getName()+".off",
+                                                   NewLeaf);
+      Constant *UpperBound = ConstantExpr::getAdd(NegLo, Leaf.High);
+      Comp = new ICmpInst(ICmpInst::ICMP_ULE, Add, UpperBound,
+                          "SwitchLeaf", NewLeaf);
+    }
+  }
+
+  // Make the conditional branch...
+  BasicBlock* Succ = Leaf.BB;
+  BranchInst::Create(Succ, Default, Comp, NewLeaf);
+
+  // If there were any PHI nodes in this successor, rewrite one entry
+  // from OrigBlock to come from NewLeaf.
+  for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) {
+    PHINode* PN = cast<PHINode>(I);
+    // Remove all but one incoming entries from the cluster
+    uint64_t Range = cast<ConstantInt>(Leaf.High)->getSExtValue() -
+                     cast<ConstantInt>(Leaf.Low)->getSExtValue();    
+    for (uint64_t j = 0; j < Range; ++j) {
+      PN->removeIncomingValue(OrigBlock);
+    }
+    
+    int BlockIdx = PN->getBasicBlockIndex(OrigBlock);
+    assert(BlockIdx != -1 && "Switch didn't go to this successor??");
+    PN->setIncomingBlock((unsigned)BlockIdx, NewLeaf);
+  }
+
+  return NewLeaf;
+}
+
+// Clusterify - Transform simple list of Cases into list of CaseRange's
+unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) {
+  unsigned numCmps = 0;
+
+  // Start with "simple" cases
+  for (unsigned i = 1; i < SI->getNumSuccessors(); ++i)
+    Cases.push_back(CaseRange(SI->getSuccessorValue(i),
+                              SI->getSuccessorValue(i),
+                              SI->getSuccessor(i)));
+  std::sort(Cases.begin(), Cases.end(), CaseCmp());
+
+  // Merge case into clusters
+  if (Cases.size()>=2)
+    for (CaseItr I=Cases.begin(), J=next(Cases.begin()); J!=Cases.end(); ) {
+      int64_t nextValue = cast<ConstantInt>(J->Low)->getSExtValue();
+      int64_t currentValue = cast<ConstantInt>(I->High)->getSExtValue();
+      BasicBlock* nextBB = J->BB;
+      BasicBlock* currentBB = I->BB;
+
+      // If the two neighboring cases go to the same destination, merge them
+      // into a single case.
+      if ((nextValue-currentValue==1) && (currentBB == nextBB)) {
+        I->High = J->High;
+        J = Cases.erase(J);
+      } else {
+        I = J++;
+      }
+    }
+
+  for (CaseItr I=Cases.begin(), E=Cases.end(); I!=E; ++I, ++numCmps) {
+    if (I->Low != I->High)
+      // A range counts double, since it requires two compares.
+      ++numCmps;
+  }
+
+  return numCmps;
+}
+
+// processSwitchInst - Replace the specified switch instruction with a sequence
+// of chained if-then insts in a balanced binary search.
+//
+void LowerSwitch::processSwitchInst(SwitchInst *SI) {
+  BasicBlock *CurBlock = SI->getParent();
+  BasicBlock *OrigBlock = CurBlock;
+  Function *F = CurBlock->getParent();
+  Value *Val = SI->getOperand(0);  // The value we are switching on...
+  BasicBlock* Default = SI->getDefaultDest();
+
+  // If there is only the default destination, don't bother with the code below.
+  if (SI->getNumOperands() == 2) {
+    BranchInst::Create(SI->getDefaultDest(), CurBlock);
+    CurBlock->getInstList().erase(SI);
+    return;
+  }
+
+  // Create a new, empty default block so that the new hierarchy of
+  // if-then statements go to this and the PHI nodes are happy.
+  BasicBlock* NewDefault = BasicBlock::Create("NewDefault");
+  F->getBasicBlockList().insert(Default, NewDefault);
+
+  BranchInst::Create(Default, NewDefault);
+
+  // If there is an entry in any PHI nodes for the default edge, make sure
+  // to update them as well.
+  for (BasicBlock::iterator I = Default->begin(); isa<PHINode>(I); ++I) {
+    PHINode *PN = cast<PHINode>(I);
+    int BlockIdx = PN->getBasicBlockIndex(OrigBlock);
+    assert(BlockIdx != -1 && "Switch didn't go to this successor??");
+    PN->setIncomingBlock((unsigned)BlockIdx, NewDefault);
+  }
+
+  // Prepare cases vector.
+  CaseVector Cases;
+  unsigned numCmps = Clusterify(Cases, SI);
+
+  DOUT << "Clusterify finished. Total clusters: " << Cases.size()
+       << ". Total compares: " << numCmps << "\n";
+  DOUT << "Cases: " << Cases << "\n";
+  
+  BasicBlock* SwitchBlock = switchConvert(Cases.begin(), Cases.end(), Val,
+                                          OrigBlock, NewDefault);
+
+  // Branch to our shiny new if-then stuff...
+  BranchInst::Create(SwitchBlock, OrigBlock);
+
+  // We are now done with the switch instruction, delete it.
+  CurBlock->getInstList().erase(SI);
+}
diff --git a/lib/Transforms/Utils/Makefile b/lib/Transforms/Utils/Makefile
new file mode 100644
index 0000000..d1e9336
--- /dev/null
+++ b/lib/Transforms/Utils/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Transforms/Utils/Makefile -----------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMTransformUtils
+BUILD_ARCHIVE = 1
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Transforms/Utils/Mem2Reg.cpp b/lib/Transforms/Utils/Mem2Reg.cpp
new file mode 100644
index 0000000..2b06d77
--- /dev/null
+++ b/lib/Transforms/Utils/Mem2Reg.cpp
@@ -0,0 +1,92 @@
+//===- Mem2Reg.cpp - The -mem2reg pass, a wrapper around the Utils lib ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is a simple pass wrapper around the PromoteMemToReg function call
+// exposed by the Utils library.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mem2reg"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Instructions.h"
+#include "llvm/Function.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+using namespace llvm;
+
+STATISTIC(NumPromoted, "Number of alloca's promoted");
+
+namespace {
+  struct VISIBILITY_HIDDEN PromotePass : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    PromotePass() : FunctionPass(&ID) {}
+
+    // runOnFunction - To run this pass, first we calculate the alloca
+    // instructions that are safe for promotion, then we promote each one.
+    //
+    virtual bool runOnFunction(Function &F);
+
+    // getAnalysisUsage - We need dominance frontiers
+    //
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<DominatorTree>();
+      AU.addRequired<DominanceFrontier>();
+      AU.setPreservesCFG();
+      // This is a cluster of orthogonal Transforms
+      AU.addPreserved<UnifyFunctionExitNodes>();
+      AU.addPreservedID(LowerSwitchID);
+      AU.addPreservedID(LowerInvokePassID);
+      AU.addPreservedID(LowerAllocationsID);
+    }
+  };
+}  // end of anonymous namespace
+
+char PromotePass::ID = 0;
+static RegisterPass<PromotePass> X("mem2reg", "Promote Memory to Register");
+
+bool PromotePass::runOnFunction(Function &F) {
+  std::vector<AllocaInst*> Allocas;
+
+  BasicBlock &BB = F.getEntryBlock();  // Get the entry node for the function
+
+  bool Changed  = false;
+
+  DominatorTree &DT = getAnalysis<DominatorTree>();
+  DominanceFrontier &DF = getAnalysis<DominanceFrontier>();
+
+  while (1) {
+    Allocas.clear();
+
+    // Find allocas that are safe to promote, by looking at all instructions in
+    // the entry node
+    for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
+      if (AllocaInst *AI = dyn_cast<AllocaInst>(I))       // Is it an alloca?
+        if (isAllocaPromotable(AI))
+          Allocas.push_back(AI);
+
+    if (Allocas.empty()) break;
+
+    PromoteMemToReg(Allocas, DT, DF);
+    NumPromoted += Allocas.size();
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+// Publically exposed interface to pass...
+const PassInfo *const llvm::PromoteMemoryToRegisterID = &X;
+// createPromoteMemoryToRegister - Provide an entry point to create this pass.
+//
+FunctionPass *llvm::createPromoteMemoryToRegisterPass() {
+  return new PromotePass();
+}
diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
new file mode 100644
index 0000000..b717699
--- /dev/null
+++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -0,0 +1,1003 @@
+//===- PromoteMemoryToRegister.cpp - Convert allocas to registers ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file promotes memory references to be register references.  It promotes
+// alloca instructions which only have loads and stores as uses.  An alloca is
+// transformed by using dominator frontiers to place PHI nodes, then traversing
+// the function in depth-first order to rewrite loads and stores as appropriate.
+// This is just the standard SSA construction algorithm to construct "pruned"
+// SSA form.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mem2reg"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumLocalPromoted, "Number of alloca's promoted within one block");
+STATISTIC(NumSingleStore,   "Number of alloca's promoted with a single store");
+STATISTIC(NumDeadAlloca,    "Number of dead alloca's removed");
+STATISTIC(NumPHIInsert,     "Number of PHI nodes inserted");
+
+// Provide DenseMapInfo for all pointers.
+namespace llvm {
+template<>
+struct DenseMapInfo<std::pair<BasicBlock*, unsigned> > {
+  typedef std::pair<BasicBlock*, unsigned> EltTy;
+  static inline EltTy getEmptyKey() {
+    return EltTy(reinterpret_cast<BasicBlock*>(-1), ~0U);
+  }
+  static inline EltTy getTombstoneKey() {
+    return EltTy(reinterpret_cast<BasicBlock*>(-2), 0U);
+  }
+  static unsigned getHashValue(const std::pair<BasicBlock*, unsigned> &Val) {
+    return DenseMapInfo<void*>::getHashValue(Val.first) + Val.second*2;
+  }
+  static bool isEqual(const EltTy &LHS, const EltTy &RHS) {
+    return LHS == RHS;
+  }
+  static bool isPod() { return true; }
+};
+}
+
+/// isAllocaPromotable - Return true if this alloca is legal for promotion.
+/// This is true if there are only loads and stores to the alloca.
+///
+bool llvm::isAllocaPromotable(const AllocaInst *AI) {
+  // FIXME: If the memory unit is of pointer or integer type, we can permit
+  // assignments to subsections of the memory unit.
+
+  // Only allow direct and non-volatile loads and stores...
+  for (Value::use_const_iterator UI = AI->use_begin(), UE = AI->use_end();
+       UI != UE; ++UI)     // Loop over all of the uses of the alloca
+    if (const LoadInst *LI = dyn_cast<LoadInst>(*UI)) {
+      if (LI->isVolatile())
+        return false;
+    } else if (const StoreInst *SI = dyn_cast<StoreInst>(*UI)) {
+      if (SI->getOperand(0) == AI)
+        return false;   // Don't allow a store OF the AI, only INTO the AI.
+      if (SI->isVolatile())
+        return false;
+    } else if (const BitCastInst *BC = dyn_cast<BitCastInst>(*UI)) {
+      // A bitcast that does not feed into debug info inhibits promotion.
+      if (!BC->hasOneUse() || !isa<DbgInfoIntrinsic>(*BC->use_begin()))
+        return false;
+      // If the only use is by debug info, this alloca will not exist in
+      // non-debug code, so don't try to promote; this ensures the same
+      // codegen with debug info.  Otherwise, debug info should not
+      // inhibit promotion (but we must examine other uses).
+      if (AI->hasOneUse())
+        return false;
+    } else {
+      return false;
+    }
+
+  return true;
+}
+
+namespace {
+  struct AllocaInfo;
+
+  // Data package used by RenamePass()
+  class VISIBILITY_HIDDEN RenamePassData {
+  public:
+    typedef std::vector<Value *> ValVector;
+    
+    RenamePassData() {}
+    RenamePassData(BasicBlock *B, BasicBlock *P,
+                   const ValVector &V) : BB(B), Pred(P), Values(V) {}
+    BasicBlock *BB;
+    BasicBlock *Pred;
+    ValVector Values;
+    
+    void swap(RenamePassData &RHS) {
+      std::swap(BB, RHS.BB);
+      std::swap(Pred, RHS.Pred);
+      Values.swap(RHS.Values);
+    }
+  };
+  
+  /// LargeBlockInfo - This assigns and keeps a per-bb relative ordering of
+  /// load/store instructions in the block that directly load or store an alloca.
+  ///
+  /// This functionality is important because it avoids scanning large basic
+  /// blocks multiple times when promoting many allocas in the same block.
+  class VISIBILITY_HIDDEN LargeBlockInfo {
+    /// InstNumbers - For each instruction that we track, keep the index of the
+    /// instruction.  The index starts out as the number of the instruction from
+    /// the start of the block.
+    DenseMap<const Instruction *, unsigned> InstNumbers;
+  public:
+    
+    /// isInterestingInstruction - This code only looks at accesses to allocas.
+    static bool isInterestingInstruction(const Instruction *I) {
+      return (isa<LoadInst>(I) && isa<AllocaInst>(I->getOperand(0))) ||
+             (isa<StoreInst>(I) && isa<AllocaInst>(I->getOperand(1)));
+    }
+    
+    /// getInstructionIndex - Get or calculate the index of the specified
+    /// instruction.
+    unsigned getInstructionIndex(const Instruction *I) {
+      assert(isInterestingInstruction(I) &&
+             "Not a load/store to/from an alloca?");
+      
+      // If we already have this instruction number, return it.
+      DenseMap<const Instruction *, unsigned>::iterator It = InstNumbers.find(I);
+      if (It != InstNumbers.end()) return It->second;
+      
+      // Scan the whole block to get the instruction.  This accumulates
+      // information for every interesting instruction in the block, in order to
+      // avoid gratuitus rescans.
+      const BasicBlock *BB = I->getParent();
+      unsigned InstNo = 0;
+      for (BasicBlock::const_iterator BBI = BB->begin(), E = BB->end();
+           BBI != E; ++BBI)
+        if (isInterestingInstruction(BBI))
+          InstNumbers[BBI] = InstNo++;
+      It = InstNumbers.find(I);
+      
+      assert(It != InstNumbers.end() && "Didn't insert instruction?");
+      return It->second;
+    }
+    
+    void deleteValue(const Instruction *I) {
+      InstNumbers.erase(I);
+    }
+    
+    void clear() {
+      InstNumbers.clear();
+    }
+  };
+
+  struct VISIBILITY_HIDDEN PromoteMem2Reg {
+    /// Allocas - The alloca instructions being promoted.
+    ///
+    std::vector<AllocaInst*> Allocas;
+    DominatorTree &DT;
+    DominanceFrontier &DF;
+
+    /// AST - An AliasSetTracker object to update.  If null, don't update it.
+    ///
+    AliasSetTracker *AST;
+
+    /// AllocaLookup - Reverse mapping of Allocas.
+    ///
+    std::map<AllocaInst*, unsigned>  AllocaLookup;
+
+    /// NewPhiNodes - The PhiNodes we're adding.
+    ///
+    DenseMap<std::pair<BasicBlock*, unsigned>, PHINode*> NewPhiNodes;
+    
+    /// PhiToAllocaMap - For each PHI node, keep track of which entry in Allocas
+    /// it corresponds to.
+    DenseMap<PHINode*, unsigned> PhiToAllocaMap;
+    
+    /// PointerAllocaValues - If we are updating an AliasSetTracker, then for
+    /// each alloca that is of pointer type, we keep track of what to copyValue
+    /// to the inserted PHI nodes here.
+    ///
+    std::vector<Value*> PointerAllocaValues;
+
+    /// Visited - The set of basic blocks the renamer has already visited.
+    ///
+    SmallPtrSet<BasicBlock*, 16> Visited;
+
+    /// BBNumbers - Contains a stable numbering of basic blocks to avoid
+    /// non-determinstic behavior.
+    DenseMap<BasicBlock*, unsigned> BBNumbers;
+
+    /// BBNumPreds - Lazily compute the number of predecessors a block has.
+    DenseMap<const BasicBlock*, unsigned> BBNumPreds;
+  public:
+    PromoteMem2Reg(const std::vector<AllocaInst*> &A, DominatorTree &dt,
+                   DominanceFrontier &df, AliasSetTracker *ast)
+      : Allocas(A), DT(dt), DF(df), AST(ast) {}
+
+    void run();
+
+    /// properlyDominates - Return true if I1 properly dominates I2.
+    ///
+    bool properlyDominates(Instruction *I1, Instruction *I2) const {
+      if (InvokeInst *II = dyn_cast<InvokeInst>(I1))
+        I1 = II->getNormalDest()->begin();
+      return DT.properlyDominates(I1->getParent(), I2->getParent());
+    }
+    
+    /// dominates - Return true if BB1 dominates BB2 using the DominatorTree.
+    ///
+    bool dominates(BasicBlock *BB1, BasicBlock *BB2) const {
+      return DT.dominates(BB1, BB2);
+    }
+
+  private:
+    void RemoveFromAllocasList(unsigned &AllocaIdx) {
+      Allocas[AllocaIdx] = Allocas.back();
+      Allocas.pop_back();
+      --AllocaIdx;
+    }
+
+    unsigned getNumPreds(const BasicBlock *BB) {
+      unsigned &NP = BBNumPreds[BB];
+      if (NP == 0)
+        NP = std::distance(pred_begin(BB), pred_end(BB))+1;
+      return NP-1;
+    }
+
+    void DetermineInsertionPoint(AllocaInst *AI, unsigned AllocaNum,
+                                 AllocaInfo &Info);
+    void ComputeLiveInBlocks(AllocaInst *AI, AllocaInfo &Info, 
+                             const SmallPtrSet<BasicBlock*, 32> &DefBlocks,
+                             SmallPtrSet<BasicBlock*, 32> &LiveInBlocks);
+    
+    void RewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
+                                  LargeBlockInfo &LBI);
+    void PromoteSingleBlockAlloca(AllocaInst *AI, AllocaInfo &Info,
+                                  LargeBlockInfo &LBI);
+
+    
+    void RenamePass(BasicBlock *BB, BasicBlock *Pred,
+                    RenamePassData::ValVector &IncVals,
+                    std::vector<RenamePassData> &Worklist);
+    bool QueuePhiNode(BasicBlock *BB, unsigned AllocaIdx, unsigned &Version,
+                      SmallPtrSet<PHINode*, 16> &InsertedPHINodes);
+  };
+  
+  struct AllocaInfo {
+    std::vector<BasicBlock*> DefiningBlocks;
+    std::vector<BasicBlock*> UsingBlocks;
+    
+    StoreInst  *OnlyStore;
+    BasicBlock *OnlyBlock;
+    bool OnlyUsedInOneBlock;
+    
+    Value *AllocaPointerVal;
+    
+    void clear() {
+      DefiningBlocks.clear();
+      UsingBlocks.clear();
+      OnlyStore = 0;
+      OnlyBlock = 0;
+      OnlyUsedInOneBlock = true;
+      AllocaPointerVal = 0;
+    }
+    
+    /// AnalyzeAlloca - Scan the uses of the specified alloca, filling in our
+    /// ivars.
+    void AnalyzeAlloca(AllocaInst *AI) {
+      clear();
+
+      // As we scan the uses of the alloca instruction, keep track of stores,
+      // and decide whether all of the loads and stores to the alloca are within
+      // the same basic block.
+      for (Value::use_iterator U = AI->use_begin(), E = AI->use_end();
+           U != E;)  {
+        Instruction *User = cast<Instruction>(*U);
+        ++U;
+        if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) {
+          // Remove any uses of this alloca in DbgInfoInstrinsics.
+          assert(BC->hasOneUse() && "Unexpected alloca uses!");
+          DbgInfoIntrinsic *DI = cast<DbgInfoIntrinsic>(*BC->use_begin());
+          DI->eraseFromParent();
+          BC->eraseFromParent();
+          continue;
+        } 
+        else if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+          // Remember the basic blocks which define new values for the alloca
+          DefiningBlocks.push_back(SI->getParent());
+          AllocaPointerVal = SI->getOperand(0);
+          OnlyStore = SI;
+        } else {
+          LoadInst *LI = cast<LoadInst>(User);
+          // Otherwise it must be a load instruction, keep track of variable
+          // reads.
+          UsingBlocks.push_back(LI->getParent());
+          AllocaPointerVal = LI;
+        }
+        
+        if (OnlyUsedInOneBlock) {
+          if (OnlyBlock == 0)
+            OnlyBlock = User->getParent();
+          else if (OnlyBlock != User->getParent())
+            OnlyUsedInOneBlock = false;
+        }
+      }
+    }
+  };
+}  // end of anonymous namespace
+
+
+void PromoteMem2Reg::run() {
+  Function &F = *DF.getRoot()->getParent();
+
+  if (AST) PointerAllocaValues.resize(Allocas.size());
+
+  AllocaInfo Info;
+  LargeBlockInfo LBI;
+
+  for (unsigned AllocaNum = 0; AllocaNum != Allocas.size(); ++AllocaNum) {
+    AllocaInst *AI = Allocas[AllocaNum];
+
+    assert(isAllocaPromotable(AI) &&
+           "Cannot promote non-promotable alloca!");
+    assert(AI->getParent()->getParent() == &F &&
+           "All allocas should be in the same function, which is same as DF!");
+
+    if (AI->use_empty()) {
+      // If there are no uses of the alloca, just delete it now.
+      if (AST) AST->deleteValue(AI);
+      AI->eraseFromParent();
+
+      // Remove the alloca from the Allocas list, since it has been processed
+      RemoveFromAllocasList(AllocaNum);
+      ++NumDeadAlloca;
+      continue;
+    }
+    
+    // Calculate the set of read and write-locations for each alloca.  This is
+    // analogous to finding the 'uses' and 'definitions' of each variable.
+    Info.AnalyzeAlloca(AI);
+
+    // If there is only a single store to this value, replace any loads of
+    // it that are directly dominated by the definition with the value stored.
+    if (Info.DefiningBlocks.size() == 1) {
+      RewriteSingleStoreAlloca(AI, Info, LBI);
+
+      // Finally, after the scan, check to see if the store is all that is left.
+      if (Info.UsingBlocks.empty()) {
+        // Remove the (now dead) store and alloca.
+        Info.OnlyStore->eraseFromParent();
+        LBI.deleteValue(Info.OnlyStore);
+
+        if (AST) AST->deleteValue(AI);
+        AI->eraseFromParent();
+        LBI.deleteValue(AI);
+        
+        // The alloca has been processed, move on.
+        RemoveFromAllocasList(AllocaNum);
+        
+        ++NumSingleStore;
+        continue;
+      }
+    }
+    
+    // If the alloca is only read and written in one basic block, just perform a
+    // linear sweep over the block to eliminate it.
+    if (Info.OnlyUsedInOneBlock) {
+      PromoteSingleBlockAlloca(AI, Info, LBI);
+      
+      // Finally, after the scan, check to see if the stores are all that is
+      // left.
+      if (Info.UsingBlocks.empty()) {
+        
+        // Remove the (now dead) stores and alloca.
+        while (!AI->use_empty()) {
+          StoreInst *SI = cast<StoreInst>(AI->use_back());
+          SI->eraseFromParent();
+          LBI.deleteValue(SI);
+        }
+        
+        if (AST) AST->deleteValue(AI);
+        AI->eraseFromParent();
+        LBI.deleteValue(AI);
+        
+        // The alloca has been processed, move on.
+        RemoveFromAllocasList(AllocaNum);
+        
+        ++NumLocalPromoted;
+        continue;
+      }
+    }
+    
+    // If we haven't computed a numbering for the BB's in the function, do so
+    // now.
+    if (BBNumbers.empty()) {
+      unsigned ID = 0;
+      for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
+        BBNumbers[I] = ID++;
+    }
+
+    // If we have an AST to keep updated, remember some pointer value that is
+    // stored into the alloca.
+    if (AST)
+      PointerAllocaValues[AllocaNum] = Info.AllocaPointerVal;
+    
+    // Keep the reverse mapping of the 'Allocas' array for the rename pass.
+    AllocaLookup[Allocas[AllocaNum]] = AllocaNum;
+
+    // At this point, we're committed to promoting the alloca using IDF's, and
+    // the standard SSA construction algorithm.  Determine which blocks need PHI
+    // nodes and see if we can optimize out some work by avoiding insertion of
+    // dead phi nodes.
+    DetermineInsertionPoint(AI, AllocaNum, Info);
+  }
+
+  if (Allocas.empty())
+    return; // All of the allocas must have been trivial!
+
+  LBI.clear();
+  
+  
+  // Set the incoming values for the basic block to be null values for all of
+  // the alloca's.  We do this in case there is a load of a value that has not
+  // been stored yet.  In this case, it will get this null value.
+  //
+  RenamePassData::ValVector Values(Allocas.size());
+  for (unsigned i = 0, e = Allocas.size(); i != e; ++i)
+    Values[i] = UndefValue::get(Allocas[i]->getAllocatedType());
+
+  // Walks all basic blocks in the function performing the SSA rename algorithm
+  // and inserting the phi nodes we marked as necessary
+  //
+  std::vector<RenamePassData> RenamePassWorkList;
+  RenamePassWorkList.push_back(RenamePassData(F.begin(), 0, Values));
+  while (!RenamePassWorkList.empty()) {
+    RenamePassData RPD;
+    RPD.swap(RenamePassWorkList.back());
+    RenamePassWorkList.pop_back();
+    // RenamePass may add new worklist entries.
+    RenamePass(RPD.BB, RPD.Pred, RPD.Values, RenamePassWorkList);
+  }
+  
+  // The renamer uses the Visited set to avoid infinite loops.  Clear it now.
+  Visited.clear();
+
+  // Remove the allocas themselves from the function.
+  for (unsigned i = 0, e = Allocas.size(); i != e; ++i) {
+    Instruction *A = Allocas[i];
+
+    // If there are any uses of the alloca instructions left, they must be in
+    // sections of dead code that were not processed on the dominance frontier.
+    // Just delete the users now.
+    //
+    if (!A->use_empty())
+      A->replaceAllUsesWith(UndefValue::get(A->getType()));
+    if (AST) AST->deleteValue(A);
+    A->eraseFromParent();
+  }
+
+  
+  // Loop over all of the PHI nodes and see if there are any that we can get
+  // rid of because they merge all of the same incoming values.  This can
+  // happen due to undef values coming into the PHI nodes.  This process is
+  // iterative, because eliminating one PHI node can cause others to be removed.
+  bool EliminatedAPHI = true;
+  while (EliminatedAPHI) {
+    EliminatedAPHI = false;
+    
+    for (DenseMap<std::pair<BasicBlock*, unsigned>, PHINode*>::iterator I =
+           NewPhiNodes.begin(), E = NewPhiNodes.end(); I != E;) {
+      PHINode *PN = I->second;
+      
+      // If this PHI node merges one value and/or undefs, get the value.
+      if (Value *V = PN->hasConstantValue(true)) {
+        if (!isa<Instruction>(V) ||
+            properlyDominates(cast<Instruction>(V), PN)) {
+          if (AST && isa<PointerType>(PN->getType()))
+            AST->deleteValue(PN);
+          PN->replaceAllUsesWith(V);
+          PN->eraseFromParent();
+          NewPhiNodes.erase(I++);
+          EliminatedAPHI = true;
+          continue;
+        }
+      }
+      ++I;
+    }
+  }
+  
+  // At this point, the renamer has added entries to PHI nodes for all reachable
+  // code.  Unfortunately, there may be unreachable blocks which the renamer
+  // hasn't traversed.  If this is the case, the PHI nodes may not
+  // have incoming values for all predecessors.  Loop over all PHI nodes we have
+  // created, inserting undef values if they are missing any incoming values.
+  //
+  for (DenseMap<std::pair<BasicBlock*, unsigned>, PHINode*>::iterator I =
+         NewPhiNodes.begin(), E = NewPhiNodes.end(); I != E; ++I) {
+    // We want to do this once per basic block.  As such, only process a block
+    // when we find the PHI that is the first entry in the block.
+    PHINode *SomePHI = I->second;
+    BasicBlock *BB = SomePHI->getParent();
+    if (&BB->front() != SomePHI)
+      continue;
+
+    // Only do work here if there the PHI nodes are missing incoming values.  We
+    // know that all PHI nodes that were inserted in a block will have the same
+    // number of incoming values, so we can just check any of them.
+    if (SomePHI->getNumIncomingValues() == getNumPreds(BB))
+      continue;
+
+    // Get the preds for BB.
+    SmallVector<BasicBlock*, 16> Preds(pred_begin(BB), pred_end(BB));
+    
+    // Ok, now we know that all of the PHI nodes are missing entries for some
+    // basic blocks.  Start by sorting the incoming predecessors for efficient
+    // access.
+    std::sort(Preds.begin(), Preds.end());
+    
+    // Now we loop through all BB's which have entries in SomePHI and remove
+    // them from the Preds list.
+    for (unsigned i = 0, e = SomePHI->getNumIncomingValues(); i != e; ++i) {
+      // Do a log(n) search of the Preds list for the entry we want.
+      SmallVector<BasicBlock*, 16>::iterator EntIt =
+        std::lower_bound(Preds.begin(), Preds.end(),
+                         SomePHI->getIncomingBlock(i));
+      assert(EntIt != Preds.end() && *EntIt == SomePHI->getIncomingBlock(i)&&
+             "PHI node has entry for a block which is not a predecessor!");
+
+      // Remove the entry
+      Preds.erase(EntIt);
+    }
+
+    // At this point, the blocks left in the preds list must have dummy
+    // entries inserted into every PHI nodes for the block.  Update all the phi
+    // nodes in this block that we are inserting (there could be phis before
+    // mem2reg runs).
+    unsigned NumBadPreds = SomePHI->getNumIncomingValues();
+    BasicBlock::iterator BBI = BB->begin();
+    while ((SomePHI = dyn_cast<PHINode>(BBI++)) &&
+           SomePHI->getNumIncomingValues() == NumBadPreds) {
+      Value *UndefVal = UndefValue::get(SomePHI->getType());
+      for (unsigned pred = 0, e = Preds.size(); pred != e; ++pred)
+        SomePHI->addIncoming(UndefVal, Preds[pred]);
+    }
+  }
+        
+  NewPhiNodes.clear();
+}
+
+
+/// ComputeLiveInBlocks - Determine which blocks the value is live in.  These
+/// are blocks which lead to uses.  Knowing this allows us to avoid inserting
+/// PHI nodes into blocks which don't lead to uses (thus, the inserted phi nodes
+/// would be dead).
+void PromoteMem2Reg::
+ComputeLiveInBlocks(AllocaInst *AI, AllocaInfo &Info, 
+                    const SmallPtrSet<BasicBlock*, 32> &DefBlocks,
+                    SmallPtrSet<BasicBlock*, 32> &LiveInBlocks) {
+  
+  // To determine liveness, we must iterate through the predecessors of blocks
+  // where the def is live.  Blocks are added to the worklist if we need to
+  // check their predecessors.  Start with all the using blocks.
+  SmallVector<BasicBlock*, 64> LiveInBlockWorklist;
+  LiveInBlockWorklist.insert(LiveInBlockWorklist.end(), 
+                             Info.UsingBlocks.begin(), Info.UsingBlocks.end());
+  
+  // If any of the using blocks is also a definition block, check to see if the
+  // definition occurs before or after the use.  If it happens before the use,
+  // the value isn't really live-in.
+  for (unsigned i = 0, e = LiveInBlockWorklist.size(); i != e; ++i) {
+    BasicBlock *BB = LiveInBlockWorklist[i];
+    if (!DefBlocks.count(BB)) continue;
+    
+    // Okay, this is a block that both uses and defines the value.  If the first
+    // reference to the alloca is a def (store), then we know it isn't live-in.
+    for (BasicBlock::iterator I = BB->begin(); ; ++I) {
+      if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+        if (SI->getOperand(1) != AI) continue;
+        
+        // We found a store to the alloca before a load.  The alloca is not
+        // actually live-in here.
+        LiveInBlockWorklist[i] = LiveInBlockWorklist.back();
+        LiveInBlockWorklist.pop_back();
+        --i, --e;
+        break;
+      } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+        if (LI->getOperand(0) != AI) continue;
+        
+        // Okay, we found a load before a store to the alloca.  It is actually
+        // live into this block.
+        break;
+      }
+    }
+  }
+  
+  // Now that we have a set of blocks where the phi is live-in, recursively add
+  // their predecessors until we find the full region the value is live.
+  while (!LiveInBlockWorklist.empty()) {
+    BasicBlock *BB = LiveInBlockWorklist.pop_back_val();
+    
+    // The block really is live in here, insert it into the set.  If already in
+    // the set, then it has already been processed.
+    if (!LiveInBlocks.insert(BB))
+      continue;
+    
+    // Since the value is live into BB, it is either defined in a predecessor or
+    // live into it to.  Add the preds to the worklist unless they are a
+    // defining block.
+    for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+      BasicBlock *P = *PI;
+      
+      // The value is not live into a predecessor if it defines the value.
+      if (DefBlocks.count(P))
+        continue;
+      
+      // Otherwise it is, add to the worklist.
+      LiveInBlockWorklist.push_back(P);
+    }
+  }
+}
+
+/// DetermineInsertionPoint - At this point, we're committed to promoting the
+/// alloca using IDF's, and the standard SSA construction algorithm.  Determine
+/// which blocks need phi nodes and see if we can optimize out some work by
+/// avoiding insertion of dead phi nodes.
+void PromoteMem2Reg::DetermineInsertionPoint(AllocaInst *AI, unsigned AllocaNum,
+                                             AllocaInfo &Info) {
+
+  // Unique the set of defining blocks for efficient lookup.
+  SmallPtrSet<BasicBlock*, 32> DefBlocks;
+  DefBlocks.insert(Info.DefiningBlocks.begin(), Info.DefiningBlocks.end());
+
+  // Determine which blocks the value is live in.  These are blocks which lead
+  // to uses.
+  SmallPtrSet<BasicBlock*, 32> LiveInBlocks;
+  ComputeLiveInBlocks(AI, Info, DefBlocks, LiveInBlocks);
+
+  // Compute the locations where PhiNodes need to be inserted.  Look at the
+  // dominance frontier of EACH basic-block we have a write in.
+  unsigned CurrentVersion = 0;
+  SmallPtrSet<PHINode*, 16> InsertedPHINodes;
+  std::vector<std::pair<unsigned, BasicBlock*> > DFBlocks;
+  while (!Info.DefiningBlocks.empty()) {
+    BasicBlock *BB = Info.DefiningBlocks.back();
+    Info.DefiningBlocks.pop_back();
+    
+    // Look up the DF for this write, add it to defining blocks.
+    DominanceFrontier::const_iterator it = DF.find(BB);
+    if (it == DF.end()) continue;
+    
+    const DominanceFrontier::DomSetType &S = it->second;
+    
+    // In theory we don't need the indirection through the DFBlocks vector.
+    // In practice, the order of calling QueuePhiNode would depend on the
+    // (unspecified) ordering of basic blocks in the dominance frontier,
+    // which would give PHI nodes non-determinstic subscripts.  Fix this by
+    // processing blocks in order of the occurance in the function.
+    for (DominanceFrontier::DomSetType::const_iterator P = S.begin(),
+         PE = S.end(); P != PE; ++P) {
+      // If the frontier block is not in the live-in set for the alloca, don't
+      // bother processing it.
+      if (!LiveInBlocks.count(*P))
+        continue;
+      
+      DFBlocks.push_back(std::make_pair(BBNumbers[*P], *P));
+    }
+    
+    // Sort by which the block ordering in the function.
+    if (DFBlocks.size() > 1)
+      std::sort(DFBlocks.begin(), DFBlocks.end());
+    
+    for (unsigned i = 0, e = DFBlocks.size(); i != e; ++i) {
+      BasicBlock *BB = DFBlocks[i].second;
+      if (QueuePhiNode(BB, AllocaNum, CurrentVersion, InsertedPHINodes))
+        Info.DefiningBlocks.push_back(BB);
+    }
+    DFBlocks.clear();
+  }
+}
+
+/// RewriteSingleStoreAlloca - If there is only a single store to this value,
+/// replace any loads of it that are directly dominated by the definition with
+/// the value stored.
+void PromoteMem2Reg::RewriteSingleStoreAlloca(AllocaInst *AI,
+                                              AllocaInfo &Info,
+                                              LargeBlockInfo &LBI) {
+  StoreInst *OnlyStore = Info.OnlyStore;
+  bool StoringGlobalVal = !isa<Instruction>(OnlyStore->getOperand(0));
+  BasicBlock *StoreBB = OnlyStore->getParent();
+  int StoreIndex = -1;
+
+  // Clear out UsingBlocks.  We will reconstruct it here if needed.
+  Info.UsingBlocks.clear();
+  
+  for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E; ) {
+    Instruction *UserInst = cast<Instruction>(*UI++);
+    if (!isa<LoadInst>(UserInst)) {
+      assert(UserInst == OnlyStore && "Should only have load/stores");
+      continue;
+    }
+    LoadInst *LI = cast<LoadInst>(UserInst);
+    
+    // Okay, if we have a load from the alloca, we want to replace it with the
+    // only value stored to the alloca.  We can do this if the value is
+    // dominated by the store.  If not, we use the rest of the mem2reg machinery
+    // to insert the phi nodes as needed.
+    if (!StoringGlobalVal) {  // Non-instructions are always dominated.
+      if (LI->getParent() == StoreBB) {
+        // If we have a use that is in the same block as the store, compare the
+        // indices of the two instructions to see which one came first.  If the
+        // load came before the store, we can't handle it.
+        if (StoreIndex == -1)
+          StoreIndex = LBI.getInstructionIndex(OnlyStore);
+
+        if (unsigned(StoreIndex) > LBI.getInstructionIndex(LI)) {
+          // Can't handle this load, bail out.
+          Info.UsingBlocks.push_back(StoreBB);
+          continue;
+        }
+        
+      } else if (LI->getParent() != StoreBB &&
+                 !dominates(StoreBB, LI->getParent())) {
+        // If the load and store are in different blocks, use BB dominance to
+        // check their relationships.  If the store doesn't dom the use, bail
+        // out.
+        Info.UsingBlocks.push_back(LI->getParent());
+        continue;
+      }
+    }
+    
+    // Otherwise, we *can* safely rewrite this load.
+    LI->replaceAllUsesWith(OnlyStore->getOperand(0));
+    if (AST && isa<PointerType>(LI->getType()))
+      AST->deleteValue(LI);
+    LI->eraseFromParent();
+    LBI.deleteValue(LI);
+  }
+}
+
+
+/// StoreIndexSearchPredicate - This is a helper predicate used to search by the
+/// first element of a pair.
+struct StoreIndexSearchPredicate {
+  bool operator()(const std::pair<unsigned, StoreInst*> &LHS,
+                  const std::pair<unsigned, StoreInst*> &RHS) {
+    return LHS.first < RHS.first;
+  }
+};
+
+/// PromoteSingleBlockAlloca - Many allocas are only used within a single basic
+/// block.  If this is the case, avoid traversing the CFG and inserting a lot of
+/// potentially useless PHI nodes by just performing a single linear pass over
+/// the basic block using the Alloca.
+///
+/// If we cannot promote this alloca (because it is read before it is written),
+/// return true.  This is necessary in cases where, due to control flow, the
+/// alloca is potentially undefined on some control flow paths.  e.g. code like
+/// this is potentially correct:
+///
+///   for (...) { if (c) { A = undef; undef = B; } }
+///
+/// ... so long as A is not used before undef is set.
+///
+void PromoteMem2Reg::PromoteSingleBlockAlloca(AllocaInst *AI, AllocaInfo &Info,
+                                              LargeBlockInfo &LBI) {
+  // The trickiest case to handle is when we have large blocks. Because of this,
+  // this code is optimized assuming that large blocks happen.  This does not
+  // significantly pessimize the small block case.  This uses LargeBlockInfo to
+  // make it efficient to get the index of various operations in the block.
+  
+  // Clear out UsingBlocks.  We will reconstruct it here if needed.
+  Info.UsingBlocks.clear();
+  
+  // Walk the use-def list of the alloca, getting the locations of all stores.
+  typedef SmallVector<std::pair<unsigned, StoreInst*>, 64> StoresByIndexTy;
+  StoresByIndexTy StoresByIndex;
+  
+  for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end();
+       UI != E; ++UI) 
+    if (StoreInst *SI = dyn_cast<StoreInst>(*UI))
+      StoresByIndex.push_back(std::make_pair(LBI.getInstructionIndex(SI), SI));
+
+  // If there are no stores to the alloca, just replace any loads with undef.
+  if (StoresByIndex.empty()) {
+    for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E;) 
+      if (LoadInst *LI = dyn_cast<LoadInst>(*UI++)) {
+        LI->replaceAllUsesWith(UndefValue::get(LI->getType()));
+        if (AST && isa<PointerType>(LI->getType()))
+          AST->deleteValue(LI);
+        LBI.deleteValue(LI);
+        LI->eraseFromParent();
+      }
+    return;
+  }
+  
+  // Sort the stores by their index, making it efficient to do a lookup with a
+  // binary search.
+  std::sort(StoresByIndex.begin(), StoresByIndex.end());
+  
+  // Walk all of the loads from this alloca, replacing them with the nearest
+  // store above them, if any.
+  for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E;) {
+    LoadInst *LI = dyn_cast<LoadInst>(*UI++);
+    if (!LI) continue;
+    
+    unsigned LoadIdx = LBI.getInstructionIndex(LI);
+    
+    // Find the nearest store that has a lower than this load. 
+    StoresByIndexTy::iterator I = 
+      std::lower_bound(StoresByIndex.begin(), StoresByIndex.end(),
+                       std::pair<unsigned, StoreInst*>(LoadIdx, 0),
+                       StoreIndexSearchPredicate());
+    
+    // If there is no store before this load, then we can't promote this load.
+    if (I == StoresByIndex.begin()) {
+      // Can't handle this load, bail out.
+      Info.UsingBlocks.push_back(LI->getParent());
+      continue;
+    }
+      
+    // Otherwise, there was a store before this load, the load takes its value.
+    --I;
+    LI->replaceAllUsesWith(I->second->getOperand(0));
+    if (AST && isa<PointerType>(LI->getType()))
+      AST->deleteValue(LI);
+    LI->eraseFromParent();
+    LBI.deleteValue(LI);
+  }
+}
+
+
+// QueuePhiNode - queues a phi-node to be added to a basic-block for a specific
+// Alloca returns true if there wasn't already a phi-node for that variable
+//
+bool PromoteMem2Reg::QueuePhiNode(BasicBlock *BB, unsigned AllocaNo,
+                                  unsigned &Version,
+                                  SmallPtrSet<PHINode*, 16> &InsertedPHINodes) {
+  // Look up the basic-block in question.
+  PHINode *&PN = NewPhiNodes[std::make_pair(BB, AllocaNo)];
+
+  // If the BB already has a phi node added for the i'th alloca then we're done!
+  if (PN) return false;
+
+  // Create a PhiNode using the dereferenced type... and add the phi-node to the
+  // BasicBlock.
+  PN = PHINode::Create(Allocas[AllocaNo]->getAllocatedType(),
+                       Allocas[AllocaNo]->getName() + "." +
+                       utostr(Version++), BB->begin());
+  ++NumPHIInsert;
+  PhiToAllocaMap[PN] = AllocaNo;
+  PN->reserveOperandSpace(getNumPreds(BB));
+  
+  InsertedPHINodes.insert(PN);
+
+  if (AST && isa<PointerType>(PN->getType()))
+    AST->copyValue(PointerAllocaValues[AllocaNo], PN);
+
+  return true;
+}
+
+// RenamePass - Recursively traverse the CFG of the function, renaming loads and
+// stores to the allocas which we are promoting.  IncomingVals indicates what
+// value each Alloca contains on exit from the predecessor block Pred.
+//
+void PromoteMem2Reg::RenamePass(BasicBlock *BB, BasicBlock *Pred,
+                                RenamePassData::ValVector &IncomingVals,
+                                std::vector<RenamePassData> &Worklist) {
+NextIteration:
+  // If we are inserting any phi nodes into this BB, they will already be in the
+  // block.
+  if (PHINode *APN = dyn_cast<PHINode>(BB->begin())) {
+    // If we have PHI nodes to update, compute the number of edges from Pred to
+    // BB.
+    if (PhiToAllocaMap.count(APN)) {
+      // We want to be able to distinguish between PHI nodes being inserted by
+      // this invocation of mem2reg from those phi nodes that already existed in
+      // the IR before mem2reg was run.  We determine that APN is being inserted
+      // because it is missing incoming edges.  All other PHI nodes being
+      // inserted by this pass of mem2reg will have the same number of incoming
+      // operands so far.  Remember this count.
+      unsigned NewPHINumOperands = APN->getNumOperands();
+      
+      unsigned NumEdges = 0;
+      for (succ_iterator I = succ_begin(Pred), E = succ_end(Pred); I != E; ++I)
+        if (*I == BB)
+          ++NumEdges;
+      assert(NumEdges && "Must be at least one edge from Pred to BB!");
+      
+      // Add entries for all the phis.
+      BasicBlock::iterator PNI = BB->begin();
+      do {
+        unsigned AllocaNo = PhiToAllocaMap[APN];
+        
+        // Add N incoming values to the PHI node.
+        for (unsigned i = 0; i != NumEdges; ++i)
+          APN->addIncoming(IncomingVals[AllocaNo], Pred);
+        
+        // The currently active variable for this block is now the PHI.
+        IncomingVals[AllocaNo] = APN;
+        
+        // Get the next phi node.
+        ++PNI;
+        APN = dyn_cast<PHINode>(PNI);
+        if (APN == 0) break;
+        
+        // Verify that it is missing entries.  If not, it is not being inserted
+        // by this mem2reg invocation so we want to ignore it.
+      } while (APN->getNumOperands() == NewPHINumOperands);
+    }
+  }
+  
+  // Don't revisit blocks.
+  if (!Visited.insert(BB)) return;
+
+  for (BasicBlock::iterator II = BB->begin(); !isa<TerminatorInst>(II); ) {
+    Instruction *I = II++; // get the instruction, increment iterator
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+      AllocaInst *Src = dyn_cast<AllocaInst>(LI->getPointerOperand());
+      if (!Src) continue;
+  
+      std::map<AllocaInst*, unsigned>::iterator AI = AllocaLookup.find(Src);
+      if (AI == AllocaLookup.end()) continue;
+
+      Value *V = IncomingVals[AI->second];
+
+      // Anything using the load now uses the current value.
+      LI->replaceAllUsesWith(V);
+      if (AST && isa<PointerType>(LI->getType()))
+        AST->deleteValue(LI);
+      BB->getInstList().erase(LI);
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+      // Delete this instruction and mark the name as the current holder of the
+      // value
+      AllocaInst *Dest = dyn_cast<AllocaInst>(SI->getPointerOperand());
+      if (!Dest) continue;
+      
+      std::map<AllocaInst *, unsigned>::iterator ai = AllocaLookup.find(Dest);
+      if (ai == AllocaLookup.end())
+        continue;
+      
+      // what value were we writing?
+      IncomingVals[ai->second] = SI->getOperand(0);
+      BB->getInstList().erase(SI);
+    }
+  }
+
+  // 'Recurse' to our successors.
+  succ_iterator I = succ_begin(BB), E = succ_end(BB);
+  if (I == E) return;
+
+  // Keep track of the successors so we don't visit the same successor twice
+  SmallPtrSet<BasicBlock*, 8> VisitedSuccs;
+
+  // Handle the first successor without using the worklist.
+  VisitedSuccs.insert(*I);
+  Pred = BB;
+  BB = *I;
+  ++I;
+
+  for (; I != E; ++I)
+    if (VisitedSuccs.insert(*I))
+      Worklist.push_back(RenamePassData(*I, Pred, IncomingVals));
+
+  goto NextIteration;
+}
+
+/// PromoteMemToReg - Promote the specified list of alloca instructions into
+/// scalar registers, inserting PHI nodes as appropriate.  This function makes
+/// use of DominanceFrontier information.  This function does not modify the CFG
+/// of the function at all.  All allocas must be from the same function.
+///
+/// If AST is specified, the specified tracker is updated to reflect changes
+/// made to the IR.
+///
+void llvm::PromoteMemToReg(const std::vector<AllocaInst*> &Allocas,
+                           DominatorTree &DT, DominanceFrontier &DF,
+                           AliasSetTracker *AST) {
+  // If there is nothing to do, bail out...
+  if (Allocas.empty()) return;
+
+  PromoteMem2Reg(Allocas, DT, DF, AST).run();
+}
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
new file mode 100644
index 0000000..2cde765
--- /dev/null
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -0,0 +1,2213 @@
+//===- SimplifyCFG.cpp - Code to perform CFG simplification ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Peephole optimize the CFG.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "simplifycfg"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Type.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include <algorithm>
+#include <functional>
+#include <set>
+#include <map>
+using namespace llvm;
+
+STATISTIC(NumSpeculations, "Number of speculative executed instructions");
+
+/// SafeToMergeTerminators - Return true if it is safe to merge these two
+/// terminator instructions together.
+///
+static bool SafeToMergeTerminators(TerminatorInst *SI1, TerminatorInst *SI2) {
+  if (SI1 == SI2) return false;  // Can't merge with self!
+  
+  // It is not safe to merge these two switch instructions if they have a common
+  // successor, and if that successor has a PHI node, and if *that* PHI node has
+  // conflicting incoming values from the two switch blocks.
+  BasicBlock *SI1BB = SI1->getParent();
+  BasicBlock *SI2BB = SI2->getParent();
+  SmallPtrSet<BasicBlock*, 16> SI1Succs(succ_begin(SI1BB), succ_end(SI1BB));
+  
+  for (succ_iterator I = succ_begin(SI2BB), E = succ_end(SI2BB); I != E; ++I)
+    if (SI1Succs.count(*I))
+      for (BasicBlock::iterator BBI = (*I)->begin();
+           isa<PHINode>(BBI); ++BBI) {
+        PHINode *PN = cast<PHINode>(BBI);
+        if (PN->getIncomingValueForBlock(SI1BB) !=
+            PN->getIncomingValueForBlock(SI2BB))
+          return false;
+      }
+        
+  return true;
+}
+
+/// AddPredecessorToBlock - Update PHI nodes in Succ to indicate that there will
+/// now be entries in it from the 'NewPred' block.  The values that will be
+/// flowing into the PHI nodes will be the same as those coming in from
+/// ExistPred, an existing predecessor of Succ.
+static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred,
+                                  BasicBlock *ExistPred) {
+  assert(std::find(succ_begin(ExistPred), succ_end(ExistPred), Succ) !=
+         succ_end(ExistPred) && "ExistPred is not a predecessor of Succ!");
+  if (!isa<PHINode>(Succ->begin())) return; // Quick exit if nothing to do
+  
+  PHINode *PN;
+  for (BasicBlock::iterator I = Succ->begin();
+       (PN = dyn_cast<PHINode>(I)); ++I)
+    PN->addIncoming(PN->getIncomingValueForBlock(ExistPred), NewPred);
+}
+
+/// CanPropagatePredecessorsForPHIs - Return true if we can fold BB, an
+/// almost-empty BB ending in an unconditional branch to Succ, into succ.
+///
+/// Assumption: Succ is the single successor for BB.
+///
+static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) {
+  assert(*succ_begin(BB) == Succ && "Succ is not successor of BB!");
+
+  DOUT << "Looking to fold " << BB->getNameStart() << " into " 
+       << Succ->getNameStart() << "\n";
+  // Shortcut, if there is only a single predecessor it must be BB and merging
+  // is always safe
+  if (Succ->getSinglePredecessor()) return true;
+
+  typedef SmallPtrSet<Instruction*, 16> InstrSet;
+  InstrSet BBPHIs;
+
+  // Make a list of all phi nodes in BB
+  BasicBlock::iterator BBI = BB->begin();
+  while (isa<PHINode>(*BBI)) BBPHIs.insert(BBI++);
+
+  // Make a list of the predecessors of BB
+  typedef SmallPtrSet<BasicBlock*, 16> BlockSet;
+  BlockSet BBPreds(pred_begin(BB), pred_end(BB));
+
+  // Use that list to make another list of common predecessors of BB and Succ
+  BlockSet CommonPreds;
+  for (pred_iterator PI = pred_begin(Succ), PE = pred_end(Succ);
+        PI != PE; ++PI)
+    if (BBPreds.count(*PI))
+      CommonPreds.insert(*PI);
+
+  // Shortcut, if there are no common predecessors, merging is always safe
+  if (CommonPreds.empty())
+    return true;
+  
+  // Look at all the phi nodes in Succ, to see if they present a conflict when
+  // merging these blocks
+  for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) {
+    PHINode *PN = cast<PHINode>(I);
+
+    // If the incoming value from BB is again a PHINode in
+    // BB which has the same incoming value for *PI as PN does, we can
+    // merge the phi nodes and then the blocks can still be merged
+    PHINode *BBPN = dyn_cast<PHINode>(PN->getIncomingValueForBlock(BB));
+    if (BBPN && BBPN->getParent() == BB) {
+      for (BlockSet::iterator PI = CommonPreds.begin(), PE = CommonPreds.end();
+            PI != PE; PI++) {
+        if (BBPN->getIncomingValueForBlock(*PI) 
+              != PN->getIncomingValueForBlock(*PI)) {
+          DOUT << "Can't fold, phi node " << *PN->getNameStart() << " in " 
+               << Succ->getNameStart() << " is conflicting with " 
+               << BBPN->getNameStart() << " with regard to common predecessor "
+               << (*PI)->getNameStart() << "\n";
+          return false;
+        }
+      }
+      // Remove this phinode from the list of phis in BB, since it has been
+      // handled.
+      BBPHIs.erase(BBPN);
+    } else {
+      Value* Val = PN->getIncomingValueForBlock(BB);
+      for (BlockSet::iterator PI = CommonPreds.begin(), PE = CommonPreds.end();
+            PI != PE; PI++) {
+        // See if the incoming value for the common predecessor is equal to the
+        // one for BB, in which case this phi node will not prevent the merging
+        // of the block.
+        if (Val != PN->getIncomingValueForBlock(*PI)) {
+          DOUT << "Can't fold, phi node " << *PN->getNameStart() << " in " 
+          << Succ->getNameStart() << " is conflicting with regard to common "
+          << "predecessor " << (*PI)->getNameStart() << "\n";
+          return false;
+        }
+      }
+    }
+  }
+
+  // If there are any other phi nodes in BB that don't have a phi node in Succ
+  // to merge with, they must be moved to Succ completely. However, for any
+  // predecessors of Succ, branches will be added to the phi node that just
+  // point to itself. So, for any common predecessors, this must not cause
+  // conflicts.
+  for (InstrSet::iterator I = BBPHIs.begin(), E = BBPHIs.end();
+        I != E; I++) {
+    PHINode *PN = cast<PHINode>(*I);
+    for (BlockSet::iterator PI = CommonPreds.begin(), PE = CommonPreds.end();
+          PI != PE; PI++)
+      if (PN->getIncomingValueForBlock(*PI) != PN) {
+        DOUT << "Can't fold, phi node " << *PN->getNameStart() << " in " 
+             << BB->getNameStart() << " is conflicting with regard to common "
+             << "predecessor " << (*PI)->getNameStart() << "\n";
+        return false;
+      }
+  }
+
+  return true;
+}
+
+/// TryToSimplifyUncondBranchFromEmptyBlock - BB contains an unconditional
+/// branch to Succ, and contains no instructions other than PHI nodes and the
+/// branch.  If possible, eliminate BB.
+static bool TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB,
+                                                    BasicBlock *Succ) {
+  // Check to see if merging these blocks would cause conflicts for any of the
+  // phi nodes in BB or Succ. If not, we can safely merge.
+  if (!CanPropagatePredecessorsForPHIs(BB, Succ)) return false;
+  
+  DOUT << "Killing Trivial BB: \n" << *BB;
+  
+  if (isa<PHINode>(Succ->begin())) {
+    // If there is more than one pred of succ, and there are PHI nodes in
+    // the successor, then we need to add incoming edges for the PHI nodes
+    //
+    const SmallVector<BasicBlock*, 16> BBPreds(pred_begin(BB), pred_end(BB));
+    
+    // Loop over all of the PHI nodes in the successor of BB.
+    for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) {
+      PHINode *PN = cast<PHINode>(I);
+      Value *OldVal = PN->removeIncomingValue(BB, false);
+      assert(OldVal && "No entry in PHI for Pred BB!");
+      
+      // If this incoming value is one of the PHI nodes in BB, the new entries
+      // in the PHI node are the entries from the old PHI.
+      if (isa<PHINode>(OldVal) && cast<PHINode>(OldVal)->getParent() == BB) {
+        PHINode *OldValPN = cast<PHINode>(OldVal);
+        for (unsigned i = 0, e = OldValPN->getNumIncomingValues(); i != e; ++i)
+          // Note that, since we are merging phi nodes and BB and Succ might
+          // have common predecessors, we could end up with a phi node with
+          // identical incoming branches. This will be cleaned up later (and
+          // will trigger asserts if we try to clean it up now, without also
+          // simplifying the corresponding conditional branch).
+          PN->addIncoming(OldValPN->getIncomingValue(i),
+                          OldValPN->getIncomingBlock(i));
+      } else {
+        // Add an incoming value for each of the new incoming values.
+        for (unsigned i = 0, e = BBPreds.size(); i != e; ++i)
+          PN->addIncoming(OldVal, BBPreds[i]);
+      }
+    }
+  }
+  
+  if (isa<PHINode>(&BB->front())) {
+    SmallVector<BasicBlock*, 16>
+    OldSuccPreds(pred_begin(Succ), pred_end(Succ));
+    
+    // Move all PHI nodes in BB to Succ if they are alive, otherwise
+    // delete them.
+    while (PHINode *PN = dyn_cast<PHINode>(&BB->front())) {
+      if (PN->use_empty()) {
+        // Just remove the dead phi.  This happens if Succ's PHIs were the only
+        // users of the PHI nodes.
+        PN->eraseFromParent();
+        continue;
+      }
+    
+      // The instruction is alive, so this means that BB must dominate all
+      // predecessors of Succ (Since all uses of the PN are after its
+      // definition, so in Succ or a block dominated by Succ. If a predecessor
+      // of Succ would not be dominated by BB, PN would violate the def before
+      // use SSA demand). Therefore, we can simply move the phi node to the
+      // next block.
+      Succ->getInstList().splice(Succ->begin(),
+                                 BB->getInstList(), BB->begin());
+      
+      // We need to add new entries for the PHI node to account for
+      // predecessors of Succ that the PHI node does not take into
+      // account.  At this point, since we know that BB dominated succ and all
+      // of its predecessors, this means that we should any newly added
+      // incoming edges should use the PHI node itself as the value for these
+      // edges, because they are loop back edges.
+      for (unsigned i = 0, e = OldSuccPreds.size(); i != e; ++i)
+        if (OldSuccPreds[i] != BB)
+          PN->addIncoming(PN, OldSuccPreds[i]);
+    }
+  }
+    
+  // Everything that jumped to BB now goes to Succ.
+  BB->replaceAllUsesWith(Succ);
+  if (!Succ->hasName()) Succ->takeName(BB);
+  BB->eraseFromParent();              // Delete the old basic block.
+  return true;
+}
+
+/// GetIfCondition - Given a basic block (BB) with two predecessors (and
+/// presumably PHI nodes in it), check to see if the merge at this block is due
+/// to an "if condition".  If so, return the boolean condition that determines
+/// which entry into BB will be taken.  Also, return by references the block
+/// that will be entered from if the condition is true, and the block that will
+/// be entered if the condition is false.
+///
+///
+static Value *GetIfCondition(BasicBlock *BB,
+                             BasicBlock *&IfTrue, BasicBlock *&IfFalse) {
+  assert(std::distance(pred_begin(BB), pred_end(BB)) == 2 &&
+         "Function can only handle blocks with 2 predecessors!");
+  BasicBlock *Pred1 = *pred_begin(BB);
+  BasicBlock *Pred2 = *++pred_begin(BB);
+
+  // We can only handle branches.  Other control flow will be lowered to
+  // branches if possible anyway.
+  if (!isa<BranchInst>(Pred1->getTerminator()) ||
+      !isa<BranchInst>(Pred2->getTerminator()))
+    return 0;
+  BranchInst *Pred1Br = cast<BranchInst>(Pred1->getTerminator());
+  BranchInst *Pred2Br = cast<BranchInst>(Pred2->getTerminator());
+
+  // Eliminate code duplication by ensuring that Pred1Br is conditional if
+  // either are.
+  if (Pred2Br->isConditional()) {
+    // If both branches are conditional, we don't have an "if statement".  In
+    // reality, we could transform this case, but since the condition will be
+    // required anyway, we stand no chance of eliminating it, so the xform is
+    // probably not profitable.
+    if (Pred1Br->isConditional())
+      return 0;
+
+    std::swap(Pred1, Pred2);
+    std::swap(Pred1Br, Pred2Br);
+  }
+
+  if (Pred1Br->isConditional()) {
+    // If we found a conditional branch predecessor, make sure that it branches
+    // to BB and Pred2Br.  If it doesn't, this isn't an "if statement".
+    if (Pred1Br->getSuccessor(0) == BB &&
+        Pred1Br->getSuccessor(1) == Pred2) {
+      IfTrue = Pred1;
+      IfFalse = Pred2;
+    } else if (Pred1Br->getSuccessor(0) == Pred2 &&
+               Pred1Br->getSuccessor(1) == BB) {
+      IfTrue = Pred2;
+      IfFalse = Pred1;
+    } else {
+      // We know that one arm of the conditional goes to BB, so the other must
+      // go somewhere unrelated, and this must not be an "if statement".
+      return 0;
+    }
+
+    // The only thing we have to watch out for here is to make sure that Pred2
+    // doesn't have incoming edges from other blocks.  If it does, the condition
+    // doesn't dominate BB.
+    if (++pred_begin(Pred2) != pred_end(Pred2))
+      return 0;
+
+    return Pred1Br->getCondition();
+  }
+
+  // Ok, if we got here, both predecessors end with an unconditional branch to
+  // BB.  Don't panic!  If both blocks only have a single (identical)
+  // predecessor, and THAT is a conditional branch, then we're all ok!
+  if (pred_begin(Pred1) == pred_end(Pred1) ||
+      ++pred_begin(Pred1) != pred_end(Pred1) ||
+      pred_begin(Pred2) == pred_end(Pred2) ||
+      ++pred_begin(Pred2) != pred_end(Pred2) ||
+      *pred_begin(Pred1) != *pred_begin(Pred2))
+    return 0;
+
+  // Otherwise, if this is a conditional branch, then we can use it!
+  BasicBlock *CommonPred = *pred_begin(Pred1);
+  if (BranchInst *BI = dyn_cast<BranchInst>(CommonPred->getTerminator())) {
+    assert(BI->isConditional() && "Two successors but not conditional?");
+    if (BI->getSuccessor(0) == Pred1) {
+      IfTrue = Pred1;
+      IfFalse = Pred2;
+    } else {
+      IfTrue = Pred2;
+      IfFalse = Pred1;
+    }
+    return BI->getCondition();
+  }
+  return 0;
+}
+
+/// DominatesMergePoint - If we have a merge point of an "if condition" as
+/// accepted above, return true if the specified value dominates the block.  We
+/// don't handle the true generality of domination here, just a special case
+/// which works well enough for us.
+///
+/// If AggressiveInsts is non-null, and if V does not dominate BB, we check to
+/// see if V (which must be an instruction) is cheap to compute and is
+/// non-trapping.  If both are true, the instruction is inserted into the set
+/// and true is returned.
+static bool DominatesMergePoint(Value *V, BasicBlock *BB,
+                                std::set<Instruction*> *AggressiveInsts) {
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) {
+    // Non-instructions all dominate instructions, but not all constantexprs
+    // can be executed unconditionally.
+    if (ConstantExpr *C = dyn_cast<ConstantExpr>(V))
+      if (C->canTrap())
+        return false;
+    return true;
+  }
+  BasicBlock *PBB = I->getParent();
+
+  // We don't want to allow weird loops that might have the "if condition" in
+  // the bottom of this block.
+  if (PBB == BB) return false;
+
+  // If this instruction is defined in a block that contains an unconditional
+  // branch to BB, then it must be in the 'conditional' part of the "if
+  // statement".
+  if (BranchInst *BI = dyn_cast<BranchInst>(PBB->getTerminator()))
+    if (BI->isUnconditional() && BI->getSuccessor(0) == BB) {
+      if (!AggressiveInsts) return false;
+      // Okay, it looks like the instruction IS in the "condition".  Check to
+      // see if its a cheap instruction to unconditionally compute, and if it
+      // only uses stuff defined outside of the condition.  If so, hoist it out.
+      switch (I->getOpcode()) {
+      default: return false;  // Cannot hoist this out safely.
+      case Instruction::Load: {
+        // We can hoist loads that are non-volatile and obviously cannot trap.
+        if (cast<LoadInst>(I)->isVolatile())
+          return false;
+        // FIXME: A computation of a constant can trap!
+        if (!isa<AllocaInst>(I->getOperand(0)) &&
+            !isa<Constant>(I->getOperand(0)))
+          return false;
+        // External weak globals may have address 0, so we can't load them.
+        Value *V2 = I->getOperand(0)->getUnderlyingObject();
+        if (V2) {
+          GlobalVariable* GV = dyn_cast<GlobalVariable>(V2);
+          if (GV && GV->hasExternalWeakLinkage())
+            return false;
+        }
+        // Finally, we have to check to make sure there are no instructions
+        // before the load in its basic block, as we are going to hoist the loop
+        // out to its predecessor.
+        BasicBlock::iterator IP = PBB->begin();
+        while (isa<DbgInfoIntrinsic>(IP))
+          IP++;
+        if (IP != BasicBlock::iterator(I))
+          return false;
+        break;
+      }
+      case Instruction::Add:
+      case Instruction::Sub:
+      case Instruction::And:
+      case Instruction::Or:
+      case Instruction::Xor:
+      case Instruction::Shl:
+      case Instruction::LShr:
+      case Instruction::AShr:
+      case Instruction::ICmp:
+      case Instruction::FCmp:
+        if (I->getOperand(0)->getType()->isFPOrFPVector())
+          return false;  // FP arithmetic might trap.
+        break;   // These are all cheap and non-trapping instructions.
+      }
+
+      // Okay, we can only really hoist these out if their operands are not
+      // defined in the conditional region.
+      for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i)
+        if (!DominatesMergePoint(*i, BB, 0))
+          return false;
+      // Okay, it's safe to do this!  Remember this instruction.
+      AggressiveInsts->insert(I);
+    }
+
+  return true;
+}
+
+/// GatherConstantSetEQs - Given a potentially 'or'd together collection of
+/// icmp_eq instructions that compare a value against a constant, return the
+/// value being compared, and stick the constant into the Values vector.
+static Value *GatherConstantSetEQs(Value *V, std::vector<ConstantInt*> &Values){
+  if (Instruction *Inst = dyn_cast<Instruction>(V)) {
+    if (Inst->getOpcode() == Instruction::ICmp &&
+        cast<ICmpInst>(Inst)->getPredicate() == ICmpInst::ICMP_EQ) {
+      if (ConstantInt *C = dyn_cast<ConstantInt>(Inst->getOperand(1))) {
+        Values.push_back(C);
+        return Inst->getOperand(0);
+      } else if (ConstantInt *C = dyn_cast<ConstantInt>(Inst->getOperand(0))) {
+        Values.push_back(C);
+        return Inst->getOperand(1);
+      }
+    } else if (Inst->getOpcode() == Instruction::Or) {
+      if (Value *LHS = GatherConstantSetEQs(Inst->getOperand(0), Values))
+        if (Value *RHS = GatherConstantSetEQs(Inst->getOperand(1), Values))
+          if (LHS == RHS)
+            return LHS;
+    }
+  }
+  return 0;
+}
+
+/// GatherConstantSetNEs - Given a potentially 'and'd together collection of
+/// setne instructions that compare a value against a constant, return the value
+/// being compared, and stick the constant into the Values vector.
+static Value *GatherConstantSetNEs(Value *V, std::vector<ConstantInt*> &Values){
+  if (Instruction *Inst = dyn_cast<Instruction>(V)) {
+    if (Inst->getOpcode() == Instruction::ICmp &&
+               cast<ICmpInst>(Inst)->getPredicate() == ICmpInst::ICMP_NE) {
+      if (ConstantInt *C = dyn_cast<ConstantInt>(Inst->getOperand(1))) {
+        Values.push_back(C);
+        return Inst->getOperand(0);
+      } else if (ConstantInt *C = dyn_cast<ConstantInt>(Inst->getOperand(0))) {
+        Values.push_back(C);
+        return Inst->getOperand(1);
+      }
+    } else if (Inst->getOpcode() == Instruction::And) {
+      if (Value *LHS = GatherConstantSetNEs(Inst->getOperand(0), Values))
+        if (Value *RHS = GatherConstantSetNEs(Inst->getOperand(1), Values))
+          if (LHS == RHS)
+            return LHS;
+    }
+  }
+  return 0;
+}
+
+/// GatherValueComparisons - If the specified Cond is an 'and' or 'or' of a
+/// bunch of comparisons of one value against constants, return the value and
+/// the constants being compared.
+static bool GatherValueComparisons(Instruction *Cond, Value *&CompVal,
+                                   std::vector<ConstantInt*> &Values) {
+  if (Cond->getOpcode() == Instruction::Or) {
+    CompVal = GatherConstantSetEQs(Cond, Values);
+
+    // Return true to indicate that the condition is true if the CompVal is
+    // equal to one of the constants.
+    return true;
+  } else if (Cond->getOpcode() == Instruction::And) {
+    CompVal = GatherConstantSetNEs(Cond, Values);
+
+    // Return false to indicate that the condition is false if the CompVal is
+    // equal to one of the constants.
+    return false;
+  }
+  return false;
+}
+
+static void EraseTerminatorInstAndDCECond(TerminatorInst *TI) {
+  Instruction* Cond = 0;
+  if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+    Cond = dyn_cast<Instruction>(SI->getCondition());
+  } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+    if (BI->isConditional())
+      Cond = dyn_cast<Instruction>(BI->getCondition());
+  }
+
+  TI->eraseFromParent();
+  if (Cond) RecursivelyDeleteTriviallyDeadInstructions(Cond);
+}
+
+/// isValueEqualityComparison - Return true if the specified terminator checks
+/// to see if a value is equal to constant integer value.
+static Value *isValueEqualityComparison(TerminatorInst *TI) {
+  if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+    // Do not permit merging of large switch instructions into their
+    // predecessors unless there is only one predecessor.
+    if (SI->getNumSuccessors() * std::distance(pred_begin(SI->getParent()),
+                                               pred_end(SI->getParent())) > 128)
+      return 0;
+
+    return SI->getCondition();
+  }
+  if (BranchInst *BI = dyn_cast<BranchInst>(TI))
+    if (BI->isConditional() && BI->getCondition()->hasOneUse())
+      if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition()))
+        if ((ICI->getPredicate() == ICmpInst::ICMP_EQ ||
+             ICI->getPredicate() == ICmpInst::ICMP_NE) &&
+            isa<ConstantInt>(ICI->getOperand(1)))
+          return ICI->getOperand(0);
+  return 0;
+}
+
+/// GetValueEqualityComparisonCases - Given a value comparison instruction,
+/// decode all of the 'cases' that it represents and return the 'default' block.
+static BasicBlock *
+GetValueEqualityComparisonCases(TerminatorInst *TI,
+                                std::vector<std::pair<ConstantInt*,
+                                                      BasicBlock*> > &Cases) {
+  if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+    Cases.reserve(SI->getNumCases());
+    for (unsigned i = 1, e = SI->getNumCases(); i != e; ++i)
+      Cases.push_back(std::make_pair(SI->getCaseValue(i), SI->getSuccessor(i)));
+    return SI->getDefaultDest();
+  }
+
+  BranchInst *BI = cast<BranchInst>(TI);
+  ICmpInst *ICI = cast<ICmpInst>(BI->getCondition());
+  Cases.push_back(std::make_pair(cast<ConstantInt>(ICI->getOperand(1)),
+                                 BI->getSuccessor(ICI->getPredicate() ==
+                                                  ICmpInst::ICMP_NE)));
+  return BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_EQ);
+}
+
+
+/// EliminateBlockCases - Given a vector of bb/value pairs, remove any entries
+/// in the list that match the specified block.
+static void EliminateBlockCases(BasicBlock *BB,
+               std::vector<std::pair<ConstantInt*, BasicBlock*> > &Cases) {
+  for (unsigned i = 0, e = Cases.size(); i != e; ++i)
+    if (Cases[i].second == BB) {
+      Cases.erase(Cases.begin()+i);
+      --i; --e;
+    }
+}
+
+/// ValuesOverlap - Return true if there are any keys in C1 that exist in C2 as
+/// well.
+static bool
+ValuesOverlap(std::vector<std::pair<ConstantInt*, BasicBlock*> > &C1,
+              std::vector<std::pair<ConstantInt*, BasicBlock*> > &C2) {
+  std::vector<std::pair<ConstantInt*, BasicBlock*> > *V1 = &C1, *V2 = &C2;
+
+  // Make V1 be smaller than V2.
+  if (V1->size() > V2->size())
+    std::swap(V1, V2);
+
+  if (V1->size() == 0) return false;
+  if (V1->size() == 1) {
+    // Just scan V2.
+    ConstantInt *TheVal = (*V1)[0].first;
+    for (unsigned i = 0, e = V2->size(); i != e; ++i)
+      if (TheVal == (*V2)[i].first)
+        return true;
+  }
+
+  // Otherwise, just sort both lists and compare element by element.
+  std::sort(V1->begin(), V1->end());
+  std::sort(V2->begin(), V2->end());
+  unsigned i1 = 0, i2 = 0, e1 = V1->size(), e2 = V2->size();
+  while (i1 != e1 && i2 != e2) {
+    if ((*V1)[i1].first == (*V2)[i2].first)
+      return true;
+    if ((*V1)[i1].first < (*V2)[i2].first)
+      ++i1;
+    else
+      ++i2;
+  }
+  return false;
+}
+
+/// SimplifyEqualityComparisonWithOnlyPredecessor - If TI is known to be a
+/// terminator instruction and its block is known to only have a single
+/// predecessor block, check to see if that predecessor is also a value
+/// comparison with the same value, and if that comparison determines the
+/// outcome of this comparison.  If so, simplify TI.  This does a very limited
+/// form of jump threading.
+static bool SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
+                                                          BasicBlock *Pred) {
+  Value *PredVal = isValueEqualityComparison(Pred->getTerminator());
+  if (!PredVal) return false;  // Not a value comparison in predecessor.
+
+  Value *ThisVal = isValueEqualityComparison(TI);
+  assert(ThisVal && "This isn't a value comparison!!");
+  if (ThisVal != PredVal) return false;  // Different predicates.
+
+  // Find out information about when control will move from Pred to TI's block.
+  std::vector<std::pair<ConstantInt*, BasicBlock*> > PredCases;
+  BasicBlock *PredDef = GetValueEqualityComparisonCases(Pred->getTerminator(),
+                                                        PredCases);
+  EliminateBlockCases(PredDef, PredCases);  // Remove default from cases.
+
+  // Find information about how control leaves this block.
+  std::vector<std::pair<ConstantInt*, BasicBlock*> > ThisCases;
+  BasicBlock *ThisDef = GetValueEqualityComparisonCases(TI, ThisCases);
+  EliminateBlockCases(ThisDef, ThisCases);  // Remove default from cases.
+
+  // If TI's block is the default block from Pred's comparison, potentially
+  // simplify TI based on this knowledge.
+  if (PredDef == TI->getParent()) {
+    // If we are here, we know that the value is none of those cases listed in
+    // PredCases.  If there are any cases in ThisCases that are in PredCases, we
+    // can simplify TI.
+    if (ValuesOverlap(PredCases, ThisCases)) {
+      if (isa<BranchInst>(TI)) {
+        // Okay, one of the successors of this condbr is dead.  Convert it to a
+        // uncond br.
+        assert(ThisCases.size() == 1 && "Branch can only have one case!");
+        // Insert the new branch.
+        Instruction *NI = BranchInst::Create(ThisDef, TI);
+
+        // Remove PHI node entries for the dead edge.
+        ThisCases[0].second->removePredecessor(TI->getParent());
+
+        DOUT << "Threading pred instr: " << *Pred->getTerminator()
+             << "Through successor TI: " << *TI << "Leaving: " << *NI << "\n";
+
+        EraseTerminatorInstAndDCECond(TI);
+        return true;
+
+      } else {
+        SwitchInst *SI = cast<SwitchInst>(TI);
+        // Okay, TI has cases that are statically dead, prune them away.
+        SmallPtrSet<Constant*, 16> DeadCases;
+        for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
+          DeadCases.insert(PredCases[i].first);
+
+        DOUT << "Threading pred instr: " << *Pred->getTerminator()
+             << "Through successor TI: " << *TI;
+
+        for (unsigned i = SI->getNumCases()-1; i != 0; --i)
+          if (DeadCases.count(SI->getCaseValue(i))) {
+            SI->getSuccessor(i)->removePredecessor(TI->getParent());
+            SI->removeCase(i);
+          }
+
+        DOUT << "Leaving: " << *TI << "\n";
+        return true;
+      }
+    }
+
+  } else {
+    // Otherwise, TI's block must correspond to some matched value.  Find out
+    // which value (or set of values) this is.
+    ConstantInt *TIV = 0;
+    BasicBlock *TIBB = TI->getParent();
+    for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
+      if (PredCases[i].second == TIBB) {
+        if (TIV == 0)
+          TIV = PredCases[i].first;
+        else
+          return false;  // Cannot handle multiple values coming to this block.
+      }
+    assert(TIV && "No edge from pred to succ?");
+
+    // Okay, we found the one constant that our value can be if we get into TI's
+    // BB.  Find out which successor will unconditionally be branched to.
+    BasicBlock *TheRealDest = 0;
+    for (unsigned i = 0, e = ThisCases.size(); i != e; ++i)
+      if (ThisCases[i].first == TIV) {
+        TheRealDest = ThisCases[i].second;
+        break;
+      }
+
+    // If not handled by any explicit cases, it is handled by the default case.
+    if (TheRealDest == 0) TheRealDest = ThisDef;
+
+    // Remove PHI node entries for dead edges.
+    BasicBlock *CheckEdge = TheRealDest;
+    for (succ_iterator SI = succ_begin(TIBB), e = succ_end(TIBB); SI != e; ++SI)
+      if (*SI != CheckEdge)
+        (*SI)->removePredecessor(TIBB);
+      else
+        CheckEdge = 0;
+
+    // Insert the new branch.
+    Instruction *NI = BranchInst::Create(TheRealDest, TI);
+
+    DOUT << "Threading pred instr: " << *Pred->getTerminator()
+         << "Through successor TI: " << *TI << "Leaving: " << *NI << "\n";
+
+    EraseTerminatorInstAndDCECond(TI);
+    return true;
+  }
+  return false;
+}
+
+namespace {
+  /// ConstantIntOrdering - This class implements a stable ordering of constant
+  /// integers that does not depend on their address.  This is important for
+  /// applications that sort ConstantInt's to ensure uniqueness.
+  struct ConstantIntOrdering {
+    bool operator()(const ConstantInt *LHS, const ConstantInt *RHS) const {
+      return LHS->getValue().ult(RHS->getValue());
+    }
+  };
+}
+
+/// FoldValueComparisonIntoPredecessors - The specified terminator is a value
+/// equality comparison instruction (either a switch or a branch on "X == c").
+/// See if any of the predecessors of the terminator block are value comparisons
+/// on the same value.  If so, and if safe to do so, fold them together.
+static bool FoldValueComparisonIntoPredecessors(TerminatorInst *TI) {
+  BasicBlock *BB = TI->getParent();
+  Value *CV = isValueEqualityComparison(TI);  // CondVal
+  assert(CV && "Not a comparison?");
+  bool Changed = false;
+
+  SmallVector<BasicBlock*, 16> Preds(pred_begin(BB), pred_end(BB));
+  while (!Preds.empty()) {
+    BasicBlock *Pred = Preds.pop_back_val();
+
+    // See if the predecessor is a comparison with the same value.
+    TerminatorInst *PTI = Pred->getTerminator();
+    Value *PCV = isValueEqualityComparison(PTI);  // PredCondVal
+
+    if (PCV == CV && SafeToMergeTerminators(TI, PTI)) {
+      // Figure out which 'cases' to copy from SI to PSI.
+      std::vector<std::pair<ConstantInt*, BasicBlock*> > BBCases;
+      BasicBlock *BBDefault = GetValueEqualityComparisonCases(TI, BBCases);
+
+      std::vector<std::pair<ConstantInt*, BasicBlock*> > PredCases;
+      BasicBlock *PredDefault = GetValueEqualityComparisonCases(PTI, PredCases);
+
+      // Based on whether the default edge from PTI goes to BB or not, fill in
+      // PredCases and PredDefault with the new switch cases we would like to
+      // build.
+      SmallVector<BasicBlock*, 8> NewSuccessors;
+
+      if (PredDefault == BB) {
+        // If this is the default destination from PTI, only the edges in TI
+        // that don't occur in PTI, or that branch to BB will be activated.
+        std::set<ConstantInt*, ConstantIntOrdering> PTIHandled;
+        for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
+          if (PredCases[i].second != BB)
+            PTIHandled.insert(PredCases[i].first);
+          else {
+            // The default destination is BB, we don't need explicit targets.
+            std::swap(PredCases[i], PredCases.back());
+            PredCases.pop_back();
+            --i; --e;
+          }
+
+        // Reconstruct the new switch statement we will be building.
+        if (PredDefault != BBDefault) {
+          PredDefault->removePredecessor(Pred);
+          PredDefault = BBDefault;
+          NewSuccessors.push_back(BBDefault);
+        }
+        for (unsigned i = 0, e = BBCases.size(); i != e; ++i)
+          if (!PTIHandled.count(BBCases[i].first) &&
+              BBCases[i].second != BBDefault) {
+            PredCases.push_back(BBCases[i]);
+            NewSuccessors.push_back(BBCases[i].second);
+          }
+
+      } else {
+        // If this is not the default destination from PSI, only the edges
+        // in SI that occur in PSI with a destination of BB will be
+        // activated.
+        std::set<ConstantInt*, ConstantIntOrdering> PTIHandled;
+        for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
+          if (PredCases[i].second == BB) {
+            PTIHandled.insert(PredCases[i].first);
+            std::swap(PredCases[i], PredCases.back());
+            PredCases.pop_back();
+            --i; --e;
+          }
+
+        // Okay, now we know which constants were sent to BB from the
+        // predecessor.  Figure out where they will all go now.
+        for (unsigned i = 0, e = BBCases.size(); i != e; ++i)
+          if (PTIHandled.count(BBCases[i].first)) {
+            // If this is one we are capable of getting...
+            PredCases.push_back(BBCases[i]);
+            NewSuccessors.push_back(BBCases[i].second);
+            PTIHandled.erase(BBCases[i].first);// This constant is taken care of
+          }
+
+        // If there are any constants vectored to BB that TI doesn't handle,
+        // they must go to the default destination of TI.
+        for (std::set<ConstantInt*, ConstantIntOrdering>::iterator I = 
+                                    PTIHandled.begin(),
+               E = PTIHandled.end(); I != E; ++I) {
+          PredCases.push_back(std::make_pair(*I, BBDefault));
+          NewSuccessors.push_back(BBDefault);
+        }
+      }
+
+      // Okay, at this point, we know which new successor Pred will get.  Make
+      // sure we update the number of entries in the PHI nodes for these
+      // successors.
+      for (unsigned i = 0, e = NewSuccessors.size(); i != e; ++i)
+        AddPredecessorToBlock(NewSuccessors[i], Pred, BB);
+
+      // Now that the successors are updated, create the new Switch instruction.
+      SwitchInst *NewSI = SwitchInst::Create(CV, PredDefault,
+                                             PredCases.size(), PTI);
+      for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
+        NewSI->addCase(PredCases[i].first, PredCases[i].second);
+
+      EraseTerminatorInstAndDCECond(PTI);
+
+      // Okay, last check.  If BB is still a successor of PSI, then we must
+      // have an infinite loop case.  If so, add an infinitely looping block
+      // to handle the case to preserve the behavior of the code.
+      BasicBlock *InfLoopBlock = 0;
+      for (unsigned i = 0, e = NewSI->getNumSuccessors(); i != e; ++i)
+        if (NewSI->getSuccessor(i) == BB) {
+          if (InfLoopBlock == 0) {
+            // Insert it at the end of the function, because it's either code,
+            // or it won't matter if it's hot. :)
+            InfLoopBlock = BasicBlock::Create("infloop", BB->getParent());
+            BranchInst::Create(InfLoopBlock, InfLoopBlock);
+          }
+          NewSI->setSuccessor(i, InfLoopBlock);
+        }
+
+      Changed = true;
+    }
+  }
+  return Changed;
+}
+
+/// HoistThenElseCodeToIf - Given a conditional branch that goes to BB1 and
+/// BB2, hoist any common code in the two blocks up into the branch block.  The
+/// caller of this function guarantees that BI's block dominates BB1 and BB2.
+static bool HoistThenElseCodeToIf(BranchInst *BI) {
+  // This does very trivial matching, with limited scanning, to find identical
+  // instructions in the two blocks.  In particular, we don't want to get into
+  // O(M*N) situations here where M and N are the sizes of BB1 and BB2.  As
+  // such, we currently just scan for obviously identical instructions in an
+  // identical order.
+  BasicBlock *BB1 = BI->getSuccessor(0);  // The true destination.
+  BasicBlock *BB2 = BI->getSuccessor(1);  // The false destination
+
+  BasicBlock::iterator BB1_Itr = BB1->begin();
+  BasicBlock::iterator BB2_Itr = BB2->begin();
+
+  Instruction *I1 = BB1_Itr++, *I2 = BB2_Itr++;
+  while (isa<DbgInfoIntrinsic>(I1))
+    I1 = BB1_Itr++;
+  while (isa<DbgInfoIntrinsic>(I2))
+    I2 = BB2_Itr++;
+  if (I1->getOpcode() != I2->getOpcode() || isa<PHINode>(I1) || 
+      isa<InvokeInst>(I1) || !I1->isIdenticalTo(I2))
+    return false;
+
+  // If we get here, we can hoist at least one instruction.
+  BasicBlock *BIParent = BI->getParent();
+
+  do {
+    // If we are hoisting the terminator instruction, don't move one (making a
+    // broken BB), instead clone it, and remove BI.
+    if (isa<TerminatorInst>(I1))
+      goto HoistTerminator;
+
+    // For a normal instruction, we just move one to right before the branch,
+    // then replace all uses of the other with the first.  Finally, we remove
+    // the now redundant second instruction.
+    BIParent->getInstList().splice(BI, BB1->getInstList(), I1);
+    if (!I2->use_empty())
+      I2->replaceAllUsesWith(I1);
+    BB2->getInstList().erase(I2);
+
+    I1 = BB1_Itr++;
+    while (isa<DbgInfoIntrinsic>(I1))
+      I1 = BB1_Itr++;
+    I2 = BB2_Itr++;
+    while (isa<DbgInfoIntrinsic>(I2))
+      I2 = BB2_Itr++;
+  } while (I1->getOpcode() == I2->getOpcode() && I1->isIdenticalTo(I2));
+
+  return true;
+
+HoistTerminator:
+  // Okay, it is safe to hoist the terminator.
+  Instruction *NT = I1->clone();
+  BIParent->getInstList().insert(BI, NT);
+  if (NT->getType() != Type::VoidTy) {
+    I1->replaceAllUsesWith(NT);
+    I2->replaceAllUsesWith(NT);
+    NT->takeName(I1);
+  }
+
+  // Hoisting one of the terminators from our successor is a great thing.
+  // Unfortunately, the successors of the if/else blocks may have PHI nodes in
+  // them.  If they do, all PHI entries for BB1/BB2 must agree for all PHI
+  // nodes, so we insert select instruction to compute the final result.
+  std::map<std::pair<Value*,Value*>, SelectInst*> InsertedSelects;
+  for (succ_iterator SI = succ_begin(BB1), E = succ_end(BB1); SI != E; ++SI) {
+    PHINode *PN;
+    for (BasicBlock::iterator BBI = SI->begin();
+         (PN = dyn_cast<PHINode>(BBI)); ++BBI) {
+      Value *BB1V = PN->getIncomingValueForBlock(BB1);
+      Value *BB2V = PN->getIncomingValueForBlock(BB2);
+      if (BB1V != BB2V) {
+        // These values do not agree.  Insert a select instruction before NT
+        // that determines the right value.
+        SelectInst *&SI = InsertedSelects[std::make_pair(BB1V, BB2V)];
+        if (SI == 0)
+          SI = SelectInst::Create(BI->getCondition(), BB1V, BB2V,
+                                  BB1V->getName()+"."+BB2V->getName(), NT);
+        // Make the PHI node use the select for all incoming values for BB1/BB2
+        for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+          if (PN->getIncomingBlock(i) == BB1 || PN->getIncomingBlock(i) == BB2)
+            PN->setIncomingValue(i, SI);
+      }
+    }
+  }
+
+  // Update any PHI nodes in our new successors.
+  for (succ_iterator SI = succ_begin(BB1), E = succ_end(BB1); SI != E; ++SI)
+    AddPredecessorToBlock(*SI, BIParent, BB1);
+
+  EraseTerminatorInstAndDCECond(BI);
+  return true;
+}
+
+/// SpeculativelyExecuteBB - Given a conditional branch that goes to BB1
+/// and an BB2 and the only successor of BB1 is BB2, hoist simple code
+/// (for now, restricted to a single instruction that's side effect free) from
+/// the BB1 into the branch block to speculatively execute it.
+static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *BB1) {
+  // Only speculatively execution a single instruction (not counting the
+  // terminator) for now.
+  Instruction *HInst = NULL;
+  Instruction *Term = BB1->getTerminator();
+  for (BasicBlock::iterator BBI = BB1->begin(), BBE = BB1->end();
+       BBI != BBE; ++BBI) {
+    Instruction *I = BBI;
+    // Skip debug info.
+    if (isa<DbgInfoIntrinsic>(I))   continue;
+    if (I == Term)  break;
+
+    if (!HInst)
+      HInst = I;
+    else
+      return false;
+  }
+  if (!HInst)
+    return false;
+
+  // Be conservative for now. FP select instruction can often be expensive.
+  Value *BrCond = BI->getCondition();
+  if (isa<Instruction>(BrCond) &&
+      cast<Instruction>(BrCond)->getOpcode() == Instruction::FCmp)
+    return false;
+
+  // If BB1 is actually on the false edge of the conditional branch, remember
+  // to swap the select operands later.
+  bool Invert = false;
+  if (BB1 != BI->getSuccessor(0)) {
+    assert(BB1 == BI->getSuccessor(1) && "No edge from 'if' block?");
+    Invert = true;
+  }
+
+  // Turn
+  // BB:
+  //     %t1 = icmp
+  //     br i1 %t1, label %BB1, label %BB2
+  // BB1:
+  //     %t3 = add %t2, c
+  //     br label BB2
+  // BB2:
+  // =>
+  // BB:
+  //     %t1 = icmp
+  //     %t4 = add %t2, c
+  //     %t3 = select i1 %t1, %t2, %t3
+  switch (HInst->getOpcode()) {
+  default: return false;  // Not safe / profitable to hoist.
+  case Instruction::Add:
+  case Instruction::Sub:
+    // FP arithmetic might trap. Not worth doing for vector ops.
+    if (HInst->getType()->isFloatingPoint() 
+        || isa<VectorType>(HInst->getType()))
+      return false;
+    break;
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    // Don't mess with vector operations.
+    if (isa<VectorType>(HInst->getType()))
+      return false;
+    break;   // These are all cheap and non-trapping instructions.
+  }
+  
+  // If the instruction is obviously dead, don't try to predicate it.
+  if (HInst->use_empty()) {
+    HInst->eraseFromParent();
+    return true;
+  }
+
+  // Can we speculatively execute the instruction? And what is the value 
+  // if the condition is false? Consider the phi uses, if the incoming value
+  // from the "if" block are all the same V, then V is the value of the
+  // select if the condition is false.
+  BasicBlock *BIParent = BI->getParent();
+  SmallVector<PHINode*, 4> PHIUses;
+  Value *FalseV = NULL;
+  
+  BasicBlock *BB2 = BB1->getTerminator()->getSuccessor(0);
+  for (Value::use_iterator UI = HInst->use_begin(), E = HInst->use_end();
+       UI != E; ++UI) {
+    // Ignore any user that is not a PHI node in BB2.  These can only occur in
+    // unreachable blocks, because they would not be dominated by the instr.
+    PHINode *PN = dyn_cast<PHINode>(UI);
+    if (!PN || PN->getParent() != BB2)
+      return false;
+    PHIUses.push_back(PN);
+    
+    Value *PHIV = PN->getIncomingValueForBlock(BIParent);
+    if (!FalseV)
+      FalseV = PHIV;
+    else if (FalseV != PHIV)
+      return false;  // Inconsistent value when condition is false.
+  }
+  
+  assert(FalseV && "Must have at least one user, and it must be a PHI");
+
+  // Do not hoist the instruction if any of its operands are defined but not
+  // used in this BB. The transformation will prevent the operand from
+  // being sunk into the use block.
+  for (User::op_iterator i = HInst->op_begin(), e = HInst->op_end(); 
+       i != e; ++i) {
+    Instruction *OpI = dyn_cast<Instruction>(*i);
+    if (OpI && OpI->getParent() == BIParent &&
+        !OpI->isUsedInBasicBlock(BIParent))
+      return false;
+  }
+
+  // If we get here, we can hoist the instruction. Try to place it
+  // before the icmp instruction preceding the conditional branch.
+  BasicBlock::iterator InsertPos = BI;
+  if (InsertPos != BIParent->begin())
+    --InsertPos;
+  // Skip debug info between condition and branch.
+  while (InsertPos != BIParent->begin() && isa<DbgInfoIntrinsic>(InsertPos))
+    --InsertPos;
+  if (InsertPos == BrCond && !isa<PHINode>(BrCond)) {
+    SmallPtrSet<Instruction *, 4> BB1Insns;
+    for(BasicBlock::iterator BB1I = BB1->begin(), BB1E = BB1->end(); 
+        BB1I != BB1E; ++BB1I) 
+      BB1Insns.insert(BB1I);
+    for(Value::use_iterator UI = BrCond->use_begin(), UE = BrCond->use_end();
+        UI != UE; ++UI) {
+      Instruction *Use = cast<Instruction>(*UI);
+      if (BB1Insns.count(Use)) {
+        // If BrCond uses the instruction that place it just before
+        // branch instruction.
+        InsertPos = BI;
+        break;
+      }
+    }
+  } else
+    InsertPos = BI;
+  BIParent->getInstList().splice(InsertPos, BB1->getInstList(), HInst);
+
+  // Create a select whose true value is the speculatively executed value and
+  // false value is the previously determined FalseV.
+  SelectInst *SI;
+  if (Invert)
+    SI = SelectInst::Create(BrCond, FalseV, HInst,
+                            FalseV->getName() + "." + HInst->getName(), BI);
+  else
+    SI = SelectInst::Create(BrCond, HInst, FalseV,
+                            HInst->getName() + "." + FalseV->getName(), BI);
+
+  // Make the PHI node use the select for all incoming values for "then" and
+  // "if" blocks.
+  for (unsigned i = 0, e = PHIUses.size(); i != e; ++i) {
+    PHINode *PN = PHIUses[i];
+    for (unsigned j = 0, ee = PN->getNumIncomingValues(); j != ee; ++j)
+      if (PN->getIncomingBlock(j) == BB1 ||
+          PN->getIncomingBlock(j) == BIParent)
+        PN->setIncomingValue(j, SI);
+  }
+
+  ++NumSpeculations;
+  return true;
+}
+
+/// BlockIsSimpleEnoughToThreadThrough - Return true if we can thread a branch
+/// across this block.
+static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
+  BranchInst *BI = cast<BranchInst>(BB->getTerminator());
+  unsigned Size = 0;
+  
+  for (BasicBlock::iterator BBI = BB->begin(); &*BBI != BI; ++BBI) {
+    if (isa<DbgInfoIntrinsic>(BBI))
+      continue;
+    if (Size > 10) return false;  // Don't clone large BB's.
+    ++Size;
+    
+    // We can only support instructions that do not define values that are
+    // live outside of the current basic block.
+    for (Value::use_iterator UI = BBI->use_begin(), E = BBI->use_end();
+         UI != E; ++UI) {
+      Instruction *U = cast<Instruction>(*UI);
+      if (U->getParent() != BB || isa<PHINode>(U)) return false;
+    }
+    
+    // Looks ok, continue checking.
+  }
+
+  return true;
+}
+
+/// FoldCondBranchOnPHI - If we have a conditional branch on a PHI node value
+/// that is defined in the same block as the branch and if any PHI entries are
+/// constants, thread edges corresponding to that entry to be branches to their
+/// ultimate destination.
+static bool FoldCondBranchOnPHI(BranchInst *BI) {
+  BasicBlock *BB = BI->getParent();
+  PHINode *PN = dyn_cast<PHINode>(BI->getCondition());
+  // NOTE: we currently cannot transform this case if the PHI node is used
+  // outside of the block.
+  if (!PN || PN->getParent() != BB || !PN->hasOneUse())
+    return false;
+  
+  // Degenerate case of a single entry PHI.
+  if (PN->getNumIncomingValues() == 1) {
+    FoldSingleEntryPHINodes(PN->getParent());
+    return true;    
+  }
+
+  // Now we know that this block has multiple preds and two succs.
+  if (!BlockIsSimpleEnoughToThreadThrough(BB)) return false;
+  
+  // Okay, this is a simple enough basic block.  See if any phi values are
+  // constants.
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+    ConstantInt *CB;
+    if ((CB = dyn_cast<ConstantInt>(PN->getIncomingValue(i))) &&
+        CB->getType() == Type::Int1Ty) {
+      // Okay, we now know that all edges from PredBB should be revectored to
+      // branch to RealDest.
+      BasicBlock *PredBB = PN->getIncomingBlock(i);
+      BasicBlock *RealDest = BI->getSuccessor(!CB->getZExtValue());
+      
+      if (RealDest == BB) continue;  // Skip self loops.
+      
+      // The dest block might have PHI nodes, other predecessors and other
+      // difficult cases.  Instead of being smart about this, just insert a new
+      // block that jumps to the destination block, effectively splitting
+      // the edge we are about to create.
+      BasicBlock *EdgeBB = BasicBlock::Create(RealDest->getName()+".critedge",
+                                              RealDest->getParent(), RealDest);
+      BranchInst::Create(RealDest, EdgeBB);
+      PHINode *PN;
+      for (BasicBlock::iterator BBI = RealDest->begin();
+           (PN = dyn_cast<PHINode>(BBI)); ++BBI) {
+        Value *V = PN->getIncomingValueForBlock(BB);
+        PN->addIncoming(V, EdgeBB);
+      }
+
+      // BB may have instructions that are being threaded over.  Clone these
+      // instructions into EdgeBB.  We know that there will be no uses of the
+      // cloned instructions outside of EdgeBB.
+      BasicBlock::iterator InsertPt = EdgeBB->begin();
+      std::map<Value*, Value*> TranslateMap;  // Track translated values.
+      for (BasicBlock::iterator BBI = BB->begin(); &*BBI != BI; ++BBI) {
+        if (PHINode *PN = dyn_cast<PHINode>(BBI)) {
+          TranslateMap[PN] = PN->getIncomingValueForBlock(PredBB);
+        } else {
+          // Clone the instruction.
+          Instruction *N = BBI->clone();
+          if (BBI->hasName()) N->setName(BBI->getName()+".c");
+          
+          // Update operands due to translation.
+          for (User::op_iterator i = N->op_begin(), e = N->op_end();
+               i != e; ++i) {
+            std::map<Value*, Value*>::iterator PI =
+              TranslateMap.find(*i);
+            if (PI != TranslateMap.end())
+              *i = PI->second;
+          }
+          
+          // Check for trivial simplification.
+          if (Constant *C = ConstantFoldInstruction(N)) {
+            TranslateMap[BBI] = C;
+            delete N;   // Constant folded away, don't need actual inst
+          } else {
+            // Insert the new instruction into its new home.
+            EdgeBB->getInstList().insert(InsertPt, N);
+            if (!BBI->use_empty())
+              TranslateMap[BBI] = N;
+          }
+        }
+      }
+
+      // Loop over all of the edges from PredBB to BB, changing them to branch
+      // to EdgeBB instead.
+      TerminatorInst *PredBBTI = PredBB->getTerminator();
+      for (unsigned i = 0, e = PredBBTI->getNumSuccessors(); i != e; ++i)
+        if (PredBBTI->getSuccessor(i) == BB) {
+          BB->removePredecessor(PredBB);
+          PredBBTI->setSuccessor(i, EdgeBB);
+        }
+      
+      // Recurse, simplifying any other constants.
+      return FoldCondBranchOnPHI(BI) | true;
+    }
+  }
+
+  return false;
+}
+
+/// FoldTwoEntryPHINode - Given a BB that starts with the specified two-entry
+/// PHI node, see if we can eliminate it.
+static bool FoldTwoEntryPHINode(PHINode *PN) {
+  // Ok, this is a two entry PHI node.  Check to see if this is a simple "if
+  // statement", which has a very simple dominance structure.  Basically, we
+  // are trying to find the condition that is being branched on, which
+  // subsequently causes this merge to happen.  We really want control
+  // dependence information for this check, but simplifycfg can't keep it up
+  // to date, and this catches most of the cases we care about anyway.
+  //
+  BasicBlock *BB = PN->getParent();
+  BasicBlock *IfTrue, *IfFalse;
+  Value *IfCond = GetIfCondition(BB, IfTrue, IfFalse);
+  if (!IfCond) return false;
+  
+  // Okay, we found that we can merge this two-entry phi node into a select.
+  // Doing so would require us to fold *all* two entry phi nodes in this block.
+  // At some point this becomes non-profitable (particularly if the target
+  // doesn't support cmov's).  Only do this transformation if there are two or
+  // fewer PHI nodes in this block.
+  unsigned NumPhis = 0;
+  for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ++NumPhis, ++I)
+    if (NumPhis > 2)
+      return false;
+  
+  DOUT << "FOUND IF CONDITION!  " << *IfCond << "  T: "
+       << IfTrue->getName() << "  F: " << IfFalse->getName() << "\n";
+  
+  // Loop over the PHI's seeing if we can promote them all to select
+  // instructions.  While we are at it, keep track of the instructions
+  // that need to be moved to the dominating block.
+  std::set<Instruction*> AggressiveInsts;
+  
+  BasicBlock::iterator AfterPHIIt = BB->begin();
+  while (isa<PHINode>(AfterPHIIt)) {
+    PHINode *PN = cast<PHINode>(AfterPHIIt++);
+    if (PN->getIncomingValue(0) == PN->getIncomingValue(1)) {
+      if (PN->getIncomingValue(0) != PN)
+        PN->replaceAllUsesWith(PN->getIncomingValue(0));
+      else
+        PN->replaceAllUsesWith(UndefValue::get(PN->getType()));
+    } else if (!DominatesMergePoint(PN->getIncomingValue(0), BB,
+                                    &AggressiveInsts) ||
+               !DominatesMergePoint(PN->getIncomingValue(1), BB,
+                                    &AggressiveInsts)) {
+      return false;
+    }
+  }
+  
+  // If we all PHI nodes are promotable, check to make sure that all
+  // instructions in the predecessor blocks can be promoted as well.  If
+  // not, we won't be able to get rid of the control flow, so it's not
+  // worth promoting to select instructions.
+  BasicBlock *DomBlock = 0, *IfBlock1 = 0, *IfBlock2 = 0;
+  PN = cast<PHINode>(BB->begin());
+  BasicBlock *Pred = PN->getIncomingBlock(0);
+  if (cast<BranchInst>(Pred->getTerminator())->isUnconditional()) {
+    IfBlock1 = Pred;
+    DomBlock = *pred_begin(Pred);
+    for (BasicBlock::iterator I = Pred->begin();
+         !isa<TerminatorInst>(I); ++I)
+      if (!AggressiveInsts.count(I) && !isa<DbgInfoIntrinsic>(I)) {
+        // This is not an aggressive instruction that we can promote.
+        // Because of this, we won't be able to get rid of the control
+        // flow, so the xform is not worth it.
+        return false;
+      }
+  }
+    
+  Pred = PN->getIncomingBlock(1);
+  if (cast<BranchInst>(Pred->getTerminator())->isUnconditional()) {
+    IfBlock2 = Pred;
+    DomBlock = *pred_begin(Pred);
+    for (BasicBlock::iterator I = Pred->begin();
+         !isa<TerminatorInst>(I); ++I)
+      if (!AggressiveInsts.count(I) && !isa<DbgInfoIntrinsic>(I)) {
+        // This is not an aggressive instruction that we can promote.
+        // Because of this, we won't be able to get rid of the control
+        // flow, so the xform is not worth it.
+        return false;
+      }
+  }
+      
+  // If we can still promote the PHI nodes after this gauntlet of tests,
+  // do all of the PHI's now.
+
+  // Move all 'aggressive' instructions, which are defined in the
+  // conditional parts of the if's up to the dominating block.
+  if (IfBlock1) {
+    DomBlock->getInstList().splice(DomBlock->getTerminator(),
+                                   IfBlock1->getInstList(),
+                                   IfBlock1->begin(),
+                                   IfBlock1->getTerminator());
+  }
+  if (IfBlock2) {
+    DomBlock->getInstList().splice(DomBlock->getTerminator(),
+                                   IfBlock2->getInstList(),
+                                   IfBlock2->begin(),
+                                   IfBlock2->getTerminator());
+  }
+  
+  while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
+    // Change the PHI node into a select instruction.
+    Value *TrueVal =
+      PN->getIncomingValue(PN->getIncomingBlock(0) == IfFalse);
+    Value *FalseVal =
+      PN->getIncomingValue(PN->getIncomingBlock(0) == IfTrue);
+    
+    Value *NV = SelectInst::Create(IfCond, TrueVal, FalseVal, "", AfterPHIIt);
+    PN->replaceAllUsesWith(NV);
+    NV->takeName(PN);
+    
+    BB->getInstList().erase(PN);
+  }
+  return true;
+}
+
+/// isTerminatorFirstRelevantInsn - Return true if Term is very first 
+/// instruction ignoring Phi nodes and dbg intrinsics.
+static bool isTerminatorFirstRelevantInsn(BasicBlock *BB, Instruction *Term) {
+  BasicBlock::iterator BBI = Term;
+  while (BBI != BB->begin()) {
+    --BBI;
+    if (!isa<DbgInfoIntrinsic>(BBI))
+      break;
+  }
+
+  if (isa<PHINode>(BBI) || &*BBI == Term || isa<DbgInfoIntrinsic>(BBI))
+    return true;
+  return false;
+}
+
+/// SimplifyCondBranchToTwoReturns - If we found a conditional branch that goes
+/// to two returning blocks, try to merge them together into one return,
+/// introducing a select if the return values disagree.
+static bool SimplifyCondBranchToTwoReturns(BranchInst *BI) {
+  assert(BI->isConditional() && "Must be a conditional branch");
+  BasicBlock *TrueSucc = BI->getSuccessor(0);
+  BasicBlock *FalseSucc = BI->getSuccessor(1);
+  ReturnInst *TrueRet = cast<ReturnInst>(TrueSucc->getTerminator());
+  ReturnInst *FalseRet = cast<ReturnInst>(FalseSucc->getTerminator());
+  
+  // Check to ensure both blocks are empty (just a return) or optionally empty
+  // with PHI nodes.  If there are other instructions, merging would cause extra
+  // computation on one path or the other.
+  if (!isTerminatorFirstRelevantInsn(TrueSucc, TrueRet))
+    return false;
+  if (!isTerminatorFirstRelevantInsn(FalseSucc, FalseRet))
+    return false;
+
+  // Okay, we found a branch that is going to two return nodes.  If
+  // there is no return value for this function, just change the
+  // branch into a return.
+  if (FalseRet->getNumOperands() == 0) {
+    TrueSucc->removePredecessor(BI->getParent());
+    FalseSucc->removePredecessor(BI->getParent());
+    ReturnInst::Create(0, BI);
+    EraseTerminatorInstAndDCECond(BI);
+    return true;
+  }
+    
+  // Otherwise, figure out what the true and false return values are
+  // so we can insert a new select instruction.
+  Value *TrueValue = TrueRet->getReturnValue();
+  Value *FalseValue = FalseRet->getReturnValue();
+  
+  // Unwrap any PHI nodes in the return blocks.
+  if (PHINode *TVPN = dyn_cast_or_null<PHINode>(TrueValue))
+    if (TVPN->getParent() == TrueSucc)
+      TrueValue = TVPN->getIncomingValueForBlock(BI->getParent());
+  if (PHINode *FVPN = dyn_cast_or_null<PHINode>(FalseValue))
+    if (FVPN->getParent() == FalseSucc)
+      FalseValue = FVPN->getIncomingValueForBlock(BI->getParent());
+  
+  // In order for this transformation to be safe, we must be able to
+  // unconditionally execute both operands to the return.  This is
+  // normally the case, but we could have a potentially-trapping
+  // constant expression that prevents this transformation from being
+  // safe.
+  if (ConstantExpr *TCV = dyn_cast_or_null<ConstantExpr>(TrueValue))
+    if (TCV->canTrap())
+      return false;
+  if (ConstantExpr *FCV = dyn_cast_or_null<ConstantExpr>(FalseValue))
+    if (FCV->canTrap())
+      return false;
+  
+  // Okay, we collected all the mapped values and checked them for sanity, and
+  // defined to really do this transformation.  First, update the CFG.
+  TrueSucc->removePredecessor(BI->getParent());
+  FalseSucc->removePredecessor(BI->getParent());
+  
+  // Insert select instructions where needed.
+  Value *BrCond = BI->getCondition();
+  if (TrueValue) {
+    // Insert a select if the results differ.
+    if (TrueValue == FalseValue || isa<UndefValue>(FalseValue)) {
+    } else if (isa<UndefValue>(TrueValue)) {
+      TrueValue = FalseValue;
+    } else {
+      TrueValue = SelectInst::Create(BrCond, TrueValue,
+                                     FalseValue, "retval", BI);
+    }
+  }
+
+  Value *RI = !TrueValue ?
+              ReturnInst::Create(BI) :
+              ReturnInst::Create(TrueValue, BI);
+      
+  DOUT << "\nCHANGING BRANCH TO TWO RETURNS INTO SELECT:"
+       << "\n  " << *BI << "NewRet = " << *RI
+       << "TRUEBLOCK: " << *TrueSucc << "FALSEBLOCK: "<< *FalseSucc;
+      
+  EraseTerminatorInstAndDCECond(BI);
+
+  return true;
+}
+
+/// FoldBranchToCommonDest - If this basic block is ONLY a setcc and a branch,
+/// and if a predecessor branches to us and one of our successors, fold the
+/// setcc into the predecessor and use logical operations to pick the right
+/// destination.
+static bool FoldBranchToCommonDest(BranchInst *BI) {
+  BasicBlock *BB = BI->getParent();
+  Instruction *Cond = dyn_cast<Instruction>(BI->getCondition());
+  if (Cond == 0) return false;
+
+  
+  // Only allow this if the condition is a simple instruction that can be
+  // executed unconditionally.  It must be in the same block as the branch, and
+  // must be at the front of the block.
+  BasicBlock::iterator FrontIt = BB->front();
+  // Ignore dbg intrinsics.
+  while(isa<DbgInfoIntrinsic>(FrontIt))
+    ++FrontIt;
+  if ((!isa<CmpInst>(Cond) && !isa<BinaryOperator>(Cond)) ||
+      Cond->getParent() != BB || &*FrontIt != Cond || !Cond->hasOneUse()) {
+    return false;
+  }
+  
+  // Make sure the instruction after the condition is the cond branch.
+  BasicBlock::iterator CondIt = Cond; ++CondIt;
+  // Ingore dbg intrinsics.
+  while(isa<DbgInfoIntrinsic>(CondIt))
+    ++CondIt;
+  if (&*CondIt != BI) {
+    assert (!isa<DbgInfoIntrinsic>(CondIt) && "Hey do not forget debug info!");
+    return false;
+  }
+
+  // Cond is known to be a compare or binary operator.  Check to make sure that
+  // neither operand is a potentially-trapping constant expression.
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Cond->getOperand(0)))
+    if (CE->canTrap())
+      return false;
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Cond->getOperand(1)))
+    if (CE->canTrap())
+      return false;
+  
+  
+  // Finally, don't infinitely unroll conditional loops.
+  BasicBlock *TrueDest  = BI->getSuccessor(0);
+  BasicBlock *FalseDest = BI->getSuccessor(1);
+  if (TrueDest == BB || FalseDest == BB)
+    return false;
+  
+  for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+    BasicBlock *PredBlock = *PI;
+    BranchInst *PBI = dyn_cast<BranchInst>(PredBlock->getTerminator());
+    
+    // Check that we have two conditional branches.  If there is a PHI node in
+    // the common successor, verify that the same value flows in from both
+    // blocks.
+    if (PBI == 0 || PBI->isUnconditional() ||
+        !SafeToMergeTerminators(BI, PBI))
+      continue;
+    
+    Instruction::BinaryOps Opc;
+    bool InvertPredCond = false;
+
+    if (PBI->getSuccessor(0) == TrueDest)
+      Opc = Instruction::Or;
+    else if (PBI->getSuccessor(1) == FalseDest)
+      Opc = Instruction::And;
+    else if (PBI->getSuccessor(0) == FalseDest)
+      Opc = Instruction::And, InvertPredCond = true;
+    else if (PBI->getSuccessor(1) == TrueDest)
+      Opc = Instruction::Or, InvertPredCond = true;
+    else
+      continue;
+
+    DOUT << "FOLDING BRANCH TO COMMON DEST:\n" << *PBI << *BB;
+    
+    // If we need to invert the condition in the pred block to match, do so now.
+    if (InvertPredCond) {
+      Value *NewCond =
+        BinaryOperator::CreateNot(PBI->getCondition(),
+                                  PBI->getCondition()->getName()+".not", PBI);
+      PBI->setCondition(NewCond);
+      BasicBlock *OldTrue = PBI->getSuccessor(0);
+      BasicBlock *OldFalse = PBI->getSuccessor(1);
+      PBI->setSuccessor(0, OldFalse);
+      PBI->setSuccessor(1, OldTrue);
+    }
+    
+    // Clone Cond into the predecessor basic block, and or/and the
+    // two conditions together.
+    Instruction *New = Cond->clone();
+    PredBlock->getInstList().insert(PBI, New);
+    New->takeName(Cond);
+    Cond->setName(New->getName()+".old");
+    
+    Value *NewCond = BinaryOperator::Create(Opc, PBI->getCondition(),
+                                            New, "or.cond", PBI);
+    PBI->setCondition(NewCond);
+    if (PBI->getSuccessor(0) == BB) {
+      AddPredecessorToBlock(TrueDest, PredBlock, BB);
+      PBI->setSuccessor(0, TrueDest);
+    }
+    if (PBI->getSuccessor(1) == BB) {
+      AddPredecessorToBlock(FalseDest, PredBlock, BB);
+      PBI->setSuccessor(1, FalseDest);
+    }
+    return true;
+  }
+  return false;
+}
+
+/// SimplifyCondBranchToCondBranch - If we have a conditional branch as a
+/// predecessor of another block, this function tries to simplify it.  We know
+/// that PBI and BI are both conditional branches, and BI is in one of the
+/// successor blocks of PBI - PBI branches to BI.
+static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {
+  assert(PBI->isConditional() && BI->isConditional());
+  BasicBlock *BB = BI->getParent();
+  
+  // If this block ends with a branch instruction, and if there is a
+  // predecessor that ends on a branch of the same condition, make 
+  // this conditional branch redundant.
+  if (PBI->getCondition() == BI->getCondition() &&
+      PBI->getSuccessor(0) != PBI->getSuccessor(1)) {
+    // Okay, the outcome of this conditional branch is statically
+    // knowable.  If this block had a single pred, handle specially.
+    if (BB->getSinglePredecessor()) {
+      // Turn this into a branch on constant.
+      bool CondIsTrue = PBI->getSuccessor(0) == BB;
+      BI->setCondition(ConstantInt::get(Type::Int1Ty, CondIsTrue));
+      return true;  // Nuke the branch on constant.
+    }
+    
+    // Otherwise, if there are multiple predecessors, insert a PHI that merges
+    // in the constant and simplify the block result.  Subsequent passes of
+    // simplifycfg will thread the block.
+    if (BlockIsSimpleEnoughToThreadThrough(BB)) {
+      PHINode *NewPN = PHINode::Create(Type::Int1Ty,
+                                       BI->getCondition()->getName() + ".pr",
+                                       BB->begin());
+      // Okay, we're going to insert the PHI node.  Since PBI is not the only
+      // predecessor, compute the PHI'd conditional value for all of the preds.
+      // Any predecessor where the condition is not computable we keep symbolic.
+      for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+        if ((PBI = dyn_cast<BranchInst>((*PI)->getTerminator())) &&
+            PBI != BI && PBI->isConditional() &&
+            PBI->getCondition() == BI->getCondition() &&
+            PBI->getSuccessor(0) != PBI->getSuccessor(1)) {
+          bool CondIsTrue = PBI->getSuccessor(0) == BB;
+          NewPN->addIncoming(ConstantInt::get(Type::Int1Ty, 
+                                              CondIsTrue), *PI);
+        } else {
+          NewPN->addIncoming(BI->getCondition(), *PI);
+        }
+      
+      BI->setCondition(NewPN);
+      return true;
+    }
+  }
+  
+  // If this is a conditional branch in an empty block, and if any
+  // predecessors is a conditional branch to one of our destinations,
+  // fold the conditions into logical ops and one cond br.
+  BasicBlock::iterator BBI = BB->begin();
+  // Ignore dbg intrinsics.
+  while (isa<DbgInfoIntrinsic>(BBI))
+    ++BBI;
+  if (&*BBI != BI)
+    return false;
+
+  
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(BI->getCondition()))
+    if (CE->canTrap())
+      return false;
+  
+  int PBIOp, BIOp;
+  if (PBI->getSuccessor(0) == BI->getSuccessor(0))
+    PBIOp = BIOp = 0;
+  else if (PBI->getSuccessor(0) == BI->getSuccessor(1))
+    PBIOp = 0, BIOp = 1;
+  else if (PBI->getSuccessor(1) == BI->getSuccessor(0))
+    PBIOp = 1, BIOp = 0;
+  else if (PBI->getSuccessor(1) == BI->getSuccessor(1))
+    PBIOp = BIOp = 1;
+  else
+    return false;
+    
+  // Check to make sure that the other destination of this branch
+  // isn't BB itself.  If so, this is an infinite loop that will
+  // keep getting unwound.
+  if (PBI->getSuccessor(PBIOp) == BB)
+    return false;
+    
+  // Do not perform this transformation if it would require 
+  // insertion of a large number of select instructions. For targets
+  // without predication/cmovs, this is a big pessimization.
+  BasicBlock *CommonDest = PBI->getSuccessor(PBIOp);
+      
+  unsigned NumPhis = 0;
+  for (BasicBlock::iterator II = CommonDest->begin();
+       isa<PHINode>(II); ++II, ++NumPhis)
+    if (NumPhis > 2) // Disable this xform.
+      return false;
+    
+  // Finally, if everything is ok, fold the branches to logical ops.
+  BasicBlock *OtherDest  = BI->getSuccessor(BIOp ^ 1);
+  
+  DOUT << "FOLDING BRs:" << *PBI->getParent()
+       << "AND: " << *BI->getParent();
+  
+  
+  // If OtherDest *is* BB, then BB is a basic block with a single conditional
+  // branch in it, where one edge (OtherDest) goes back to itself but the other
+  // exits.  We don't *know* that the program avoids the infinite loop
+  // (even though that seems likely).  If we do this xform naively, we'll end up
+  // recursively unpeeling the loop.  Since we know that (after the xform is
+  // done) that the block *is* infinite if reached, we just make it an obviously
+  // infinite loop with no cond branch.
+  if (OtherDest == BB) {
+    // Insert it at the end of the function, because it's either code,
+    // or it won't matter if it's hot. :)
+    BasicBlock *InfLoopBlock = BasicBlock::Create("infloop", BB->getParent());
+    BranchInst::Create(InfLoopBlock, InfLoopBlock);
+    OtherDest = InfLoopBlock;
+  }  
+  
+  DOUT << *PBI->getParent()->getParent();
+  
+  // BI may have other predecessors.  Because of this, we leave
+  // it alone, but modify PBI.
+  
+  // Make sure we get to CommonDest on True&True directions.
+  Value *PBICond = PBI->getCondition();
+  if (PBIOp)
+    PBICond = BinaryOperator::CreateNot(PBICond,
+                                        PBICond->getName()+".not",
+                                        PBI);
+  Value *BICond = BI->getCondition();
+  if (BIOp)
+    BICond = BinaryOperator::CreateNot(BICond,
+                                       BICond->getName()+".not",
+                                       PBI);
+  // Merge the conditions.
+  Value *Cond = BinaryOperator::CreateOr(PBICond, BICond, "brmerge", PBI);
+  
+  // Modify PBI to branch on the new condition to the new dests.
+  PBI->setCondition(Cond);
+  PBI->setSuccessor(0, CommonDest);
+  PBI->setSuccessor(1, OtherDest);
+  
+  // OtherDest may have phi nodes.  If so, add an entry from PBI's
+  // block that are identical to the entries for BI's block.
+  PHINode *PN;
+  for (BasicBlock::iterator II = OtherDest->begin();
+       (PN = dyn_cast<PHINode>(II)); ++II) {
+    Value *V = PN->getIncomingValueForBlock(BB);
+    PN->addIncoming(V, PBI->getParent());
+  }
+  
+  // We know that the CommonDest already had an edge from PBI to
+  // it.  If it has PHIs though, the PHIs may have different
+  // entries for BB and PBI's BB.  If so, insert a select to make
+  // them agree.
+  for (BasicBlock::iterator II = CommonDest->begin();
+       (PN = dyn_cast<PHINode>(II)); ++II) {
+    Value *BIV = PN->getIncomingValueForBlock(BB);
+    unsigned PBBIdx = PN->getBasicBlockIndex(PBI->getParent());
+    Value *PBIV = PN->getIncomingValue(PBBIdx);
+    if (BIV != PBIV) {
+      // Insert a select in PBI to pick the right value.
+      Value *NV = SelectInst::Create(PBICond, PBIV, BIV,
+                                     PBIV->getName()+".mux", PBI);
+      PN->setIncomingValue(PBBIdx, NV);
+    }
+  }
+  
+  DOUT << "INTO: " << *PBI->getParent();
+  
+  DOUT << *PBI->getParent()->getParent();
+  
+  // This basic block is probably dead.  We know it has at least
+  // one fewer predecessor.
+  return true;
+}
+
+
+/// SimplifyCFG - This function is used to do simplification of a CFG.  For
+/// example, it adjusts branches to branches to eliminate the extra hop, it
+/// eliminates unreachable basic blocks, and does other "peephole" optimization
+/// of the CFG.  It returns true if a modification was made.
+///
+/// WARNING:  The entry node of a function may not be simplified.
+///
+bool llvm::SimplifyCFG(BasicBlock *BB) {
+  bool Changed = false;
+  Function *M = BB->getParent();
+
+  assert(BB && BB->getParent() && "Block not embedded in function!");
+  assert(BB->getTerminator() && "Degenerate basic block encountered!");
+  assert(&BB->getParent()->getEntryBlock() != BB &&
+         "Can't Simplify entry block!");
+
+  // Remove basic blocks that have no predecessors... or that just have themself
+  // as a predecessor.  These are unreachable.
+  if (pred_begin(BB) == pred_end(BB) || BB->getSinglePredecessor() == BB) {
+    DOUT << "Removing BB: \n" << *BB;
+    DeleteDeadBlock(BB);
+    return true;
+  }
+
+  // Check to see if we can constant propagate this terminator instruction
+  // away...
+  Changed |= ConstantFoldTerminator(BB);
+
+  // If there is a trivial two-entry PHI node in this basic block, and we can
+  // eliminate it, do so now.
+  if (PHINode *PN = dyn_cast<PHINode>(BB->begin()))
+    if (PN->getNumIncomingValues() == 2)
+      Changed |= FoldTwoEntryPHINode(PN); 
+
+  // If this is a returning block with only PHI nodes in it, fold the return
+  // instruction into any unconditional branch predecessors.
+  //
+  // If any predecessor is a conditional branch that just selects among
+  // different return values, fold the replace the branch/return with a select
+  // and return.
+  if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
+    if (isTerminatorFirstRelevantInsn(BB, BB->getTerminator())) {
+      // Find predecessors that end with branches.
+      SmallVector<BasicBlock*, 8> UncondBranchPreds;
+      SmallVector<BranchInst*, 8> CondBranchPreds;
+      for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+        TerminatorInst *PTI = (*PI)->getTerminator();
+        if (BranchInst *BI = dyn_cast<BranchInst>(PTI)) {
+          if (BI->isUnconditional())
+            UncondBranchPreds.push_back(*PI);
+          else
+            CondBranchPreds.push_back(BI);
+        }
+      }
+
+      // If we found some, do the transformation!
+      if (!UncondBranchPreds.empty()) {
+        while (!UncondBranchPreds.empty()) {
+          BasicBlock *Pred = UncondBranchPreds.pop_back_val();
+          DOUT << "FOLDING: " << *BB
+               << "INTO UNCOND BRANCH PRED: " << *Pred;
+          Instruction *UncondBranch = Pred->getTerminator();
+          // Clone the return and add it to the end of the predecessor.
+          Instruction *NewRet = RI->clone();
+          Pred->getInstList().push_back(NewRet);
+
+          BasicBlock::iterator BBI = RI;
+          if (BBI != BB->begin()) {
+            // Move region end info into the predecessor.
+            if (DbgRegionEndInst *DREI = dyn_cast<DbgRegionEndInst>(--BBI))
+              DREI->moveBefore(NewRet);
+          }
+
+          // If the return instruction returns a value, and if the value was a
+          // PHI node in "BB", propagate the right value into the return.
+          for (User::op_iterator i = NewRet->op_begin(), e = NewRet->op_end();
+               i != e; ++i)
+            if (PHINode *PN = dyn_cast<PHINode>(*i))
+              if (PN->getParent() == BB)
+                *i = PN->getIncomingValueForBlock(Pred);
+          
+          // Update any PHI nodes in the returning block to realize that we no
+          // longer branch to them.
+          BB->removePredecessor(Pred);
+          Pred->getInstList().erase(UncondBranch);
+        }
+
+        // If we eliminated all predecessors of the block, delete the block now.
+        if (pred_begin(BB) == pred_end(BB))
+          // We know there are no successors, so just nuke the block.
+          M->getBasicBlockList().erase(BB);
+
+        return true;
+      }
+
+      // Check out all of the conditional branches going to this return
+      // instruction.  If any of them just select between returns, change the
+      // branch itself into a select/return pair.
+      while (!CondBranchPreds.empty()) {
+        BranchInst *BI = CondBranchPreds.pop_back_val();
+
+        // Check to see if the non-BB successor is also a return block.
+        if (isa<ReturnInst>(BI->getSuccessor(0)->getTerminator()) &&
+            isa<ReturnInst>(BI->getSuccessor(1)->getTerminator()) &&
+            SimplifyCondBranchToTwoReturns(BI))
+          return true;
+      }
+    }
+  } else if (isa<UnwindInst>(BB->begin())) {
+    // Check to see if the first instruction in this block is just an unwind.
+    // If so, replace any invoke instructions which use this as an exception
+    // destination with call instructions, and any unconditional branch
+    // predecessor with an unwind.
+    //
+    SmallVector<BasicBlock*, 8> Preds(pred_begin(BB), pred_end(BB));
+    while (!Preds.empty()) {
+      BasicBlock *Pred = Preds.back();
+      if (BranchInst *BI = dyn_cast<BranchInst>(Pred->getTerminator())) {
+        if (BI->isUnconditional()) {
+          Pred->getInstList().pop_back();  // nuke uncond branch
+          new UnwindInst(Pred);            // Use unwind.
+          Changed = true;
+        }
+      } else if (InvokeInst *II = dyn_cast<InvokeInst>(Pred->getTerminator()))
+        if (II->getUnwindDest() == BB) {
+          // Insert a new branch instruction before the invoke, because this
+          // is now a fall through...
+          BranchInst *BI = BranchInst::Create(II->getNormalDest(), II);
+          Pred->getInstList().remove(II);   // Take out of symbol table
+
+          // Insert the call now...
+          SmallVector<Value*,8> Args(II->op_begin()+3, II->op_end());
+          CallInst *CI = CallInst::Create(II->getCalledValue(),
+                                          Args.begin(), Args.end(),
+                                          II->getName(), BI);
+          CI->setCallingConv(II->getCallingConv());
+          CI->setAttributes(II->getAttributes());
+          // If the invoke produced a value, the Call now does instead
+          II->replaceAllUsesWith(CI);
+          delete II;
+          Changed = true;
+        }
+
+      Preds.pop_back();
+    }
+
+    // If this block is now dead, remove it.
+    if (pred_begin(BB) == pred_end(BB)) {
+      // We know there are no successors, so just nuke the block.
+      M->getBasicBlockList().erase(BB);
+      return true;
+    }
+
+  } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
+    if (isValueEqualityComparison(SI)) {
+      // If we only have one predecessor, and if it is a branch on this value,
+      // see if that predecessor totally determines the outcome of this switch.
+      if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
+        if (SimplifyEqualityComparisonWithOnlyPredecessor(SI, OnlyPred))
+          return SimplifyCFG(BB) || 1;
+
+      // If the block only contains the switch, see if we can fold the block
+      // away into any preds.
+      BasicBlock::iterator BBI = BB->begin();
+      // Ignore dbg intrinsics.
+      while (isa<DbgInfoIntrinsic>(BBI))
+        ++BBI;
+      if (SI == &*BBI)
+        if (FoldValueComparisonIntoPredecessors(SI))
+          return SimplifyCFG(BB) || 1;
+    }
+  } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
+    if (BI->isUnconditional()) {
+      BasicBlock::iterator BBI = BB->getFirstNonPHI();
+
+      BasicBlock *Succ = BI->getSuccessor(0);
+      // Ignore dbg intrinsics.
+      while (isa<DbgInfoIntrinsic>(BBI))
+        ++BBI;
+      if (BBI->isTerminator() &&  // Terminator is the only non-phi instruction!
+          Succ != BB)             // Don't hurt infinite loops!
+        if (TryToSimplifyUncondBranchFromEmptyBlock(BB, Succ))
+          return true;
+      
+    } else {  // Conditional branch
+      if (isValueEqualityComparison(BI)) {
+        // If we only have one predecessor, and if it is a branch on this value,
+        // see if that predecessor totally determines the outcome of this
+        // switch.
+        if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
+          if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred))
+            return SimplifyCFG(BB) || 1;
+
+        // This block must be empty, except for the setcond inst, if it exists.
+        // Ignore dbg intrinsics.
+        BasicBlock::iterator I = BB->begin();
+        // Ignore dbg intrinsics.
+        while (isa<DbgInfoIntrinsic>(I))
+          ++I;
+        if (&*I == BI) {
+          if (FoldValueComparisonIntoPredecessors(BI))
+            return SimplifyCFG(BB) | true;
+        } else if (&*I == cast<Instruction>(BI->getCondition())){
+          ++I;
+          // Ignore dbg intrinsics.
+          while (isa<DbgInfoIntrinsic>(I))
+            ++I;
+          if(&*I == BI) {
+            if (FoldValueComparisonIntoPredecessors(BI))
+              return SimplifyCFG(BB) | true;
+          }
+        }
+      }
+
+      // If this is a branch on a phi node in the current block, thread control
+      // through this block if any PHI node entries are constants.
+      if (PHINode *PN = dyn_cast<PHINode>(BI->getCondition()))
+        if (PN->getParent() == BI->getParent())
+          if (FoldCondBranchOnPHI(BI))
+            return SimplifyCFG(BB) | true;
+
+      // If this basic block is ONLY a setcc and a branch, and if a predecessor
+      // branches to us and one of our successors, fold the setcc into the
+      // predecessor and use logical operations to pick the right destination.
+      if (FoldBranchToCommonDest(BI))
+        return SimplifyCFG(BB) | 1;
+
+
+      // Scan predecessor blocks for conditional branches.
+      for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+        if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator()))
+          if (PBI != BI && PBI->isConditional())
+            if (SimplifyCondBranchToCondBranch(PBI, BI))
+              return SimplifyCFG(BB) | true;
+    }
+  } else if (isa<UnreachableInst>(BB->getTerminator())) {
+    // If there are any instructions immediately before the unreachable that can
+    // be removed, do so.
+    Instruction *Unreachable = BB->getTerminator();
+    while (Unreachable != BB->begin()) {
+      BasicBlock::iterator BBI = Unreachable;
+      --BBI;
+      // Do not delete instructions that can have side effects, like calls
+      // (which may never return) and volatile loads and stores.
+      if (isa<CallInst>(BBI) && !isa<DbgInfoIntrinsic>(BBI)) break;
+
+      if (StoreInst *SI = dyn_cast<StoreInst>(BBI))
+        if (SI->isVolatile())
+          break;
+
+      if (LoadInst *LI = dyn_cast<LoadInst>(BBI))
+        if (LI->isVolatile())
+          break;
+
+      // Delete this instruction
+      BB->getInstList().erase(BBI);
+      Changed = true;
+    }
+
+    // If the unreachable instruction is the first in the block, take a gander
+    // at all of the predecessors of this instruction, and simplify them.
+    if (&BB->front() == Unreachable) {
+      SmallVector<BasicBlock*, 8> Preds(pred_begin(BB), pred_end(BB));
+      for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
+        TerminatorInst *TI = Preds[i]->getTerminator();
+
+        if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+          if (BI->isUnconditional()) {
+            if (BI->getSuccessor(0) == BB) {
+              new UnreachableInst(TI);
+              TI->eraseFromParent();
+              Changed = true;
+            }
+          } else {
+            if (BI->getSuccessor(0) == BB) {
+              BranchInst::Create(BI->getSuccessor(1), BI);
+              EraseTerminatorInstAndDCECond(BI);
+            } else if (BI->getSuccessor(1) == BB) {
+              BranchInst::Create(BI->getSuccessor(0), BI);
+              EraseTerminatorInstAndDCECond(BI);
+              Changed = true;
+            }
+          }
+        } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+          for (unsigned i = 1, e = SI->getNumCases(); i != e; ++i)
+            if (SI->getSuccessor(i) == BB) {
+              BB->removePredecessor(SI->getParent());
+              SI->removeCase(i);
+              --i; --e;
+              Changed = true;
+            }
+          // If the default value is unreachable, figure out the most popular
+          // destination and make it the default.
+          if (SI->getSuccessor(0) == BB) {
+            std::map<BasicBlock*, unsigned> Popularity;
+            for (unsigned i = 1, e = SI->getNumCases(); i != e; ++i)
+              Popularity[SI->getSuccessor(i)]++;
+
+            // Find the most popular block.
+            unsigned MaxPop = 0;
+            BasicBlock *MaxBlock = 0;
+            for (std::map<BasicBlock*, unsigned>::iterator
+                   I = Popularity.begin(), E = Popularity.end(); I != E; ++I) {
+              if (I->second > MaxPop) {
+                MaxPop = I->second;
+                MaxBlock = I->first;
+              }
+            }
+            if (MaxBlock) {
+              // Make this the new default, allowing us to delete any explicit
+              // edges to it.
+              SI->setSuccessor(0, MaxBlock);
+              Changed = true;
+
+              // If MaxBlock has phinodes in it, remove MaxPop-1 entries from
+              // it.
+              if (isa<PHINode>(MaxBlock->begin()))
+                for (unsigned i = 0; i != MaxPop-1; ++i)
+                  MaxBlock->removePredecessor(SI->getParent());
+
+              for (unsigned i = 1, e = SI->getNumCases(); i != e; ++i)
+                if (SI->getSuccessor(i) == MaxBlock) {
+                  SI->removeCase(i);
+                  --i; --e;
+                }
+            }
+          }
+        } else if (InvokeInst *II = dyn_cast<InvokeInst>(TI)) {
+          if (II->getUnwindDest() == BB) {
+            // Convert the invoke to a call instruction.  This would be a good
+            // place to note that the call does not throw though.
+            BranchInst *BI = BranchInst::Create(II->getNormalDest(), II);
+            II->removeFromParent();   // Take out of symbol table
+
+            // Insert the call now...
+            SmallVector<Value*, 8> Args(II->op_begin()+3, II->op_end());
+            CallInst *CI = CallInst::Create(II->getCalledValue(),
+                                            Args.begin(), Args.end(),
+                                            II->getName(), BI);
+            CI->setCallingConv(II->getCallingConv());
+            CI->setAttributes(II->getAttributes());
+            // If the invoke produced a value, the Call does now instead.
+            II->replaceAllUsesWith(CI);
+            delete II;
+            Changed = true;
+          }
+        }
+      }
+
+      // If this block is now dead, remove it.
+      if (pred_begin(BB) == pred_end(BB)) {
+        // We know there are no successors, so just nuke the block.
+        M->getBasicBlockList().erase(BB);
+        return true;
+      }
+    }
+  }
+
+  // Merge basic blocks into their predecessor if there is only one distinct
+  // pred, and if there is only one distinct successor of the predecessor, and
+  // if there are no PHI nodes.
+  //
+  if (MergeBlockIntoPredecessor(BB))
+    return true;
+
+  // Otherwise, if this block only has a single predecessor, and if that block
+  // is a conditional branch, see if we can hoist any code from this block up
+  // into our predecessor.
+  pred_iterator PI(pred_begin(BB)), PE(pred_end(BB));
+  BasicBlock *OnlyPred = *PI++;
+  for (; PI != PE; ++PI)  // Search all predecessors, see if they are all same
+    if (*PI != OnlyPred) {
+      OnlyPred = 0;       // There are multiple different predecessors...
+      break;
+    }
+  
+  if (OnlyPred)
+    if (BranchInst *BI = dyn_cast<BranchInst>(OnlyPred->getTerminator()))
+      if (BI->isConditional()) {
+        // Get the other block.
+        BasicBlock *OtherBB = BI->getSuccessor(BI->getSuccessor(0) == BB);
+        PI = pred_begin(OtherBB);
+        ++PI;
+        
+        if (PI == pred_end(OtherBB)) {
+          // We have a conditional branch to two blocks that are only reachable
+          // from the condbr.  We know that the condbr dominates the two blocks,
+          // so see if there is any identical code in the "then" and "else"
+          // blocks.  If so, we can hoist it up to the branching block.
+          Changed |= HoistThenElseCodeToIf(BI);
+        } else {
+          BasicBlock* OnlySucc = NULL;
+          for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB);
+               SI != SE; ++SI) {
+            if (!OnlySucc)
+              OnlySucc = *SI;
+            else if (*SI != OnlySucc) {
+              OnlySucc = 0;     // There are multiple distinct successors!
+              break;
+            }
+          }
+
+          if (OnlySucc == OtherBB) {
+            // If BB's only successor is the other successor of the predecessor,
+            // i.e. a triangle, see if we can hoist any code from this block up
+            // to the "if" block.
+            Changed |= SpeculativelyExecuteBB(BI, BB);
+          }
+        }
+      }
+
+  for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+    if (BranchInst *BI = dyn_cast<BranchInst>((*PI)->getTerminator()))
+      // Change br (X == 0 | X == 1), T, F into a switch instruction.
+      if (BI->isConditional() && isa<Instruction>(BI->getCondition())) {
+        Instruction *Cond = cast<Instruction>(BI->getCondition());
+        // If this is a bunch of seteq's or'd together, or if it's a bunch of
+        // 'setne's and'ed together, collect them.
+        Value *CompVal = 0;
+        std::vector<ConstantInt*> Values;
+        bool TrueWhenEqual = GatherValueComparisons(Cond, CompVal, Values);
+        if (CompVal && CompVal->getType()->isInteger()) {
+          // There might be duplicate constants in the list, which the switch
+          // instruction can't handle, remove them now.
+          std::sort(Values.begin(), Values.end(), ConstantIntOrdering());
+          Values.erase(std::unique(Values.begin(), Values.end()), Values.end());
+
+          // Figure out which block is which destination.
+          BasicBlock *DefaultBB = BI->getSuccessor(1);
+          BasicBlock *EdgeBB    = BI->getSuccessor(0);
+          if (!TrueWhenEqual) std::swap(DefaultBB, EdgeBB);
+
+          // Create the new switch instruction now.
+          SwitchInst *New = SwitchInst::Create(CompVal, DefaultBB,
+                                               Values.size(), BI);
+
+          // Add all of the 'cases' to the switch instruction.
+          for (unsigned i = 0, e = Values.size(); i != e; ++i)
+            New->addCase(Values[i], EdgeBB);
+
+          // We added edges from PI to the EdgeBB.  As such, if there were any
+          // PHI nodes in EdgeBB, they need entries to be added corresponding to
+          // the number of edges added.
+          for (BasicBlock::iterator BBI = EdgeBB->begin();
+               isa<PHINode>(BBI); ++BBI) {
+            PHINode *PN = cast<PHINode>(BBI);
+            Value *InVal = PN->getIncomingValueForBlock(*PI);
+            for (unsigned i = 0, e = Values.size()-1; i != e; ++i)
+              PN->addIncoming(InVal, *PI);
+          }
+
+          // Erase the old branch instruction.
+          EraseTerminatorInstAndDCECond(BI);
+          return true;
+        }
+      }
+
+  return Changed;
+}
diff --git a/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
new file mode 100644
index 0000000..848f2b8
--- /dev/null
+++ b/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
@@ -0,0 +1,139 @@
+//===- UnifyFunctionExitNodes.cpp - Make all functions have a single exit -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is used to ensure that functions have at most one return
+// instruction in them.  Additionally, it keeps track of which node is the new
+// exit node of the CFG.  If there are no exit nodes in the CFG, the getExitNode
+// method will return a null pointer.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Type.h"
+#include "llvm/ADT/StringExtras.h"
+using namespace llvm;
+
+char UnifyFunctionExitNodes::ID = 0;
+static RegisterPass<UnifyFunctionExitNodes>
+X("mergereturn", "Unify function exit nodes");
+
+Pass *llvm::createUnifyFunctionExitNodesPass() {
+  return new UnifyFunctionExitNodes();
+}
+
+void UnifyFunctionExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{
+  // We preserve the non-critical-edgeness property
+  AU.addPreservedID(BreakCriticalEdgesID);
+  // This is a cluster of orthogonal Transforms
+  AU.addPreservedID(PromoteMemoryToRegisterID);
+  AU.addPreservedID(LowerSwitchID);
+}
+
+// UnifyAllExitNodes - Unify all exit nodes of the CFG by creating a new
+// BasicBlock, and converting all returns to unconditional branches to this
+// new basic block.  The singular exit node is returned.
+//
+// If there are no return stmts in the Function, a null pointer is returned.
+//
+bool UnifyFunctionExitNodes::runOnFunction(Function &F) {
+  // Loop over all of the blocks in a function, tracking all of the blocks that
+  // return.
+  //
+  std::vector<BasicBlock*> ReturningBlocks;
+  std::vector<BasicBlock*> UnwindingBlocks;
+  std::vector<BasicBlock*> UnreachableBlocks;
+  for(Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
+    if (isa<ReturnInst>(I->getTerminator()))
+      ReturningBlocks.push_back(I);
+    else if (isa<UnwindInst>(I->getTerminator()))
+      UnwindingBlocks.push_back(I);
+    else if (isa<UnreachableInst>(I->getTerminator()))
+      UnreachableBlocks.push_back(I);
+
+  // Handle unwinding blocks first.
+  if (UnwindingBlocks.empty()) {
+    UnwindBlock = 0;
+  } else if (UnwindingBlocks.size() == 1) {
+    UnwindBlock = UnwindingBlocks.front();
+  } else {
+    UnwindBlock = BasicBlock::Create("UnifiedUnwindBlock", &F);
+    new UnwindInst(UnwindBlock);
+
+    for (std::vector<BasicBlock*>::iterator I = UnwindingBlocks.begin(),
+           E = UnwindingBlocks.end(); I != E; ++I) {
+      BasicBlock *BB = *I;
+      BB->getInstList().pop_back();  // Remove the unwind insn
+      BranchInst::Create(UnwindBlock, BB);
+    }
+  }
+
+  // Then unreachable blocks.
+  if (UnreachableBlocks.empty()) {
+    UnreachableBlock = 0;
+  } else if (UnreachableBlocks.size() == 1) {
+    UnreachableBlock = UnreachableBlocks.front();
+  } else {
+    UnreachableBlock = BasicBlock::Create("UnifiedUnreachableBlock", &F);
+    new UnreachableInst(UnreachableBlock);
+
+    for (std::vector<BasicBlock*>::iterator I = UnreachableBlocks.begin(),
+           E = UnreachableBlocks.end(); I != E; ++I) {
+      BasicBlock *BB = *I;
+      BB->getInstList().pop_back();  // Remove the unreachable inst.
+      BranchInst::Create(UnreachableBlock, BB);
+    }
+  }
+
+  // Now handle return blocks.
+  if (ReturningBlocks.empty()) {
+    ReturnBlock = 0;
+    return false;                          // No blocks return
+  } else if (ReturningBlocks.size() == 1) {
+    ReturnBlock = ReturningBlocks.front(); // Already has a single return block
+    return false;
+  }
+
+  // Otherwise, we need to insert a new basic block into the function, add a PHI
+  // nodes (if the function returns values), and convert all of the return
+  // instructions into unconditional branches.
+  //
+  BasicBlock *NewRetBlock = BasicBlock::Create("UnifiedReturnBlock", &F);
+
+  PHINode *PN = 0;
+  if (F.getReturnType() == Type::VoidTy) {
+    ReturnInst::Create(NULL, NewRetBlock);
+  } else {
+    // If the function doesn't return void... add a PHI node to the block...
+    PN = PHINode::Create(F.getReturnType(), "UnifiedRetVal");
+    NewRetBlock->getInstList().push_back(PN);
+    ReturnInst::Create(PN, NewRetBlock);
+  }
+
+  // Loop over all of the blocks, replacing the return instruction with an
+  // unconditional branch.
+  //
+  for (std::vector<BasicBlock*>::iterator I = ReturningBlocks.begin(),
+         E = ReturningBlocks.end(); I != E; ++I) {
+    BasicBlock *BB = *I;
+
+    // Add an incoming element to the PHI node for every return instruction that
+    // is merging into this new block...
+    if (PN)
+      PN->addIncoming(BB->getTerminator()->getOperand(0), BB);
+
+    BB->getInstList().pop_back();  // Remove the return insn
+    BranchInst::Create(NewRetBlock, BB);
+  }
+  ReturnBlock = NewRetBlock;
+  return true;
+}
diff --git a/lib/Transforms/Utils/UnrollLoop.cpp b/lib/Transforms/Utils/UnrollLoop.cpp
new file mode 100644
index 0000000..caef7ec
--- /dev/null
+++ b/lib/Transforms/Utils/UnrollLoop.cpp
@@ -0,0 +1,369 @@
+//===-- UnrollLoop.cpp - Loop unrolling utilities -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements some loop unrolling utilities. It does not define any
+// actual pass or policy, but provides a single function to perform loop
+// unrolling.
+//
+// It works best when loops have been canonicalized by the -indvars pass,
+// allowing it to determine the trip counts of loops easily.
+//
+// The process of unrolling can produce extraneous basic blocks linked with
+// unconditional branches.  This will be corrected in the future.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-unroll"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cstdio>
+
+using namespace llvm;
+
+// TODO: Should these be here or in LoopUnroll?
+STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled");
+STATISTIC(NumUnrolled,    "Number of loops unrolled (completely or otherwise)");
+
+/// RemapInstruction - Convert the instruction operands from referencing the
+/// current values into those specified by ValueMap.
+static inline void RemapInstruction(Instruction *I,
+                                    DenseMap<const Value *, Value*> &ValueMap) {
+  for (unsigned op = 0, E = I->getNumOperands(); op != E; ++op) {
+    Value *Op = I->getOperand(op);
+    DenseMap<const Value *, Value*>::iterator It = ValueMap.find(Op);
+    if (It != ValueMap.end()) Op = It->second;
+    I->setOperand(op, Op);
+  }
+}
+
+/// FoldBlockIntoPredecessor - Folds a basic block into its predecessor if it
+/// only has one predecessor, and that predecessor only has one successor.
+/// The LoopInfo Analysis that is passed will be kept consistent.
+/// Returns the new combined block.
+static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI) {
+  // Merge basic blocks into their predecessor if there is only one distinct
+  // pred, and if there is only one distinct successor of the predecessor, and
+  // if there are no PHI nodes.
+  BasicBlock *OnlyPred = BB->getSinglePredecessor();
+  if (!OnlyPred) return 0;
+
+  if (OnlyPred->getTerminator()->getNumSuccessors() != 1)
+    return 0;
+
+  DOUT << "Merging: " << *BB << "into: " << *OnlyPred;
+
+  // Resolve any PHI nodes at the start of the block.  They are all
+  // guaranteed to have exactly one entry if they exist, unless there are
+  // multiple duplicate (but guaranteed to be equal) entries for the
+  // incoming edges.  This occurs when there are multiple edges from
+  // OnlyPred to OnlySucc.
+  FoldSingleEntryPHINodes(BB);
+
+  // Delete the unconditional branch from the predecessor...
+  OnlyPred->getInstList().pop_back();
+
+  // Move all definitions in the successor to the predecessor...
+  OnlyPred->getInstList().splice(OnlyPred->end(), BB->getInstList());
+
+  // Make all PHI nodes that referred to BB now refer to Pred as their
+  // source...
+  BB->replaceAllUsesWith(OnlyPred);
+
+  std::string OldName = BB->getName();
+
+  // Erase basic block from the function...
+  LI->removeBlock(BB);
+  BB->eraseFromParent();
+
+  // Inherit predecessor's name if it exists...
+  if (!OldName.empty() && !OnlyPred->hasName())
+    OnlyPred->setName(OldName);
+
+  return OnlyPred;
+}
+
+/// Unroll the given loop by Count. The loop must be in LCSSA form. Returns true
+/// if unrolling was succesful, or false if the loop was unmodified. Unrolling
+/// can only fail when the loop's latch block is not terminated by a conditional
+/// branch instruction. However, if the trip count (and multiple) are not known,
+/// loop unrolling will mostly produce more code that is no faster.
+///
+/// The LoopInfo Analysis that is passed will be kept consistent.
+///
+/// If a LoopPassManager is passed in, and the loop is fully removed, it will be
+/// removed from the LoopPassManager as well. LPM can also be NULL.
+bool llvm::UnrollLoop(Loop *L, unsigned Count, LoopInfo* LI, LPPassManager* LPM) {
+  assert(L->isLCSSAForm());
+
+  BasicBlock *Header = L->getHeader();
+  BasicBlock *LatchBlock = L->getLoopLatch();
+  BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
+  
+  if (!BI || BI->isUnconditional()) {
+    // The loop-rotate pass can be helpful to avoid this in many cases.
+    DOUT << "  Can't unroll; loop not terminated by a conditional branch.\n";
+    return false;
+  }
+
+  // Find trip count
+  unsigned TripCount = L->getSmallConstantTripCount();
+  // Find trip multiple if count is not available
+  unsigned TripMultiple = 1;
+  if (TripCount == 0)
+    TripMultiple = L->getSmallConstantTripMultiple();
+
+  if (TripCount != 0)
+    DOUT << "  Trip Count = " << TripCount << "\n";
+  if (TripMultiple != 1)
+    DOUT << "  Trip Multiple = " << TripMultiple << "\n";
+
+  // Effectively "DCE" unrolled iterations that are beyond the tripcount
+  // and will never be executed.
+  if (TripCount != 0 && Count > TripCount)
+    Count = TripCount;
+
+  assert(Count > 0);
+  assert(TripMultiple > 0);
+  assert(TripCount == 0 || TripCount % TripMultiple == 0);
+
+  // Are we eliminating the loop control altogether?
+  bool CompletelyUnroll = Count == TripCount;
+
+  // If we know the trip count, we know the multiple...
+  unsigned BreakoutTrip = 0;
+  if (TripCount != 0) {
+    BreakoutTrip = TripCount % Count;
+    TripMultiple = 0;
+  } else {
+    // Figure out what multiple to use.
+    BreakoutTrip = TripMultiple =
+      (unsigned)GreatestCommonDivisor64(Count, TripMultiple);
+  }
+
+  if (CompletelyUnroll) {
+    DOUT << "COMPLETELY UNROLLING loop %" << Header->getName()
+         << " with trip count " << TripCount << "!\n";
+  } else {
+    DOUT << "UNROLLING loop %" << Header->getName()
+         << " by " << Count;
+    if (TripMultiple == 0 || BreakoutTrip != TripMultiple) {
+      DOUT << " with a breakout at trip " << BreakoutTrip;
+    } else if (TripMultiple != 1) {
+      DOUT << " with " << TripMultiple << " trips per branch";
+    }
+    DOUT << "!\n";
+  }
+
+  std::vector<BasicBlock*> LoopBlocks = L->getBlocks();
+
+  bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
+  BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
+
+  // For the first iteration of the loop, we should use the precloned values for
+  // PHI nodes.  Insert associations now.
+  typedef DenseMap<const Value*, Value*> ValueMapTy;
+  ValueMapTy LastValueMap;
+  std::vector<PHINode*> OrigPHINode;
+  for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
+    PHINode *PN = cast<PHINode>(I);
+    OrigPHINode.push_back(PN);
+    if (Instruction *I = 
+                dyn_cast<Instruction>(PN->getIncomingValueForBlock(LatchBlock)))
+      if (L->contains(I->getParent()))
+        LastValueMap[I] = I;
+  }
+
+  std::vector<BasicBlock*> Headers;
+  std::vector<BasicBlock*> Latches;
+  Headers.push_back(Header);
+  Latches.push_back(LatchBlock);
+
+  for (unsigned It = 1; It != Count; ++It) {
+    char SuffixBuffer[100];
+    sprintf(SuffixBuffer, ".%d", It);
+    
+    std::vector<BasicBlock*> NewBlocks;
+    
+    for (std::vector<BasicBlock*>::iterator BB = LoopBlocks.begin(),
+         E = LoopBlocks.end(); BB != E; ++BB) {
+      ValueMapTy ValueMap;
+      BasicBlock *New = CloneBasicBlock(*BB, ValueMap, SuffixBuffer);
+      Header->getParent()->getBasicBlockList().push_back(New);
+
+      // Loop over all of the PHI nodes in the block, changing them to use the
+      // incoming values from the previous block.
+      if (*BB == Header)
+        for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) {
+          PHINode *NewPHI = cast<PHINode>(ValueMap[OrigPHINode[i]]);
+          Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock);
+          if (Instruction *InValI = dyn_cast<Instruction>(InVal))
+            if (It > 1 && L->contains(InValI->getParent()))
+              InVal = LastValueMap[InValI];
+          ValueMap[OrigPHINode[i]] = InVal;
+          New->getInstList().erase(NewPHI);
+        }
+
+      // Update our running map of newest clones
+      LastValueMap[*BB] = New;
+      for (ValueMapTy::iterator VI = ValueMap.begin(), VE = ValueMap.end();
+           VI != VE; ++VI)
+        LastValueMap[VI->first] = VI->second;
+
+      L->addBasicBlockToLoop(New, LI->getBase());
+
+      // Add phi entries for newly created values to all exit blocks except
+      // the successor of the latch block.  The successor of the exit block will
+      // be updated specially after unrolling all the way.
+      if (*BB != LatchBlock)
+        for (Value::use_iterator UI = (*BB)->use_begin(), UE = (*BB)->use_end();
+             UI != UE;) {
+          Instruction *UseInst = cast<Instruction>(*UI);
+          ++UI;
+          if (isa<PHINode>(UseInst) && !L->contains(UseInst->getParent())) {
+            PHINode *phi = cast<PHINode>(UseInst);
+            Value *Incoming = phi->getIncomingValueForBlock(*BB);
+            phi->addIncoming(Incoming, New);
+          }
+        }
+
+      // Keep track of new headers and latches as we create them, so that
+      // we can insert the proper branches later.
+      if (*BB == Header)
+        Headers.push_back(New);
+      if (*BB == LatchBlock) {
+        Latches.push_back(New);
+
+        // Also, clear out the new latch's back edge so that it doesn't look
+        // like a new loop, so that it's amenable to being merged with adjacent
+        // blocks later on.
+        TerminatorInst *Term = New->getTerminator();
+        assert(L->contains(Term->getSuccessor(!ContinueOnTrue)));
+        assert(Term->getSuccessor(ContinueOnTrue) == LoopExit);
+        Term->setSuccessor(!ContinueOnTrue, NULL);
+      }
+
+      NewBlocks.push_back(New);
+    }
+    
+    // Remap all instructions in the most recent iteration
+    for (unsigned i = 0; i < NewBlocks.size(); ++i)
+      for (BasicBlock::iterator I = NewBlocks[i]->begin(),
+           E = NewBlocks[i]->end(); I != E; ++I)
+        RemapInstruction(I, LastValueMap);
+  }
+  
+  // The latch block exits the loop.  If there are any PHI nodes in the
+  // successor blocks, update them to use the appropriate values computed as the
+  // last iteration of the loop.
+  if (Count != 1) {
+    SmallPtrSet<PHINode*, 8> Users;
+    for (Value::use_iterator UI = LatchBlock->use_begin(),
+         UE = LatchBlock->use_end(); UI != UE; ++UI)
+      if (PHINode *phi = dyn_cast<PHINode>(*UI))
+        Users.insert(phi);
+    
+    BasicBlock *LastIterationBB = cast<BasicBlock>(LastValueMap[LatchBlock]);
+    for (SmallPtrSet<PHINode*,8>::iterator SI = Users.begin(), SE = Users.end();
+         SI != SE; ++SI) {
+      PHINode *PN = *SI;
+      Value *InVal = PN->removeIncomingValue(LatchBlock, false);
+      // If this value was defined in the loop, take the value defined by the
+      // last iteration of the loop.
+      if (Instruction *InValI = dyn_cast<Instruction>(InVal)) {
+        if (L->contains(InValI->getParent()))
+          InVal = LastValueMap[InVal];
+      }
+      PN->addIncoming(InVal, LastIterationBB);
+    }
+  }
+
+  // Now, if we're doing complete unrolling, loop over the PHI nodes in the
+  // original block, setting them to their incoming values.
+  if (CompletelyUnroll) {
+    BasicBlock *Preheader = L->getLoopPreheader();
+    for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) {
+      PHINode *PN = OrigPHINode[i];
+      PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader));
+      Header->getInstList().erase(PN);
+    }
+  }
+
+  // Now that all the basic blocks for the unrolled iterations are in place,
+  // set up the branches to connect them.
+  for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
+    // The original branch was replicated in each unrolled iteration.
+    BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator());
+
+    // The branch destination.
+    unsigned j = (i + 1) % e;
+    BasicBlock *Dest = Headers[j];
+    bool NeedConditional = true;
+
+    // For a complete unroll, make the last iteration end with a branch
+    // to the exit block.
+    if (CompletelyUnroll && j == 0) {
+      Dest = LoopExit;
+      NeedConditional = false;
+    }
+
+    // If we know the trip count or a multiple of it, we can safely use an
+    // unconditional branch for some iterations.
+    if (j != BreakoutTrip && (TripMultiple == 0 || j % TripMultiple != 0)) {
+      NeedConditional = false;
+    }
+
+    if (NeedConditional) {
+      // Update the conditional branch's successor for the following
+      // iteration.
+      Term->setSuccessor(!ContinueOnTrue, Dest);
+    } else {
+      Term->setUnconditionalDest(Dest);
+      // Merge adjacent basic blocks, if possible.
+      if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest, LI)) {
+        std::replace(Latches.begin(), Latches.end(), Dest, Fold);
+        std::replace(Headers.begin(), Headers.end(), Dest, Fold);
+      }
+    }
+  }
+  
+  // At this point, the code is well formed.  We now do a quick sweep over the
+  // inserted code, doing constant propagation and dead code elimination as we
+  // go.
+  const std::vector<BasicBlock*> &NewLoopBlocks = L->getBlocks();
+  for (std::vector<BasicBlock*>::const_iterator BB = NewLoopBlocks.begin(),
+       BBE = NewLoopBlocks.end(); BB != BBE; ++BB)
+    for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E; ) {
+      Instruction *Inst = I++;
+
+      if (isInstructionTriviallyDead(Inst))
+        (*BB)->getInstList().erase(Inst);
+      else if (Constant *C = ConstantFoldInstruction(Inst)) {
+        Inst->replaceAllUsesWith(C);
+        (*BB)->getInstList().erase(Inst);
+      }
+    }
+
+  NumCompletelyUnrolled += CompletelyUnroll;
+  ++NumUnrolled;
+  // Remove the loop from the LoopPassManager if it's completely removed.
+  if (CompletelyUnroll && LPM != NULL)
+    LPM->deleteLoopFromQueue(L);
+
+  // If we didn't completely unroll the loop, it should still be in LCSSA form.
+  if (!CompletelyUnroll)
+    assert(L->isLCSSAForm());
+
+  return true;
+}
diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp
new file mode 100644
index 0000000..20b676d
--- /dev/null
+++ b/lib/Transforms/Utils/ValueMapper.cpp
@@ -0,0 +1,143 @@
+//===- ValueMapper.cpp - Interface shared by lib/Transforms/Utils ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MapValue function, which is shared by various parts of
+// the lib/Transforms/Utils library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/Constants.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Instruction.h"
+#include "llvm/MDNode.h"
+#include "llvm/ADT/SmallVector.h"
+using namespace llvm;
+
+Value *llvm::MapValue(const Value *V, ValueMapTy &VM) {
+  Value *&VMSlot = VM[V];
+  if (VMSlot) return VMSlot;      // Does it exist in the map yet?
+  
+  // NOTE: VMSlot can be invalidated by any reference to VM, which can grow the
+  // DenseMap.  This includes any recursive calls to MapValue.
+
+  // Global values do not need to be seeded into the ValueMap if they are using
+  // the identity mapping.
+  if (isa<GlobalValue>(V) || isa<InlineAsm>(V))
+    return VMSlot = const_cast<Value*>(V);
+
+  if (Constant *C = const_cast<Constant*>(dyn_cast<Constant>(V))) {
+    if (isa<ConstantInt>(C) || isa<ConstantFP>(C) ||
+        isa<ConstantPointerNull>(C) || isa<ConstantAggregateZero>(C) ||
+        isa<UndefValue>(C) || isa<MDString>(C))
+      return VMSlot = C;           // Primitive constants map directly
+    else if (ConstantArray *CA = dyn_cast<ConstantArray>(C)) {
+      for (User::op_iterator b = CA->op_begin(), i = b, e = CA->op_end();
+           i != e; ++i) {
+        Value *MV = MapValue(*i, VM);
+        if (MV != *i) {
+          // This array must contain a reference to a global, make a new array
+          // and return it.
+          //
+          std::vector<Constant*> Values;
+          Values.reserve(CA->getNumOperands());
+          for (User::op_iterator j = b; j != i; ++j)
+            Values.push_back(cast<Constant>(*j));
+          Values.push_back(cast<Constant>(MV));
+          for (++i; i != e; ++i)
+            Values.push_back(cast<Constant>(MapValue(*i, VM)));
+          return VM[V] = ConstantArray::get(CA->getType(), Values);
+        }
+      }
+      return VM[V] = C;
+
+    } else if (ConstantStruct *CS = dyn_cast<ConstantStruct>(C)) {
+      for (User::op_iterator b = CS->op_begin(), i = b, e = CS->op_end();
+           i != e; ++i) {
+        Value *MV = MapValue(*i, VM);
+        if (MV != *i) {
+          // This struct must contain a reference to a global, make a new struct
+          // and return it.
+          //
+          std::vector<Constant*> Values;
+          Values.reserve(CS->getNumOperands());
+          for (User::op_iterator j = b; j != i; ++j)
+            Values.push_back(cast<Constant>(*j));
+          Values.push_back(cast<Constant>(MV));
+          for (++i; i != e; ++i)
+            Values.push_back(cast<Constant>(MapValue(*i, VM)));
+          return VM[V] = ConstantStruct::get(CS->getType(), Values);
+        }
+      }
+      return VM[V] = C;
+
+    } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
+      std::vector<Constant*> Ops;
+      for (User::op_iterator i = CE->op_begin(), e = CE->op_end(); i != e; ++i)
+        Ops.push_back(cast<Constant>(MapValue(*i, VM)));
+      return VM[V] = CE->getWithOperands(Ops);
+    } else if (ConstantVector *CP = dyn_cast<ConstantVector>(C)) {
+      for (User::op_iterator b = CP->op_begin(), i = b, e = CP->op_end();
+           i != e; ++i) {
+        Value *MV = MapValue(*i, VM);
+        if (MV != *i) {
+          // This vector value must contain a reference to a global, make a new
+          // vector constant and return it.
+          //
+          std::vector<Constant*> Values;
+          Values.reserve(CP->getNumOperands());
+          for (User::op_iterator j = b; j != i; ++j)
+            Values.push_back(cast<Constant>(*j));
+          Values.push_back(cast<Constant>(MV));
+          for (++i; i != e; ++i)
+            Values.push_back(cast<Constant>(MapValue(*i, VM)));
+          return VM[V] = ConstantVector::get(Values);
+        }
+      }
+      return VM[V] = C;
+      
+    } else if (MDNode *N = dyn_cast<MDNode>(C)) {
+      for (MDNode::const_elem_iterator b = N->elem_begin(), i = b,
+             e = N->elem_end(); i != e; ++i) {
+        if (!*i) continue;
+
+        Value *MV = MapValue(*i, VM);
+        if (MV != *i) {
+          // This MDNode must contain a reference to a global, make a new MDNode
+          // and return it.
+	  SmallVector<Value*, 8> Values;
+          Values.reserve(N->getNumElements());
+          for (MDNode::const_elem_iterator j = b; j != i; ++j)
+            Values.push_back(*j);
+          Values.push_back(MV);
+          for (++i; i != e; ++i)
+            Values.push_back(MapValue(*i, VM));
+          return VM[V] = MDNode::get(Values.data(), Values.size());
+        }
+      }
+      return VM[V] = C;
+
+    } else {
+      assert(0 && "Unknown type of constant!");
+    }
+  }
+
+  return 0;
+}
+
+/// RemapInstruction - Convert the instruction operands from referencing the
+/// current values into those specified by ValueMap.
+///
+void llvm::RemapInstruction(Instruction *I, ValueMapTy &ValueMap) {
+  for (User::op_iterator op = I->op_begin(), E = I->op_end(); op != E; ++op) {
+    Value *V = MapValue(*op, ValueMap);
+    assert(V && "Referenced value not in value map!");
+    *op = V;
+  }
+}
author	ed <ed@FreeBSD.org>	2009-06-02 17:52:33 +0000
committer	ed <ed@FreeBSD.org>	2009-06-02 17:52:33 +0000
commit	3277b69d734b9c90b44ebde4ede005717e2c3b2e (patch)
tree	64ba909838c23261cace781ece27d106134ea451 /lib/Transforms
download	FreeBSD-src-3277b69d734b9c90b44ebde4ede005717e2c3b2e.zip FreeBSD-src-3277b69d734b9c90b44ebde4ede005717e2c3b2e.tar.gz