Diffstat (limited to 'lib/Transforms/Scalar')
 lib/Transforms/Scalar/LICM.cpp                    | 115
 lib/Transforms/Scalar/LoopIdiomRecognize.cpp      | 182
 lib/Transforms/Scalar/MemCpyOptimizer.cpp         |  98
 lib/Transforms/Scalar/Reassociate.cpp             |  34
 lib/Transforms/Scalar/RewriteStatepointsForGC.cpp |  52
 5 files changed, 344 insertions, 137 deletions
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 6d70cdc..e01e23f 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -75,10 +75,12 @@ DisablePromotion("disable-licm-promotion", cl::Hidden,
                  cl::desc("Disable memory promotion in LICM pass"));

 static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
-static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop);
+static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
+                            const LICMSafetyInfo *SafetyInfo);
 static bool hoist(Instruction &I, BasicBlock *Preheader);
 static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
-                 const Loop *CurLoop, AliasSetTracker *CurAST );
+                 const Loop *CurLoop, AliasSetTracker *CurAST,
+                 const LICMSafetyInfo *SafetyInfo);
 static bool isGuaranteedToExecute(const Instruction &Inst,
                                   const DominatorTree *DT,
                                   const Loop *CurLoop,
@@ -92,10 +94,10 @@ static bool isSafeToExecuteUnconditionally(const Instruction &Inst,
 static bool pointerInvalidatedByLoop(Value *V, uint64_t Size,
                                      const AAMDNodes &AAInfo,
                                      AliasSetTracker *CurAST);
-static Instruction *CloneInstructionInExitBlock(const Instruction &I,
-                                                BasicBlock &ExitBlock,
-                                                PHINode &PN,
-                                                const LoopInfo *LI);
+static Instruction *
+CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
+                            const LoopInfo *LI,
+                            const LICMSafetyInfo *SafetyInfo);
 static bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA,
                                DominatorTree *DT, TargetLibraryInfo *TLI,
                                Loop *CurLoop, AliasSetTracker *CurAST,
@@ -348,10 +350,10 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
       // outside of the loop. In this case, it doesn't even matter if the
       // operands of the instruction are loop invariant.
       //
-      if (isNotUsedInLoop(I, CurLoop) &&
+      if (isNotUsedInLoop(I, CurLoop, SafetyInfo) &&
           canSinkOrHoistInst(I, AA, DT, TLI, CurLoop, CurAST, SafetyInfo)) {
         ++II;
-        Changed |= sink(I, LI, DT, CurLoop, CurAST);
+        Changed |= sink(I, LI, DT, CurLoop, CurAST, SafetyInfo);
       }
     }
   return Changed;
@@ -432,6 +434,14 @@ void llvm::computeLICMSafetyInfo(LICMSafetyInfo * SafetyInfo, Loop * CurLoop) {
     for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end();
          (I != E) && !SafetyInfo->MayThrow; ++I)
       SafetyInfo->MayThrow |= I->mayThrow();
+
+  // Compute funclet colors if we might sink/hoist in a function with a funclet
+  // personality routine.
+  Function *Fn = CurLoop->getHeader()->getParent();
+  if (Fn->hasPersonalityFn())
+    if (Constant *PersonalityFn = Fn->getPersonalityFn())
+      if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
+        SafetyInfo->BlockColors = colorEHFunclets(*Fn);
 }

 /// canSinkOrHoistInst - Return true if the hoister and sinker can handle this
@@ -466,6 +476,10 @@ bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, DominatorTree *DT,
     if (isa<DbgInfoIntrinsic>(I))
       return false;

+    // Don't sink calls which can throw.
+    if (CI->mayThrow())
+      return false;
+
     // Handle simple cases by querying alias analysis.
     FunctionModRefBehavior Behavior = AA->getModRefBehavior(CI);
     if (Behavior == FMRB_DoesNotAccessMemory)
@@ -534,10 +548,24 @@ static bool isTriviallyReplacablePHI(const PHINode &PN, const Instruction &I) {
 /// the loop. If this is true, we can sink the instruction to the exit
 /// blocks of the loop.
 ///
-static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop) {
+static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
+                            const LICMSafetyInfo *SafetyInfo) {
+  const auto &BlockColors = SafetyInfo->BlockColors;
   for (const User *U : I.users()) {
     const Instruction *UI = cast<Instruction>(U);
     if (const PHINode *PN = dyn_cast<PHINode>(UI)) {
+      const BasicBlock *BB = PN->getParent();
+      // We cannot sink uses in catchswitches.
+      if (isa<CatchSwitchInst>(BB->getTerminator()))
+        return false;
+
+      // We need to sink a callsite to a unique funclet. Avoid sinking if the
+      // phi use is too muddled.
+      if (isa<CallInst>(I))
+        if (!BlockColors.empty() &&
+            BlockColors.find(const_cast<BasicBlock *>(BB))->second.size() != 1)
+          return false;
+
       // A PHI node where all of the incoming values are this instruction are
       // special -- they can just be RAUW'ed with the instruction and thus
       // don't require a use in the predecessor. This is a particular important
@@ -565,11 +593,41 @@ static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop) {
   return true;
 }

-static Instruction *CloneInstructionInExitBlock(const Instruction &I,
-                                                BasicBlock &ExitBlock,
-                                                PHINode &PN,
-                                                const LoopInfo *LI) {
-  Instruction *New = I.clone();
+static Instruction *
+CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
+                            const LoopInfo *LI,
+                            const LICMSafetyInfo *SafetyInfo) {
+  Instruction *New;
+  if (auto *CI = dyn_cast<CallInst>(&I)) {
+    const auto &BlockColors = SafetyInfo->BlockColors;
+
+    // Sinking call-sites need to be handled differently from other
+    // instructions. The cloned call-site needs a funclet bundle operand
+    // appropriate for it's location in the CFG.
+    SmallVector<OperandBundleDef, 1> OpBundles;
+    for (unsigned BundleIdx = 0, BundleEnd = CI->getNumOperandBundles();
+         BundleIdx != BundleEnd; ++BundleIdx) {
+      OperandBundleUse Bundle = CI->getOperandBundleAt(BundleIdx);
+      if (Bundle.getTagID() == LLVMContext::OB_funclet)
+        continue;
+
+      OpBundles.emplace_back(Bundle);
+    }
+
+    if (!BlockColors.empty()) {
+      const ColorVector &CV = BlockColors.find(&ExitBlock)->second;
+      assert(CV.size() == 1 && "non-unique color for exit block!");
+      BasicBlock *BBColor = CV.front();
+      Instruction *EHPad = BBColor->getFirstNonPHI();
+      if (EHPad->isEHPad())
+        OpBundles.emplace_back("funclet", EHPad);
+    }
+
+    New = CallInst::Create(CI, OpBundles);
+  } else {
+    New = I.clone();
+  }
+
   ExitBlock.getInstList().insert(ExitBlock.getFirstInsertionPt(), New);
   if (!I.getName().empty())
     New->setName(I.getName() + ".le");
@@ -601,7 +659,8 @@ static Instruction *CloneInstructionInExitBlock(const Instruction &I,
 /// position, and may either delete it or move it to outside of the loop.
 ///
 static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
-                 const Loop *CurLoop, AliasSetTracker *CurAST ) {
+                 const Loop *CurLoop, AliasSetTracker *CurAST,
+                 const LICMSafetyInfo *SafetyInfo) {
   DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
   bool Changed = false;
   if (isa<LoadInst>(I)) ++NumMovedLoads;
@@ -652,7 +711,7 @@ static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
       New = It->second;
     else
       New = SunkCopies[ExitBlock] =
-          CloneInstructionInExitBlock(I, *ExitBlock, *PN, LI);
+          CloneInstructionInExitBlock(I, *ExitBlock, *PN, LI, SafetyInfo);

     PN->replaceAllUsesWith(New);
     PN->eraseFromParent();
@@ -950,6 +1009,21 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS,
   if (!GuaranteedToExecute)
     return Changed;

+  // Figure out the loop exits and their insertion points, if this is the
+  // first promotion.
+  if (ExitBlocks.empty()) {
+    CurLoop->getUniqueExitBlocks(ExitBlocks);
+    InsertPts.clear();
+    InsertPts.reserve(ExitBlocks.size());
+    for (BasicBlock *ExitBlock : ExitBlocks)
+      InsertPts.push_back(&*ExitBlock->getFirstInsertionPt());
+  }
+
+  // Can't insert into a catchswitch.
+  for (BasicBlock *ExitBlock : ExitBlocks)
+    if (isa<CatchSwitchInst>(ExitBlock->getTerminator()))
+      return Changed;
+
   // Otherwise, this is safe to promote, lets do it!
   DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " <<*SomePtr<<'\n');
   Changed = true;
@@ -961,15 +1035,6 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS,
   // location is better than none.
   DebugLoc DL = LoopUses[0]->getDebugLoc();

-  // Figure out the loop exits and their insertion points, if this is the
-  // first promotion.
-  if (ExitBlocks.empty()) {
-    CurLoop->getUniqueExitBlocks(ExitBlocks);
-    InsertPts.resize(ExitBlocks.size());
-    for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
-      InsertPts[i] = &*ExitBlocks[i]->getFirstInsertionPt();
-  }
-
   // We use the SSAUpdater interface to insert phi nodes as required.
   SmallVector<PHINode*, 16> NewPHIs;
   SSAUpdater SSA(&NewPHIs);
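The LICM change above lets the pass sink a call out of a loop even when the enclosing function uses funclet-based exception handling: SafetyInfo now carries the funclet block colors, and the clone created in the exit block receives a matching "funclet" operand bundle. A minimal sketch, assuming a call that is readnone and does not throw (the helper name below is invented for illustration and is not part of the patch):

    // Illustrative only; computeCheap is a hypothetical helper.
    extern int computeCheap(int) __attribute__((const)); // assumed readnone/nounwind

    int sinkLastValue(int N) {
      int Last = 0;
      for (int I = 0; I < N; ++I)
        Last = computeCheap(I); // the value only reaches the loop exit (via an
                                // LCSSA phi), so LICM may sink the call there
      return Last;
    }

If the exit block sits inside a catchpad or cleanuppad funclet, the cloned call now carries the funclet bundle for that pad; sinking is skipped when the phi lives in a catchswitch block or in a block whose funclet color is not unique.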
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 2d577de..4521640 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -108,7 +108,11 @@ public:

 private:
   typedef SmallVector<StoreInst *, 8> StoreList;
-  StoreList StoreRefs;
+  StoreList StoreRefsForMemset;
+  StoreList StoreRefsForMemcpy;
+  bool HasMemset;
+  bool HasMemsetPattern;
+  bool HasMemcpy;

   /// \name Countable Loop Idiom Handling
   /// @{
@@ -118,17 +122,15 @@ private:
                       SmallVectorImpl<BasicBlock *> &ExitBlocks);

   void collectStores(BasicBlock *BB);
-  bool isLegalStore(StoreInst *SI);
+  bool isLegalStore(StoreInst *SI, bool &ForMemset, bool &ForMemcpy);
   bool processLoopStore(StoreInst *SI, const SCEV *BECount);
   bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);

   bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
-                               unsigned StoreAlignment, Value *SplatValue,
+                               unsigned StoreAlignment, Value *StoredVal,
                                Instruction *TheStore, const SCEVAddRecExpr *Ev,
                                const SCEV *BECount, bool NegStride);
-  bool processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
-                                  const SCEVAddRecExpr *StoreEv,
-                                  const SCEV *BECount, bool NegStride);
+  bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);

   /// @}
   /// \name Noncountable Loop Idiom Handling
@@ -207,8 +209,13 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
                                *CurLoop->getHeader()->getParent());
   DL = &CurLoop->getHeader()->getModule()->getDataLayout();

-  if (SE->hasLoopInvariantBackedgeTakenCount(L))
-    return runOnCountableLoop();
+  HasMemset = TLI->has(LibFunc::memset);
+  HasMemsetPattern = TLI->has(LibFunc::memset_pattern16);
+  HasMemcpy = TLI->has(LibFunc::memcpy);
+
+  if (HasMemset || HasMemsetPattern || HasMemcpy)
+    if (SE->hasLoopInvariantBackedgeTakenCount(L))
+      return runOnCountableLoop();

   return runOnNoncountableLoop();
 }
@@ -297,7 +304,8 @@ static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
   return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
 }

-bool LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
+bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset,
+                                      bool &ForMemcpy) {
   // Don't touch volatile stores.
   if (!SI->isSimple())
     return false;
@@ -322,22 +330,86 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
   if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
     return false;

-  return true;
+  // See if the store can be turned into a memset.
+
+  // If the stored value is a byte-wise value (like i32 -1), then it may be
+  // turned into a memset of i8 -1, assuming that all the consecutive bytes
+  // are stored. A store of i32 0x01020304 can never be turned into a memset,
+  // but it can be turned into memset_pattern if the target supports it.
+  Value *SplatValue = isBytewiseValue(StoredVal);
+  Constant *PatternValue = nullptr;
+
+  // If we're allowed to form a memset, and the stored value would be
+  // acceptable for memset, use it.
+  if (HasMemset && SplatValue &&
+      // Verify that the stored value is loop invariant. If not, we can't
+      // promote the memset.
+      CurLoop->isLoopInvariant(SplatValue)) {
+    // It looks like we can use SplatValue.
+    ForMemset = true;
+    return true;
+  } else if (HasMemsetPattern &&
+             // Don't create memset_pattern16s with address spaces.
+             StorePtr->getType()->getPointerAddressSpace() == 0 &&
+             (PatternValue = getMemSetPatternValue(StoredVal, DL))) {
+    // It looks like we can use PatternValue!
+    ForMemset = true;
+    return true;
+  }
+
+  // Otherwise, see if the store can be turned into a memcpy.
+  if (HasMemcpy) {
+    // Check to see if the stride matches the size of the store. If so, then we
+    // know that every byte is touched in the loop.
+    unsigned Stride = getStoreStride(StoreEv);
+    unsigned StoreSize = getStoreSizeInBytes(SI, DL);
+    if (StoreSize != Stride && StoreSize != -Stride)
+      return false;
+
+    // The store must be feeding a non-volatile load.
+    LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
+    if (!LI || !LI->isSimple())
+      return false;
+
+    // See if the pointer expression is an AddRec like {base,+,1} on the current
+    // loop, which indicates a strided load. If we have something else, it's a
+    // random load we can't handle.
+    const SCEVAddRecExpr *LoadEv =
+        dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
+    if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
+      return false;
+
+    // The store and load must share the same stride.
+    if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
+      return false;
+
+    // Success. This store can be converted into a memcpy.
+    ForMemcpy = true;
+    return true;
+  }
+  // This store can't be transformed into a memset/memcpy.
+  return false;
 }

 void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
-  StoreRefs.clear();
+  StoreRefsForMemset.clear();
+  StoreRefsForMemcpy.clear();
   for (Instruction &I : *BB) {
     StoreInst *SI = dyn_cast<StoreInst>(&I);
     if (!SI)
       continue;

+    bool ForMemset = false;
+    bool ForMemcpy = false;
     // Make sure this is a strided store with a constant stride.
-    if (!isLegalStore(SI))
+    if (!isLegalStore(SI, ForMemset, ForMemcpy))
       continue;

     // Save the store locations.
-    StoreRefs.push_back(SI);
+    if (ForMemset)
+      StoreRefsForMemset.push_back(SI);
+    else if (ForMemcpy)
+      StoreRefsForMemcpy.push_back(SI);
   }
 }
@@ -357,9 +429,15 @@ bool LoopIdiomRecognize::runOnLoopBlock(
   bool MadeChange = false;
   // Look for store instructions, which may be optimized to memset/memcpy.
   collectStores(BB);
-  for (auto &SI : StoreRefs)
+
+  // Look for a single store which can be optimized into a memset.
+  for (auto &SI : StoreRefsForMemset)
     MadeChange |= processLoopStore(SI, BECount);

+  // Optimize the store into a memcpy, if it feeds an similarly strided load.
+  for (auto &SI : StoreRefsForMemcpy)
+    MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
+
   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
     Instruction *Inst = &*I++;
     // Look for memset instructions, which may be optimized to a larger memset.
@@ -380,7 +458,7 @@ bool LoopIdiomRecognize::runOnLoopBlock(
   return MadeChange;
 }

-/// processLoopStore - See if this store can be promoted to a memset or memcpy.
+/// processLoopStore - See if this store can be promoted to a memset.
 bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) {
   assert(SI->isSimple() && "Expected only non-volatile stores.");
@@ -398,12 +476,8 @@ bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) {
   bool NegStride = StoreSize == -Stride;

   // See if we can optimize just this store in isolation.
-  if (processLoopStridedStore(StorePtr, StoreSize, SI->getAlignment(),
-                              StoredVal, SI, StoreEv, BECount, NegStride))
-    return true;
-
-  // Optimize the store into a memcpy, if it feeds an similarly strided load.
-  return processLoopStoreOfLoopLoad(SI, StoreSize, StoreEv, BECount, NegStride);
+  return processLoopStridedStore(StorePtr, StoreSize, SI->getAlignment(),
+                                 StoredVal, SI, StoreEv, BECount, NegStride);
 }

 /// processLoopMemSet - See if this memset can be promoted to a large memset.
@@ -440,8 +514,14 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
   if (!Stride || MSI->getLength() != Stride->getValue())
     return false;

+  // Verify that the memset value is loop invariant. If not, we can't promote
+  // the memset.
+  Value *SplatValue = MSI->getValue();
+  if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
+    return false;
+
   return processLoopStridedStore(Pointer, (unsigned)SizeInBytes,
-                                 MSI->getAlignment(), MSI->getValue(), MSI, Ev,
+                                 MSI->getAlignment(), SplatValue, MSI, Ev,
                                  BECount, /*NegStride=*/false);
 }
@@ -496,37 +576,19 @@ bool LoopIdiomRecognize::processLoopStridedStore(
     Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment,
     Value *StoredVal, Instruction *TheStore, const SCEVAddRecExpr *Ev,
     const SCEV *BECount, bool NegStride) {
-
-  // If the stored value is a byte-wise value (like i32 -1), then it may be
-  // turned into a memset of i8 -1, assuming that all the consecutive bytes
-  // are stored. A store of i32 0x01020304 can never be turned into a memset,
-  // but it can be turned into memset_pattern if the target supports it.
   Value *SplatValue = isBytewiseValue(StoredVal);
   Constant *PatternValue = nullptr;
-  unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();

-  // If we're allowed to form a memset, and the stored value would be acceptable
-  // for memset, use it.
-  if (SplatValue && TLI->has(LibFunc::memset) &&
-      // Verify that the stored value is loop invariant. If not, we can't
-      // promote the memset.
-      CurLoop->isLoopInvariant(SplatValue)) {
-    // Keep and use SplatValue.
-    PatternValue = nullptr;
-  } else if (DestAS == 0 && TLI->has(LibFunc::memset_pattern16) &&
-             (PatternValue = getMemSetPatternValue(StoredVal, DL))) {
-    // Don't create memset_pattern16s with address spaces.
-    // It looks like we can use PatternValue!
-    SplatValue = nullptr;
-  } else {
-    // Otherwise, this isn't an idiom we can transform. For example, we can't
-    // do anything with a 3-byte store.
-    return false;
-  }
+  if (!SplatValue)
+    PatternValue = getMemSetPatternValue(StoredVal, DL);
+
+  assert((SplatValue || PatternValue) &&
+         "Expected either splat value or pattern value.");

   // The trip count of the loop and the base pointer of the addrec SCEV is
   // guaranteed to be loop invariant, which means that it should dominate the
   // header. This allows us to insert code for it in the preheader.
+  unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
   BasicBlock *Preheader = CurLoop->getLoopPreheader();
   IRBuilder<> Builder(Preheader->getTerminator());
   SCEVExpander Expander(*SE, *DL, "loop-idiom");
@@ -608,29 +670,25 @@ bool LoopIdiomRecognize::processLoopStridedStore(
 /// If the stored value is a strided load in the same loop with the same stride
 /// this may be transformable into a memcpy. This kicks in for stuff like
 ///   for (i) A[i] = B[i];
-bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
-    StoreInst *SI, unsigned StoreSize, const SCEVAddRecExpr *StoreEv,
-    const SCEV *BECount, bool NegStride) {
-  // If we're not allowed to form memcpy, we fail.
-  if (!TLI->has(LibFunc::memcpy))
-    return false;
+bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
+                                                    const SCEV *BECount) {
+  assert(SI->isSimple() && "Expected only non-volatile stores.");
+
+  Value *StorePtr = SI->getPointerOperand();
+  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+  unsigned Stride = getStoreStride(StoreEv);
+  unsigned StoreSize = getStoreSizeInBytes(SI, DL);
+  bool NegStride = StoreSize == -Stride;

   // The store must be feeding a non-volatile load.
-  LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
-  if (!LI || !LI->isSimple())
-    return false;
+  LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
+  assert(LI->isSimple() && "Expected only non-volatile stores.");

   // See if the pointer expression is an AddRec like {base,+,1} on the current
   // loop, which indicates a strided load. If we have something else, it's a
   // random load we can't handle.
   const SCEVAddRecExpr *LoadEv =
-      dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
-  if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
-    return false;
-
-  // The store and load must share the same stride.
-  if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
-    return false;
+      cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));

   // The trip count of the loop and the base pointer of the addrec SCEV is
   // guaranteed to be loop invariant, which means that it should dominate the
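The LoopIdiomRecognize refactoring above makes isLegalStore classify each candidate store up front and record it in StoreRefsForMemset or StoreRefsForMemcpy, instead of re-deriving the legality checks inside the processing routines. A rough sketch of the three store shapes being distinguished, based on the comments in the patch (function and variable names are invented for the example):

    void loopIdioms(int *A, const int *B, long N) {
      for (long I = 0; I < N; ++I)
        A[I] = -1;         // byte-wise splat value: memset candidate
      for (long I = 0; I < N; ++I)
        A[I] = 0x01020304; // not byte-wise: memset_pattern16 candidate, when the
                           // target library provides memset_pattern16
      for (long I = 0; I < N; ++I)
        A[I] = B[I];       // strided load feeding a same-stride store: memcpy candidate
    }

runOnLoop also checks the target library up front now: if none of memset, memset_pattern16, or memcpy is available, countable-loop processing is skipped entirely.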
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 0333bf2..7354016 100644
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -481,6 +481,17 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
   return AMemSet;
 }

+static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI,
+                                    const LoadInst *LI) {
+  unsigned StoreAlign = SI->getAlignment();
+  if (!StoreAlign)
+    StoreAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType());
+  unsigned LoadAlign = LI->getAlignment();
+  if (!LoadAlign)
+    LoadAlign = DL.getABITypeAlignment(LI->getType());
+
+  return std::min(StoreAlign, LoadAlign);
+}
 bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
   if (!SI->isSimple()) return false;

@@ -496,12 +507,84 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {

   const DataLayout &DL = SI->getModule()->getDataLayout();

-  // Detect cases where we're performing call slot forwarding, but
-  // happen to be using a load-store pair to implement it, rather than
-  // a memcpy.
+  // Load to store forwarding can be interpreted as memcpy.
   if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) {
     if (LI->isSimple() && LI->hasOneUse() &&
         LI->getParent() == SI->getParent()) {
+
+      auto *T = LI->getType();
+      if (T->isAggregateType()) {
+        AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+        MemoryLocation LoadLoc = MemoryLocation::get(LI);
+
+        // We use alias analysis to check if an instruction may store to
+        // the memory we load from in between the load and the store. If
+        // such an instruction is found, we try to promote there instead
+        // of at the store position.
+        Instruction *P = SI;
+        for (BasicBlock::iterator I = ++LI->getIterator(), E = SI->getIterator();
+             I != E; ++I) {
+          if (!(AA.getModRefInfo(&*I, LoadLoc) & MRI_Mod))
+            continue;
+
+          // We found an instruction that may write to the loaded memory.
+          // We can try to promote at this position instead of the store
+          // position if nothing alias the store memory after this.
+          P = &*I;
+          for (; I != E; ++I) {
+            MemoryLocation StoreLoc = MemoryLocation::get(SI);
+            if (AA.getModRefInfo(&*I, StoreLoc) != MRI_NoModRef) {
+              DEBUG(dbgs() << "Alias " << *I << "\n");
+              P = nullptr;
+              break;
+            }
+          }
+
+          break;
+        }
+
+        // If a valid insertion position is found, then we can promote
+        // the load/store pair to a memcpy.
+        if (P) {
+          // If we load from memory that may alias the memory we store to,
+          // memmove must be used to preserve semantic. If not, memcpy can
+          // be used.
+          bool UseMemMove = false;
+          if (!AA.isNoAlias(MemoryLocation::get(SI), LoadLoc))
+            UseMemMove = true;
+
+          unsigned Align = findCommonAlignment(DL, SI, LI);
+          uint64_t Size = DL.getTypeStoreSize(T);
+
+          IRBuilder<> Builder(P);
+          Instruction *M;
+          if (UseMemMove)
+            M = Builder.CreateMemMove(SI->getPointerOperand(),
+                                      LI->getPointerOperand(), Size,
+                                      Align, SI->isVolatile());
+          else
+            M = Builder.CreateMemCpy(SI->getPointerOperand(),
+                                     LI->getPointerOperand(), Size,
+                                     Align, SI->isVolatile());
+
+          DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI
+                       << " => " << *M << "\n");
+
+          MD->removeInstruction(SI);
+          SI->eraseFromParent();
+          MD->removeInstruction(LI);
+          LI->eraseFromParent();
+          ++NumMemCpyInstr;
+
+          // Make sure we do not invalidate the iterator.
+          BBI = M->getIterator();
+          return true;
+        }
+      }
+
+      // Detect cases where we're performing call slot forwarding, but
+      // happen to be using a load-store pair to implement it, rather than
+      // a memcpy.
       MemDepResult ldep = MD->getDependency(LI);
       CallInst *C = nullptr;
       if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst()))
@@ -522,18 +605,11 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
       }

       if (C) {
-        unsigned storeAlign = SI->getAlignment();
-        if (!storeAlign)
-          storeAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType());
-        unsigned loadAlign = LI->getAlignment();
-        if (!loadAlign)
-          loadAlign = DL.getABITypeAlignment(LI->getType());
-
         bool changed = performCallSlotOptzn(
             LI, SI->getPointerOperand()->stripPointerCasts(),
             LI->getPointerOperand()->stripPointerCasts(),
             DL.getTypeStoreSize(SI->getOperand(0)->getType()),
-            std::min(storeAlign, loadAlign), C);
+            findCommonAlignment(DL, SI, LI), C);
         if (changed) {
           MD->removeInstruction(SI);
           SI->eraseFromParent();
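The MemCpyOptimizer change above teaches processStore to treat a load/store pair of an aggregate type as a copy: it scans the instructions between the load and the store with alias analysis, picks a safe insertion point, and emits a memcpy, or a memmove when the source and destination may alias. A hedged sketch of a source pattern that can produce such a pair; whether a frontend actually emits an aggregate load and store here depends on the frontend and the type:

    // Illustrative only.
    struct Packet { char Bytes[64]; };

    void forward(Packet *Dst, const Packet *Src) {
      *Dst = *Src; // may be lowered to an aggregate load feeding an aggregate
                   // store, which this change turns into llvm.memcpy/llvm.memmove
    }

The new findCommonAlignment helper also replaces the inline min-of-alignments computation in the call-slot forwarding path below it.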
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index fb970c7..401a740 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -183,6 +183,8 @@ namespace {
     Value *OptimizeMul(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops);
     Value *RemoveFactorFromExpression(Value *V, Value *Factor);
     void EraseInst(Instruction *I);
+    void RecursivelyEraseDeadInsts(Instruction *I,
+                                   SetVector<AssertingVH<Instruction>> &Insts);
     void OptimizeInst(Instruction *I);
     Instruction *canonicalizeNegConstExpr(Instruction *I);
   };
@@ -1926,6 +1928,22 @@ Value *Reassociate::OptimizeExpression(BinaryOperator *I,
   return nullptr;
 }

+// Remove dead instructions and if any operands are trivially dead add them to
+// Insts so they will be removed as well.
+void Reassociate::RecursivelyEraseDeadInsts(
+    Instruction *I, SetVector<AssertingVH<Instruction>> &Insts) {
+  assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
+  SmallVector<Value *, 4> Ops(I->op_begin(), I->op_end());
+  ValueRankMap.erase(I);
+  Insts.remove(I);
+  RedoInsts.remove(I);
+  I->eraseFromParent();
+  for (auto Op : Ops)
+    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
+      if (OpInst->use_empty())
+        Insts.insert(OpInst);
+}
+
 /// Zap the given instruction, adding interesting operands to the work list.
 void Reassociate::EraseInst(Instruction *I) {
   assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
@@ -2255,7 +2273,21 @@ bool Reassociate::runOnFunction(Function &F) {
       ++II;
     }

-    // If this produced extra instructions to optimize, handle them now.
+    // Make a copy of all the instructions to be redone so we can remove dead
+    // instructions.
+    SetVector<AssertingVH<Instruction>> ToRedo(RedoInsts);
+    // Iterate over all instructions to be reevaluated and remove trivially dead
+    // instructions. If any operand of the trivially dead instruction becomes
+    // dead mark it for deletion as well. Continue this process until all
+    // trivially dead instructions have been removed.
+    while (!ToRedo.empty()) {
+      Instruction *I = ToRedo.pop_back_val();
+      if (isInstructionTriviallyDead(I))
+        RecursivelyEraseDeadInsts(I, ToRedo);
+    }
+
+    // Now that we have removed dead instructions, we can reoptimize the
+    // remaining instructions.
     while (!RedoInsts.empty()) {
       Instruction *I = RedoInsts.pop_back_val();
       if (isInstructionTriviallyDead(I))
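The Reassociate change adds RecursivelyEraseDeadInsts and a pre-pass in runOnFunction that drains a copy of RedoInsts, deleting trivially dead instructions and queueing any operands that become dead in turn, before the remaining instructions are reoptimized. A minimal, self-contained sketch of that worklist scheme (plain C++ on a toy node type, not the LLVM API):

    #include <set>
    #include <vector>

    struct Node {
      std::vector<Node *> Operands;
      unsigned Uses = 0; // how many other nodes still use this one
    };

    // Drop N, then queue any operand whose last use just disappeared.
    static void eraseDead(Node *N, std::set<Node *> &Worklist) {
      for (Node *Op : N->Operands)
        if (Op->Uses > 0 && --Op->Uses == 0)
          Worklist.insert(Op); // operand became trivially dead
      Worklist.erase(N);       // the real pass also erases the instruction itself
    }

    static void eraseAllDead(std::set<Node *> &Worklist) {
      while (!Worklist.empty())
        eraseDead(*Worklist.begin(), Worklist); // may add newly dead operands
    }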
diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index db127c3..5d253be 100644
--- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -428,30 +428,15 @@ static BaseDefiningValueResult findBaseDefiningValue(Value *I) {
     // We should have never reached here if this argument isn't an gc value
     return BaseDefiningValueResult(I, true);

-  if (isa<GlobalVariable>(I))
-    // base case
+  if (isa<Constant>(I))
+    // We assume that objects with a constant base (e.g. a global) can't move
+    // and don't need to be reported to the collector because they are always
+    // live. All constants have constant bases. Besides global references, all
+    // kinds of constants (e.g. undef, constant expressions, null pointers) can
+    // be introduced by the inliner or the optimizer, especially on dynamically
+    // dead paths. See e.g. test4 in constants.ll.
     return BaseDefiningValueResult(I, true);

-  // inlining could possibly introduce phi node that contains
-  // undef if callee has multiple returns
-  if (isa<UndefValue>(I))
-    // utterly meaningless, but useful for dealing with
-    // partially optimized code.
-    return BaseDefiningValueResult(I, true);
-
-  // Due to inheritance, this must be _after_ the global variable and undef
-  // checks
-  if (isa<Constant>(I)) {
-    assert(!isa<GlobalVariable>(I) && !isa<UndefValue>(I) &&
-           "order of checks wrong!");
-    // Note: Even for frontends which don't have constant references, we can
-    // see constants appearing after optimizations. A simple example is
-    // specialization of an address computation on null feeding into a merge
-    // point where the actual use of the now-constant input is protected by
-    // another null check. (e.g. test4 in constants.ll)
-    return BaseDefiningValueResult(I, true);
-  }
-
   if (CastInst *CI = dyn_cast<CastInst>(I)) {
     Value *Def = CI->stripPointerCasts();
     // If stripping pointer casts changes the address space there is an
@@ -1642,33 +1627,24 @@ insertRelocationStores(iterator_range<Value::user_iterator> GCRelocs,
                        DenseSet<Value *> &VisitedLiveValues) {

   for (User *U : GCRelocs) {
-    if (!isa<IntrinsicInst>(U))
+    GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U);
+    if (!Relocate)
       continue;

-    IntrinsicInst *RelocatedValue = cast<IntrinsicInst>(U);
-
-    // We only care about relocates
-    if (RelocatedValue->getIntrinsicID() !=
-        Intrinsic::experimental_gc_relocate) {
-      continue;
-    }
-
-    GCRelocateOperands RelocateOperands(RelocatedValue);
-    Value *OriginalValue =
-        const_cast<Value *>(RelocateOperands.getDerivedPtr());
+    Value *OriginalValue = const_cast<Value *>(Relocate->getDerivedPtr());
     assert(AllocaMap.count(OriginalValue));
     Value *Alloca = AllocaMap[OriginalValue];

     // Emit store into the related alloca
     // All gc_relocates are i8 addrspace(1)* typed, and it must be bitcasted to
     // the correct type according to alloca.
-    assert(RelocatedValue->getNextNode() &&
+    assert(Relocate->getNextNode() &&
            "Should always have one since it's not a terminator");
-    IRBuilder<> Builder(RelocatedValue->getNextNode());
+    IRBuilder<> Builder(Relocate->getNextNode());
     Value *CastedRelocatedValue =
-        Builder.CreateBitCast(RelocatedValue,
+        Builder.CreateBitCast(Relocate,
                               cast<AllocaInst>(Alloca)->getAllocatedType(),
-                              suffixed_name_or(RelocatedValue, ".casted", ""));
+                              suffixed_name_or(Relocate, ".casted", ""));

     StoreInst *Store = new StoreInst(CastedRelocatedValue, Alloca);
     Store->insertAfter(cast<Instruction>(CastedRelocatedValue));