Diffstat (limited to 'contrib/llvm/lib/Transforms')
-rw-r--r--  contrib/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp  134
-rw-r--r--  contrib/llvm/lib/Transforms/Coroutines/CoroEarly.cpp  218
-rw-r--r--  contrib/llvm/lib/Transforms/Coroutines/CoroElide.cpp  317
-rw-r--r--  contrib/llvm/lib/Transforms/Coroutines/CoroFrame.cpp  727
-rw-r--r--  contrib/llvm/lib/Transforms/Coroutines/CoroInstr.h  318
-rw-r--r--  contrib/llvm/lib/Transforms/Coroutines/CoroInternal.h  107
-rw-r--r--  contrib/llvm/lib/Transforms/Coroutines/CoroSplit.cpp  640
-rw-r--r--  contrib/llvm/lib/Transforms/Coroutines/Coroutines.cpp  314
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/AlwaysInliner.cpp  158
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp  19
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp  2
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp  10
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp  103
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp  559
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp  54
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp  48
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/GlobalSplit.cpp  171
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/IPO.cpp  8
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp  2
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp  103
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp  46
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/Inliner.cpp  663
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/Internalize.cpp  2
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/LowerTypeTests.cpp  975
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp  1217
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp  239
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp  138
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/PruneEH.cpp  5
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp  294
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp  29
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp  344
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp  187
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp  149
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp  354
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp  810
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp  381
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp  4391
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h  256
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp  111
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp  173
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp  38
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp  537
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp  365
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp  319
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp  203
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp  288
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp  913
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h  8
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp  51
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp  75
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp  40
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp  161
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp  57
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp  456
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp  319
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp  111
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h  9
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp  37
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp  16
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp  4
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/ADCE.cpp  623
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp  10
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/BDCE.cpp  8
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp  4
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp  96
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/DCE.cpp  2
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp  277
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp  193
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp  45
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/GVN.cpp  153
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp  560
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp  3
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/IVUsersPrinter.cpp  22
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp  399
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp  175
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp  82
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LICM.cpp  367
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp  7
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp  25
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp  199
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp  15
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp  109
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp  76
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp  20
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp  250
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp  47
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp  85
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp  135
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp  34
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp  18
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp  335
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp  640
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp  439
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp  94
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp  85
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp  36
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp  30
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp  47
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp  7
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp  211
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp  2257
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp  2
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp  42
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp  209
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SCCP.cpp  181
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SROA.cpp  221
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Scalar.cpp  25
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp  79
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp  15
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp  2
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Sink.cpp  2
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp  114
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp  24
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp  274
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp  4
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp  100
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp  52
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp  18
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp  1
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp  35
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/CloneModule.cpp  5
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/CmpInstAnalysis.cpp  76
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp  147
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp  10
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/EscapeEnumerator.cpp  96
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/Evaluator.cpp  18
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp  9
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/FunctionComparator.cpp  919
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp  93
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp  7
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp  203
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp  216
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LCSSA.cpp  59
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp  571
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/Local.cpp  180
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp  55
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp  198
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp  414
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp  40
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp  174
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp  6
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp  49
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp  8
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp  2
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/MemorySSA.cpp  1926
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp  104
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp (renamed from contrib/llvm/lib/Transforms/Utils/NameAnonFunctions.cpp)  63
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp  2
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp  2
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp  839
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp  41
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp  148
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/StripGCRelocates.cpp  80
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp  42
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/SymbolRewriter.cpp  80
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/Utils.cpp  7
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp  22
-rw-r--r--  contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp  2
-rw-r--r--  contrib/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp  511
-rw-r--r--  contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp  3014
-rw-r--r--  contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp  463
161 files changed, 27740 insertions, 11564 deletions
diff --git a/contrib/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp b/contrib/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp
new file mode 100644
index 0000000..a97db6f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp
@@ -0,0 +1,134 @@
+//===- CoroCleanup.cpp - Coroutine Cleanup Pass ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass lowers all remaining coroutine intrinsics.
+//===----------------------------------------------------------------------===//
+
+#include "CoroInternal.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "coro-cleanup"
+
+namespace {
+// Created on demand if CoroCleanup pass has work to do.
+struct Lowerer : coro::LowererBase {
+ IRBuilder<> Builder;
+ Lowerer(Module &M) : LowererBase(M), Builder(Context) {}
+ bool lowerRemainingCoroIntrinsics(Function &F);
+};
+}
+
+static void simplifyCFG(Function &F) {
+ llvm::legacy::FunctionPassManager FPM(F.getParent());
+ FPM.add(createCFGSimplificationPass());
+
+ FPM.doInitialization();
+ FPM.run(F);
+ FPM.doFinalization();
+}
+
+static void lowerSubFn(IRBuilder<> &Builder, CoroSubFnInst *SubFn) {
+ Builder.SetInsertPoint(SubFn);
+ Value *FrameRaw = SubFn->getFrame();
+ int Index = SubFn->getIndex();
+
+ auto *FrameTy = StructType::get(
+ SubFn->getContext(), {Builder.getInt8PtrTy(), Builder.getInt8PtrTy()});
+ PointerType *FramePtrTy = FrameTy->getPointerTo();
+
+ Builder.SetInsertPoint(SubFn);
+ auto *FramePtr = Builder.CreateBitCast(FrameRaw, FramePtrTy);
+ auto *Gep = Builder.CreateConstInBoundsGEP2_32(FrameTy, FramePtr, 0, Index);
+ auto *Load = Builder.CreateLoad(Gep);
+
+ SubFn->replaceAllUsesWith(Load);
+}
+
+bool Lowerer::lowerRemainingCoroIntrinsics(Function &F) {
+ bool Changed = false;
+
+ for (auto IB = inst_begin(F), E = inst_end(F); IB != E;) {
+ Instruction &I = *IB++;
+ if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+ switch (II->getIntrinsicID()) {
+ default:
+ continue;
+ case Intrinsic::coro_begin:
+ II->replaceAllUsesWith(II->getArgOperand(1));
+ break;
+ case Intrinsic::coro_free:
+ II->replaceAllUsesWith(II->getArgOperand(1));
+ break;
+ case Intrinsic::coro_alloc:
+ II->replaceAllUsesWith(ConstantInt::getTrue(Context));
+ break;
+ case Intrinsic::coro_id:
+ II->replaceAllUsesWith(ConstantTokenNone::get(Context));
+ break;
+ case Intrinsic::coro_subfn_addr:
+ lowerSubFn(Builder, cast<CoroSubFnInst>(II));
+ break;
+ }
+ II->eraseFromParent();
+ Changed = true;
+ }
+ }
+
+ if (Changed) {
+ // After replacements were made we can clean up the function body a little.
+ simplifyCFG(F);
+ }
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Top Level Driver
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+struct CoroCleanup : FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+
+ CoroCleanup() : FunctionPass(ID) {}
+
+ std::unique_ptr<Lowerer> L;
+
+ // This pass has work to do only if we find intrinsics we are going to lower
+ // in the module.
+ bool doInitialization(Module &M) override {
+ if (coro::declaresIntrinsics(M, {"llvm.coro.alloc", "llvm.coro.begin",
+ "llvm.coro.subfn.addr", "llvm.coro.free",
+ "llvm.coro.id"}))
+ L = llvm::make_unique<Lowerer>(M);
+ return false;
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (L)
+ return L->lowerRemainingCoroIntrinsics(F);
+ return false;
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ if (!L)
+ AU.setPreservesAll();
+ }
+};
+}
+
+char CoroCleanup::ID = 0;
+INITIALIZE_PASS(CoroCleanup, "coro-cleanup",
+ "Lower all coroutine related intrinsics", false, false)
+
+Pass *llvm::createCoroCleanupPass() { return new CoroCleanup(); }
diff --git a/contrib/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/contrib/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
new file mode 100644
index 0000000..e8bb0ca
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
@@ -0,0 +1,218 @@
+//===- CoroEarly.cpp - Coroutine Early Function Pass ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass lowers coroutine intrinsics that hide the details of the exact
+// calling convention for coroutine resume and destroy functions and details of
+// the structure of the coroutine frame.
+//===----------------------------------------------------------------------===//
+
+#include "CoroInternal.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "coro-early"
+
+namespace {
+// Created on demand if CoroEarly pass has work to do.
+class Lowerer : public coro::LowererBase {
+ IRBuilder<> Builder;
+ PointerType *const AnyResumeFnPtrTy;
+
+ void lowerResumeOrDestroy(CallSite CS, CoroSubFnInst::ResumeKind);
+ void lowerCoroPromise(CoroPromiseInst *Intrin);
+ void lowerCoroDone(IntrinsicInst *II);
+
+public:
+ Lowerer(Module &M)
+ : LowererBase(M), Builder(Context),
+ AnyResumeFnPtrTy(FunctionType::get(Type::getVoidTy(Context), Int8Ptr,
+ /*isVarArg=*/false)
+ ->getPointerTo()) {}
+ bool lowerEarlyIntrinsics(Function &F);
+};
+}
+
+// Replace a direct call to coro.resume or coro.destroy with an indirect call to
+// an address returned by coro.subfn.addr intrinsic. This is done so that
+// CGPassManager recognizes devirtualization when CoroElide pass replaces a call
+// to coro.subfn.addr with an appropriate function address.
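+//
+// An illustrative sketch (added for exposition, not part of the original
+// comment): a direct call such as
+//   call void @llvm.coro.resume(i8* %hdl)
+// becomes, roughly,
+//   %addr = call i8* @llvm.coro.subfn.addr(i8* %hdl, i8 0)
+//   %fn = bitcast i8* %addr to void (i8*)*
+//   call fastcc void %fn(i8* %hdl)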
+void Lowerer::lowerResumeOrDestroy(CallSite CS,
+ CoroSubFnInst::ResumeKind Index) {
+ Value *ResumeAddr =
+ makeSubFnCall(CS.getArgOperand(0), Index, CS.getInstruction());
+ CS.setCalledFunction(ResumeAddr);
+ CS.setCallingConv(CallingConv::Fast);
+}
+
+// The coroutine promise field is always at a fixed offset from the beginning
+// of the coroutine frame. The i8* coro.promise(i8*, i1 from) intrinsic adds
+// an offset to a passed pointer to move from the coroutine frame to the
+// coroutine promise and vice versa. Since we don't know exactly which
+// coroutine frame it is, we build a coroutine frame mock-up starting with two
+// function pointers, followed by a properly aligned coroutine promise field.
+// TODO: Handle the case when coroutine promise alloca has align override.
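+//
+// An illustrative sketch (added for exposition): on a typical 64-bit target
+// the mock-up frame is { void (i8*)*, void (i8*)*, <promise> }, so for an
+// 8-byte-aligned promise the offset is alignTo(16, 8) = 16 and the intrinsic
+// lowers to
+//   getelementptr inbounds i8, i8* %frame, i32 16
+// (or -16 when converting from the promise back to the frame).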
+void Lowerer::lowerCoroPromise(CoroPromiseInst *Intrin) {
+ Value *Operand = Intrin->getArgOperand(0);
+ unsigned Alignment = Intrin->getAlignment();
+ Type *Int8Ty = Builder.getInt8Ty();
+
+ auto *SampleStruct =
+ StructType::get(Context, {AnyResumeFnPtrTy, AnyResumeFnPtrTy, Int8Ty});
+ const DataLayout &DL = TheModule.getDataLayout();
+ int64_t Offset = alignTo(
+ DL.getStructLayout(SampleStruct)->getElementOffset(2), Alignment);
+ if (Intrin->isFromPromise())
+ Offset = -Offset;
+
+ Builder.SetInsertPoint(Intrin);
+ Value *Replacement =
+ Builder.CreateConstInBoundsGEP1_32(Int8Ty, Operand, Offset);
+
+ Intrin->replaceAllUsesWith(Replacement);
+ Intrin->eraseFromParent();
+}
+
+// When a coroutine reaches the final suspend point, it zeros out ResumeFnAddr in
+// the coroutine frame (it is UB to resume from a final suspend point).
+// The llvm.coro.done intrinsic is used to check whether a coroutine is
+// suspended at the final suspend point or not.
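+//
+// An illustrative sketch (added for exposition):
+//   %d = call i1 @llvm.coro.done(i8* %hdl)
+// lowers to roughly
+//   %fp = bitcast i8* %hdl to i8**
+//   %resume.addr = load i8*, i8** %fp
+//   %d = icmp eq i8* %resume.addr, null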
+void Lowerer::lowerCoroDone(IntrinsicInst *II) {
+ Value *Operand = II->getArgOperand(0);
+
+ // ResumeFnAddr is the first pointer sized element of the coroutine frame.
+ auto *FrameTy = Int8Ptr;
+ PointerType *FramePtrTy = FrameTy->getPointerTo();
+
+ Builder.SetInsertPoint(II);
+ auto *BCI = Builder.CreateBitCast(Operand, FramePtrTy);
+ auto *Gep = Builder.CreateConstInBoundsGEP1_32(FrameTy, BCI, 0);
+ auto *Load = Builder.CreateLoad(Gep);
+ auto *Cond = Builder.CreateICmpEQ(Load, NullPtr);
+
+ II->replaceAllUsesWith(Cond);
+ II->eraseFromParent();
+}
+
+// Prior to CoroSplit, calls to coro.begin need to be marked as NoDuplicate,
+// as CoroSplit assumes there is exactly one coro.begin. After CoroSplit, the
+// NoDuplicate attribute will be removed from coro.begin; otherwise, it would
+// interfere with inlining.
+static void setCannotDuplicate(CoroIdInst *CoroId) {
+ for (User *U : CoroId->users())
+ if (auto *CB = dyn_cast<CoroBeginInst>(U))
+ CB->setCannotDuplicate();
+}
+
+bool Lowerer::lowerEarlyIntrinsics(Function &F) {
+ bool Changed = false;
+ CoroIdInst *CoroId = nullptr;
+ SmallVector<CoroFreeInst *, 4> CoroFrees;
+ for (auto IB = inst_begin(F), IE = inst_end(F); IB != IE;) {
+ Instruction &I = *IB++;
+ if (auto CS = CallSite(&I)) {
+ switch (CS.getIntrinsicID()) {
+ default:
+ continue;
+ case Intrinsic::coro_free:
+ CoroFrees.push_back(cast<CoroFreeInst>(&I));
+ break;
+ case Intrinsic::coro_suspend:
+ // Make sure that final suspend point is not duplicated as CoroSplit
+ // pass expects that there is at most one final suspend point.
+ if (cast<CoroSuspendInst>(&I)->isFinal())
+ CS.setCannotDuplicate();
+ break;
+ case Intrinsic::coro_end:
+ // Make sure that fallthrough coro.end is not duplicated as CoroSplit
+ // pass expects that there is at most one fallthrough coro.end.
+ if (cast<CoroEndInst>(&I)->isFallthrough())
+ CS.setCannotDuplicate();
+ break;
+ case Intrinsic::coro_id:
+ // Mark a function that comes out of the frontend and has a coro.id
+ // with a coroutine attribute.
+ if (auto *CII = cast<CoroIdInst>(&I)) {
+ if (CII->getInfo().isPreSplit()) {
+ F.addFnAttr(CORO_PRESPLIT_ATTR, UNPREPARED_FOR_SPLIT);
+ setCannotDuplicate(CII);
+ CII->setCoroutineSelf();
+ CoroId = cast<CoroIdInst>(&I);
+ }
+ }
+ break;
+ case Intrinsic::coro_resume:
+ lowerResumeOrDestroy(CS, CoroSubFnInst::ResumeIndex);
+ break;
+ case Intrinsic::coro_destroy:
+ lowerResumeOrDestroy(CS, CoroSubFnInst::DestroyIndex);
+ break;
+ case Intrinsic::coro_promise:
+ lowerCoroPromise(cast<CoroPromiseInst>(&I));
+ break;
+ case Intrinsic::coro_done:
+ lowerCoroDone(cast<IntrinsicInst>(&I));
+ break;
+ }
+ Changed = true;
+ }
+ }
+ // Make sure that all CoroFrees reference the coro.id intrinsic.
+ // The token type is not exposed through the coroutine C/C++ builtins to
+ // plain C, so we allow specifying none and fix it up here.
+ if (CoroId)
+ for (CoroFreeInst *CF : CoroFrees)
+ CF->setArgOperand(0, CoroId);
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Top Level Driver
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+struct CoroEarly : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid.
+ CoroEarly() : FunctionPass(ID) {}
+
+ std::unique_ptr<Lowerer> L;
+
+ // This pass has work to do only if we find intrinsics we are going to lower
+ // in the module.
+ bool doInitialization(Module &M) override {
+ if (coro::declaresIntrinsics(M, {"llvm.coro.id", "llvm.coro.destroy",
+ "llvm.coro.done", "llvm.coro.end",
+ "llvm.coro.free", "llvm.coro.promise",
+ "llvm.coro.resume", "llvm.coro.suspend"}))
+ L = llvm::make_unique<Lowerer>(M);
+ return false;
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (!L)
+ return false;
+
+ return L->lowerEarlyIntrinsics(F);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ }
+};
+}
+
+char CoroEarly::ID = 0;
+INITIALIZE_PASS(CoroEarly, "coro-early", "Lower early coroutine intrinsics",
+ false, false)
+
+Pass *llvm::createCoroEarlyPass() { return new CoroEarly(); }
diff --git a/contrib/llvm/lib/Transforms/Coroutines/CoroElide.cpp b/contrib/llvm/lib/Transforms/Coroutines/CoroElide.cpp
new file mode 100644
index 0000000..99974d8
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Coroutines/CoroElide.cpp
@@ -0,0 +1,317 @@
+//===- CoroElide.cpp - Coroutine Frame Allocation Elision Pass ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass replaces dynamic allocation of coroutine frame with alloca and
+// replaces calls to llvm.coro.resume and llvm.coro.destroy with direct calls
+// to coroutine sub-functions.
+//===----------------------------------------------------------------------===//
+
+#include "CoroInternal.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "coro-elide"
+
+namespace {
+// Created on demand if CoroElide pass has work to do.
+struct Lowerer : coro::LowererBase {
+ SmallVector<CoroIdInst *, 4> CoroIds;
+ SmallVector<CoroBeginInst *, 1> CoroBegins;
+ SmallVector<CoroAllocInst *, 1> CoroAllocs;
+ SmallVector<CoroSubFnInst *, 4> ResumeAddr;
+ SmallVector<CoroSubFnInst *, 4> DestroyAddr;
+ SmallVector<CoroFreeInst *, 1> CoroFrees;
+
+ Lowerer(Module &M) : LowererBase(M) {}
+
+ void elideHeapAllocations(Function *F, Type *FrameTy, AAResults &AA);
+ bool shouldElide() const;
+ bool processCoroId(CoroIdInst *, AAResults &AA);
+};
+} // end anonymous namespace
+
+// Go through the list of coro.subfn.addr intrinsics and replace them with the
+// provided constant.
+static void replaceWithConstant(Constant *Value,
+ SmallVectorImpl<CoroSubFnInst *> &Users) {
+ if (Users.empty())
+ return;
+
+ // See if we need to bitcast the constant to match the type of the intrinsic
+ // being replaced. Note: All coro.subfn.addr intrinsics return the same type,
+ // so we only need to examine the type of the first one in the list.
+ Type *IntrTy = Users.front()->getType();
+ Type *ValueTy = Value->getType();
+ if (ValueTy != IntrTy) {
+ // May need to tweak the function type to match the type expected at the
+ // use site.
+ assert(ValueTy->isPointerTy() && IntrTy->isPointerTy());
+ Value = ConstantExpr::getBitCast(Value, IntrTy);
+ }
+
+ // Now the value type matches the type of the intrinsic. Replace them all!
+ for (CoroSubFnInst *I : Users)
+ replaceAndRecursivelySimplify(I, Value);
+}
+
+// See if any operand of the call instruction references the coroutine frame.
+static bool operandReferences(CallInst *CI, AllocaInst *Frame, AAResults &AA) {
+ for (Value *Op : CI->operand_values())
+ if (AA.alias(Op, Frame) != NoAlias)
+ return true;
+ return false;
+}
+
+// Look for any tail calls referencing the coroutine frame and remove the tail
+// attribute from them, since the coroutine frame now resides on the stack and
+// a tail call implies that the function does not reference anything on the
+// stack.
+static void removeTailCallAttribute(AllocaInst *Frame, AAResults &AA) {
+ Function &F = *Frame->getFunction();
+ MemoryLocation Mem(Frame);
+ for (Instruction &I : instructions(F))
+ if (auto *Call = dyn_cast<CallInst>(&I))
+ if (Call->isTailCall() && operandReferences(Call, Frame, AA)) {
+ // FIXME: If we ever hit this check, evaluate whether it is more
+ // appropriate to retain musttail and allow the code to compile.
+ if (Call->isMustTailCall())
+ report_fatal_error("Call referring to the coroutine frame cannot be "
+ "marked as musttail");
+ Call->setTailCall(false);
+ }
+}
+
+// Given a resume function @f.resume(%f.frame* %frame), returns %f.frame type.
+static Type *getFrameType(Function *Resume) {
+ auto *ArgType = Resume->getArgumentList().front().getType();
+ return cast<PointerType>(ArgType)->getElementType();
+}
+
+// Finds the first non-alloca instruction in the entry block of a function.
+static Instruction *getFirstNonAllocaInTheEntryBlock(Function *F) {
+ for (Instruction &I : F->getEntryBlock())
+ if (!isa<AllocaInst>(&I))
+ return &I;
+ llvm_unreachable("no terminator in the entry block");
+}
+
+// To elide heap allocations we need to suppress code blocks guarded by
+// llvm.coro.alloc and llvm.coro.free instructions.
+void Lowerer::elideHeapAllocations(Function *F, Type *FrameTy, AAResults &AA) {
+ LLVMContext &C = FrameTy->getContext();
+ auto *InsertPt =
+ getFirstNonAllocaInTheEntryBlock(CoroIds.front()->getFunction());
+
+ // Replacing llvm.coro.alloc with false will suppress dynamic
+ // allocation as it is expected for the frontend to generate the code that
+ // looks like:
+ // id = coro.id(...)
+ // mem = coro.alloc(id) ? malloc(coro.size()) : 0;
+ // coro.begin(id, mem)
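+ //
+ // Illustrative result (added for exposition): after coro.alloc is folded to
+ // false, the malloc/free path becomes dead and the frame is backed by the
+ // alloca created below, roughly:
+ //   %frame = alloca %f.Frame
+ //   %vFrame = bitcast %f.Frame* %frame to i8*
+ //   ; former uses of coro.begin now use %vFrame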
+ auto *False = ConstantInt::getFalse(C);
+ for (auto *CA : CoroAllocs) {
+ CA->replaceAllUsesWith(False);
+ CA->eraseFromParent();
+ }
+
+ // FIXME: Design how to transmit alignment information for every alloca that
+ // is spilled into the coroutine frame and recreate the alignment information
+ // here. Possibly we will need to do a mini SROA here and break the coroutine
+ // frame into individual AllocaInst recreating the original alignment.
+ auto *Frame = new AllocaInst(FrameTy, "", InsertPt);
+ auto *FrameVoidPtr =
+ new BitCastInst(Frame, Type::getInt8PtrTy(C), "vFrame", InsertPt);
+
+ for (auto *CB : CoroBegins) {
+ CB->replaceAllUsesWith(FrameVoidPtr);
+ CB->eraseFromParent();
+ }
+
+ // Since the coroutine frame now lives on the stack, we need to make sure
+ // that any tail call referencing it is made a non-tail call.
+ removeTailCallAttribute(Frame, AA);
+}
+
+bool Lowerer::shouldElide() const {
+ // If no CoroAllocs, we cannot suppress allocation, so elision is not
+ // possible.
+ if (CoroAllocs.empty())
+ return false;
+
+ // Check that for every coro.begin there is a coro.destroy directly
+ // referencing the SSA value of that coro.begin. If the value escaped, then
+ // coro.destroy would have been referencing a memory location storing that
+ // value and not the virtual register.
+
+ SmallPtrSet<CoroBeginInst *, 8> ReferencedCoroBegins;
+
+ for (CoroSubFnInst *DA : DestroyAddr) {
+ if (auto *CB = dyn_cast<CoroBeginInst>(DA->getFrame()))
+ ReferencedCoroBegins.insert(CB);
+ else
+ return false;
+ }
+
+ // If the size of the set is the same as the total number of CoroBegins, it
+ // means that we found a coro.free or coro.destroy referring to every
+ // coro.begin, so we can perform heap elision.
+ return ReferencedCoroBegins.size() == CoroBegins.size();
+}
+
+bool Lowerer::processCoroId(CoroIdInst *CoroId, AAResults &AA) {
+ CoroBegins.clear();
+ CoroAllocs.clear();
+ CoroFrees.clear();
+ ResumeAddr.clear();
+ DestroyAddr.clear();
+
+ // Collect all coro.begin and coro.allocs associated with this coro.id.
+ for (User *U : CoroId->users()) {
+ if (auto *CB = dyn_cast<CoroBeginInst>(U))
+ CoroBegins.push_back(CB);
+ else if (auto *CA = dyn_cast<CoroAllocInst>(U))
+ CoroAllocs.push_back(CA);
+ else if (auto *CF = dyn_cast<CoroFreeInst>(U))
+ CoroFrees.push_back(CF);
+ }
+
+ // Collect all coro.subfn.addrs associated with coro.begin.
+ // Note, we only devirtualize the calls if their coro.subfn.addr refers to
+ // coro.begin directly. If we run into cases where this check is too
+ // conservative, we can consider relaxing the check.
+ for (CoroBeginInst *CB : CoroBegins) {
+ for (User *U : CB->users())
+ if (auto *II = dyn_cast<CoroSubFnInst>(U))
+ switch (II->getIndex()) {
+ case CoroSubFnInst::ResumeIndex:
+ ResumeAddr.push_back(II);
+ break;
+ case CoroSubFnInst::DestroyIndex:
+ DestroyAddr.push_back(II);
+ break;
+ default:
+ llvm_unreachable("unexpected coro.subfn.addr constant");
+ }
+ }
+
+ // PostSplit coro.id refers to an array of subfunctions in its Info
+ // argument.
+ ConstantArray *Resumers = CoroId->getInfo().Resumers;
+ assert(Resumers && "PostSplit coro.id Info argument must refer to an array "
+ "of coroutine subfunctions");
+ auto *ResumeAddrConstant =
+ ConstantExpr::getExtractValue(Resumers, CoroSubFnInst::ResumeIndex);
+
+ replaceWithConstant(ResumeAddrConstant, ResumeAddr);
+
+ bool ShouldElide = shouldElide();
+
+ auto *DestroyAddrConstant = ConstantExpr::getExtractValue(
+ Resumers,
+ ShouldElide ? CoroSubFnInst::CleanupIndex : CoroSubFnInst::DestroyIndex);
+
+ replaceWithConstant(DestroyAddrConstant, DestroyAddr);
+
+ if (ShouldElide) {
+ auto *FrameTy = getFrameType(cast<Function>(ResumeAddrConstant));
+ elideHeapAllocations(CoroId->getFunction(), FrameTy, AA);
+ coro::replaceCoroFree(CoroId, /*Elide=*/true);
+ }
+
+ return true;
+}
+
+// See if there are any coro.subfn.addr instructions referring to the
+// coro.devirt trigger; if so, replace them with a direct call to the devirt
+// trigger function.
+static bool replaceDevirtTrigger(Function &F) {
+ SmallVector<CoroSubFnInst *, 1> DevirtAddr;
+ for (auto &I : instructions(F))
+ if (auto *SubFn = dyn_cast<CoroSubFnInst>(&I))
+ if (SubFn->getIndex() == CoroSubFnInst::RestartTrigger)
+ DevirtAddr.push_back(SubFn);
+
+ if (DevirtAddr.empty())
+ return false;
+
+ Module &M = *F.getParent();
+ Function *DevirtFn = M.getFunction(CORO_DEVIRT_TRIGGER_FN);
+ assert(DevirtFn && "coro.devirt.fn not found");
+ replaceWithConstant(DevirtFn, DevirtAddr);
+
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Top Level Driver
+//===----------------------------------------------------------------------===//
+
+namespace {
+struct CoroElide : FunctionPass {
+ static char ID;
+ CoroElide() : FunctionPass(ID) {}
+
+ std::unique_ptr<Lowerer> L;
+
+ bool doInitialization(Module &M) override {
+ if (coro::declaresIntrinsics(M, {"llvm.coro.id"}))
+ L = llvm::make_unique<Lowerer>(M);
+ return false;
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (!L)
+ return false;
+
+ bool Changed = false;
+
+ if (F.hasFnAttribute(CORO_PRESPLIT_ATTR))
+ Changed = replaceDevirtTrigger(F);
+
+ L->CoroIds.clear();
+
+ // Collect all PostSplit coro.ids.
+ for (auto &I : instructions(F))
+ if (auto *CII = dyn_cast<CoroIdInst>(&I))
+ if (CII->getInfo().isPostSplit())
+ // If it is the coroutine itself, don't touch it.
+ if (CII->getCoroutine() != CII->getFunction())
+ L->CoroIds.push_back(CII);
+
+ // If we did not find any coro.id, there is nothing to do.
+ if (L->CoroIds.empty())
+ return Changed;
+
+ AAResults &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+ for (auto *CII : L->CoroIds)
+ Changed |= L->processCoroId(CII, AA);
+
+ return Changed;
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
+ }
+};
+}
+
+char CoroElide::ID = 0;
+INITIALIZE_PASS_BEGIN(
+ CoroElide, "coro-elide",
+ "Coroutine frame allocation elision and indirect calls replacement", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(
+ CoroElide, "coro-elide",
+ "Coroutine frame allocation elision and indirect calls replacement", false,
+ false)
+
+Pass *llvm::createCoroElidePass() { return new CoroElide(); }
diff --git a/contrib/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/contrib/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
new file mode 100644
index 0000000..bb28558a
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -0,0 +1,727 @@
+//===- CoroFrame.cpp - Builds and manipulates coroutine frame -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file contains classes used to discover whether, for a particular
+// value, there is a path from its definition to a use that crosses a suspend
+// point.
+//
+// Using the information discovered we form a Coroutine Frame structure to
+// contain those values. All uses of those values are replaced with appropriate
+// GEP + load from the coroutine frame. At the point of the definition we spill
+// the value into the coroutine frame.
+//
+// TODO: pack values tightly using liveness info.
+//===----------------------------------------------------------------------===//
+
+#include "CoroInternal.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/circular_raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+
+// The "coro-suspend-crossing" flag is very noisy. There is another debug type,
+// "coro-frame", which results in leaner debug spew.
+#define DEBUG_TYPE "coro-suspend-crossing"
+
+enum { SmallVectorThreshold = 32 };
+
+// Provides a two-way mapping between the blocks and numbers.
+namespace {
+class BlockToIndexMapping {
+ SmallVector<BasicBlock *, SmallVectorThreshold> V;
+
+public:
+ size_t size() const { return V.size(); }
+
+ BlockToIndexMapping(Function &F) {
+ for (BasicBlock &BB : F)
+ V.push_back(&BB);
+ std::sort(V.begin(), V.end());
+ }
+
+ size_t blockToIndex(BasicBlock *BB) const {
+ auto *I = std::lower_bound(V.begin(), V.end(), BB);
+ assert(I != V.end() && *I == BB && "BasicBlockNumbering: Unknown block");
+ return I - V.begin();
+ }
+
+ BasicBlock *indexToBlock(unsigned Index) const { return V[Index]; }
+};
+} // end anonymous namespace
+
+// The SuspendCrossingInfo maintains data that allows us to answer the
+// question whether, given two BasicBlocks A and B, there is a path from A to
+// B that passes through a suspend point.
+//
+// For every basic block 'i' it maintains a BlockData that consists of:
+// Consumes: a bit vector which contains a set of indices of blocks that can
+// reach block 'i'
+// Kills: a bit vector which contains a set of indices of blocks that can
+// reach block 'i', but at least one of the paths crosses a suspend point
+// Suspend: a boolean indicating whether block 'i' contains a suspend point.
+// End: a boolean indicating whether block 'i' contains a coro.end intrinsic.
+//
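+// A small worked example (added for exposition): for the chain
+// entry -> susp -> resume, where 'susp' contains a suspend point,
+// Consumes[resume] = {entry, susp, resume} and Kills[resume] = {entry, susp},
+// so a value defined in 'entry' and used in 'resume' must be spilled into the
+// coroutine frame.
+//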
+namespace {
+struct SuspendCrossingInfo {
+ BlockToIndexMapping Mapping;
+
+ struct BlockData {
+ BitVector Consumes;
+ BitVector Kills;
+ bool Suspend = false;
+ bool End = false;
+ };
+ SmallVector<BlockData, SmallVectorThreshold> Block;
+
+ iterator_range<succ_iterator> successors(BlockData const &BD) const {
+ BasicBlock *BB = Mapping.indexToBlock(&BD - &Block[0]);
+ return llvm::successors(BB);
+ }
+
+ BlockData &getBlockData(BasicBlock *BB) {
+ return Block[Mapping.blockToIndex(BB)];
+ }
+
+ void dump() const;
+ void dump(StringRef Label, BitVector const &BV) const;
+
+ SuspendCrossingInfo(Function &F, coro::Shape &Shape);
+
+ bool hasPathCrossingSuspendPoint(BasicBlock *DefBB, BasicBlock *UseBB) const {
+ size_t const DefIndex = Mapping.blockToIndex(DefBB);
+ size_t const UseIndex = Mapping.blockToIndex(UseBB);
+
+ assert(Block[UseIndex].Consumes[DefIndex] && "use must consume def");
+ bool const Result = Block[UseIndex].Kills[DefIndex];
+ DEBUG(dbgs() << UseBB->getName() << " => " << DefBB->getName()
+ << " answer is " << Result << "\n");
+ return Result;
+ }
+
+ bool isDefinitionAcrossSuspend(BasicBlock *DefBB, User *U) const {
+ auto *I = cast<Instruction>(U);
+
+ // We rewrote PHINodes, so that only the ones with exactly one incoming
+ // value need to be analyzed.
+ if (auto *PN = dyn_cast<PHINode>(I))
+ if (PN->getNumIncomingValues() > 1)
+ return false;
+
+ BasicBlock *UseBB = I->getParent();
+ return hasPathCrossingSuspendPoint(DefBB, UseBB);
+ }
+
+ bool isDefinitionAcrossSuspend(Argument &A, User *U) const {
+ return isDefinitionAcrossSuspend(&A.getParent()->getEntryBlock(), U);
+ }
+
+ bool isDefinitionAcrossSuspend(Instruction &I, User *U) const {
+ return isDefinitionAcrossSuspend(I.getParent(), U);
+ }
+};
+} // end anonymous namespace
+
+LLVM_DUMP_METHOD void SuspendCrossingInfo::dump(StringRef Label,
+ BitVector const &BV) const {
+ dbgs() << Label << ":";
+ for (size_t I = 0, N = BV.size(); I < N; ++I)
+ if (BV[I])
+ dbgs() << " " << Mapping.indexToBlock(I)->getName();
+ dbgs() << "\n";
+}
+
+LLVM_DUMP_METHOD void SuspendCrossingInfo::dump() const {
+ for (size_t I = 0, N = Block.size(); I < N; ++I) {
+ BasicBlock *const B = Mapping.indexToBlock(I);
+ dbgs() << B->getName() << ":\n";
+ dump(" Consumes", Block[I].Consumes);
+ dump(" Kills", Block[I].Kills);
+ }
+ dbgs() << "\n";
+}
+
+SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape)
+ : Mapping(F) {
+ const size_t N = Mapping.size();
+ Block.resize(N);
+
+ // Initialize every block so that it consumes itself
+ for (size_t I = 0; I < N; ++I) {
+ auto &B = Block[I];
+ B.Consumes.resize(N);
+ B.Kills.resize(N);
+ B.Consumes.set(I);
+ }
+
+ // Mark all CoroEnd Blocks. We do not propagate Kills beyond coro.ends as
+ // the code beyond coro.end is reachable during initial invocation of the
+ // coroutine.
+ for (auto *CE : Shape.CoroEnds)
+ getBlockData(CE->getParent()).End = true;
+
+ // Mark all suspend blocks and indicate that they kill everything they
+ // consume. Note, that crossing coro.save also requires a spill, as any code
+ // between coro.save and coro.suspend may resume the coroutine and all of the
+ // state needs to be saved by that time.
+ auto markSuspendBlock = [&](IntrinsicInst* BarrierInst) {
+ BasicBlock *SuspendBlock = BarrierInst->getParent();
+ auto &B = getBlockData(SuspendBlock);
+ B.Suspend = true;
+ B.Kills |= B.Consumes;
+ };
+ for (CoroSuspendInst *CSI : Shape.CoroSuspends) {
+ markSuspendBlock(CSI);
+ markSuspendBlock(CSI->getCoroSave());
+ }
+
+ // Iterate propagating consumes and kills until they stop changing.
+ int Iteration = 0;
+ (void)Iteration;
+
+ bool Changed;
+ do {
+ DEBUG(dbgs() << "iteration " << ++Iteration);
+ DEBUG(dbgs() << "==============\n");
+
+ Changed = false;
+ for (size_t I = 0; I < N; ++I) {
+ auto &B = Block[I];
+ for (BasicBlock *SI : successors(B)) {
+
+ auto SuccNo = Mapping.blockToIndex(SI);
+
+ // Save the Consumes and Kills bitsets so that it is easy to see
+ // if anything changed after propagation.
+ auto &S = Block[SuccNo];
+ auto SavedConsumes = S.Consumes;
+ auto SavedKills = S.Kills;
+
+ // Propagate Kills and Consumes from block B into its successor S.
+ S.Consumes |= B.Consumes;
+ S.Kills |= B.Kills;
+
+ // If block B is a suspend block, it should propagate kills into its
+ // successor for every block B consumes.
+ if (B.Suspend) {
+ S.Kills |= B.Consumes;
+ }
+ if (S.Suspend) {
+ // If block S is a suspend block, it should kill all of the blocks it
+ // consumes.
+ S.Kills |= S.Consumes;
+ } else if (S.End) {
+ // If block S is an end block, it should not propagate kills as the
+ // blocks following coro.end() are reached during initial invocation
+ // of the coroutine while all the data are still available on the
+ // stack or in the registers.
+ S.Kills.reset();
+ } else {
+ // This is reached when block S is neither a suspend nor a coro.end
+ // block, and we need to make sure that S is not in its own kill set.
+ S.Kills.reset(SuccNo);
+ }
+
+ // See if anything changed.
+ Changed |= (S.Kills != SavedKills) || (S.Consumes != SavedConsumes);
+
+ if (S.Kills != SavedKills) {
+ DEBUG(dbgs() << "\nblock " << I << " follower " << SI->getName()
+ << "\n");
+ DEBUG(dump("S.Kills", S.Kills));
+ DEBUG(dump("SavedKills", SavedKills));
+ }
+ if (S.Consumes != SavedConsumes) {
+ DEBUG(dbgs() << "\nblock " << I << " follower " << SI << "\n");
+ DEBUG(dump("S.Consume", S.Consumes));
+ DEBUG(dump("SavedCons", SavedConsumes));
+ }
+ }
+ }
+ } while (Changed);
+ DEBUG(dump());
+}
+
+#undef DEBUG_TYPE // "coro-suspend-crossing"
+#define DEBUG_TYPE "coro-frame"
+
+// We build up the list of spills for every case where a use is separated
+// from the definition by a suspend point.
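+//
+// For example (added for exposition), given
+//   %x = add i32 %a, %b
+//   ... coro.suspend ...
+//   %y = mul i32 %x, 2
+// the pair (%x, the mul instruction) is recorded as a Spill entry.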
+
+struct Spill : std::pair<Value *, Instruction *> {
+ using base = std::pair<Value *, Instruction *>;
+
+ Spill(Value *Def, User *U) : base(Def, cast<Instruction>(U)) {}
+
+ Value *def() const { return first; }
+ Instruction *user() const { return second; }
+ BasicBlock *userBlock() const { return second->getParent(); }
+
+ std::pair<Value *, BasicBlock *> getKey() const {
+ return {def(), userBlock()};
+ }
+
+ bool operator<(Spill const &rhs) const { return getKey() < rhs.getKey(); }
+};
+
+// Note that there may be more than one record with the same value of Def in
+// the SpillInfo vector.
+using SpillInfo = SmallVector<Spill, 8>;
+
+#ifndef NDEBUG
+static void dump(StringRef Title, SpillInfo const &Spills) {
+ dbgs() << "------------- " << Title << "--------------\n";
+ Value *CurrentValue = nullptr;
+ for (auto const &E : Spills) {
+ if (CurrentValue != E.def()) {
+ CurrentValue = E.def();
+ CurrentValue->dump();
+ }
+ dbgs() << " user: ";
+ E.user()->dump();
+ }
+}
+#endif
+
+// Build a struct that will keep state for an active coroutine.
+// struct f.frame {
+// ResumeFnTy ResumeFnAddr;
+// ResumeFnTy DestroyFnAddr;
+// int ResumeIndex;
+// ... promise (if present) ...
+// ... spills ...
+// };
+static StructType *buildFrameType(Function &F, coro::Shape &Shape,
+ SpillInfo &Spills) {
+ LLVMContext &C = F.getContext();
+ SmallString<32> Name(F.getName());
+ Name.append(".Frame");
+ StructType *FrameTy = StructType::create(C, Name);
+ auto *FramePtrTy = FrameTy->getPointerTo();
+ auto *FnTy = FunctionType::get(Type::getVoidTy(C), FramePtrTy,
+ /*IsVarArgs=*/false);
+ auto *FnPtrTy = FnTy->getPointerTo();
+
+ // Figure out how wide an integer type storing the suspend index should be.
+ unsigned IndexBits = std::max(1U, Log2_64_Ceil(Shape.CoroSuspends.size()));
+ Type *PromiseType = Shape.PromiseAlloca
+ ? Shape.PromiseAlloca->getType()->getElementType()
+ : Type::getInt1Ty(C);
+ SmallVector<Type *, 8> Types{FnPtrTy, FnPtrTy, PromiseType,
+ Type::getIntNTy(C, IndexBits)};
+ Value *CurrentDef = nullptr;
+
+ // Create an entry for every spilled value.
+ for (auto const &S : Spills) {
+ if (CurrentDef == S.def())
+ continue;
+
+ CurrentDef = S.def();
+ // PromiseAlloca was already added to Types array earlier.
+ if (CurrentDef == Shape.PromiseAlloca)
+ continue;
+
+ Type *Ty = nullptr;
+ if (auto *AI = dyn_cast<AllocaInst>(CurrentDef))
+ Ty = AI->getAllocatedType();
+ else
+ Ty = CurrentDef->getType();
+
+ Types.push_back(Ty);
+ }
+ FrameTy->setBody(Types);
+
+ return FrameTy;
+}
+
+// Replace all alloca and SSA values that are accessed across suspend points
+// with GetElementPointer from coroutine frame + loads and stores. Create an
+// AllocaSpillBB that will become the new entry block for the resume parts of
+// the coroutine:
+//
+// %hdl = coro.begin(...)
+// whatever
+//
+// becomes:
+//
+// %hdl = coro.begin(...)
+// %FramePtr = bitcast i8* hdl to %f.frame*
+// br label %AllocaSpillBB
+//
+// AllocaSpillBB:
+// ; geps corresponding to allocas that were moved to coroutine frame
+// br label PostSpill
+//
+// PostSpill:
+// whatever
+//
+//
+static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) {
+ auto *CB = Shape.CoroBegin;
+ IRBuilder<> Builder(CB->getNextNode());
+ PointerType *FramePtrTy = Shape.FrameTy->getPointerTo();
+ auto *FramePtr =
+ cast<Instruction>(Builder.CreateBitCast(CB, FramePtrTy, "FramePtr"));
+ Type *FrameTy = FramePtrTy->getElementType();
+
+ Value *CurrentValue = nullptr;
+ BasicBlock *CurrentBlock = nullptr;
+ Value *CurrentReload = nullptr;
+ unsigned Index = coro::Shape::LastKnownField;
+
+ // We need to keep track of any allocas that need "spilling", since they
+ // will now live in the coroutine frame: all accesses to them need to be
+ // changed, not just the accesses across suspend points. We remember the
+ // allocas and their indices to be handled once we have processed all the
+ // spills.
+ SmallVector<std::pair<AllocaInst *, unsigned>, 4> Allocas;
+ // Promise alloca (if present) has a fixed field number (Shape::PromiseField)
+ if (Shape.PromiseAlloca)
+ Allocas.emplace_back(Shape.PromiseAlloca, coro::Shape::PromiseField);
+
+ // Create a load instruction to reload the spilled value from the coroutine
+ // frame.
+ auto CreateReload = [&](Instruction *InsertBefore) {
+ Builder.SetInsertPoint(InsertBefore);
+ auto *G = Builder.CreateConstInBoundsGEP2_32(FrameTy, FramePtr, 0, Index,
+ CurrentValue->getName() +
+ Twine(".reload.addr"));
+ return isa<AllocaInst>(CurrentValue)
+ ? G
+ : Builder.CreateLoad(G,
+ CurrentValue->getName() + Twine(".reload"));
+ };
+
+ for (auto const &E : Spills) {
+ // If we have not seen the value, generate a spill.
+ if (CurrentValue != E.def()) {
+ CurrentValue = E.def();
+ CurrentBlock = nullptr;
+ CurrentReload = nullptr;
+
+ ++Index;
+
+ if (auto *AI = dyn_cast<AllocaInst>(CurrentValue)) {
+ // A spilled AllocaInst will be replaced with a GEP from the coroutine
+ // frame; no separate spill store is required.
+ Allocas.emplace_back(AI, Index);
+ if (!AI->isStaticAlloca())
+ report_fatal_error("Coroutines cannot handle non static allocas yet");
+ } else {
+ // Otherwise, create a store instruction storing the value into the
+ // coroutine frame. For an argument, we place the store instruction
+ // right after the coroutine frame pointer instruction, i.e. the bitcast
+ // of coro.begin from i8* to %f.frame*. For all other values, the spill
+ // is placed immediately after the definition.
+ Builder.SetInsertPoint(
+ isa<Argument>(CurrentValue)
+ ? FramePtr->getNextNode()
+ : dyn_cast<Instruction>(E.def())->getNextNode());
+
+ auto *G = Builder.CreateConstInBoundsGEP2_32(
+ FrameTy, FramePtr, 0, Index,
+ CurrentValue->getName() + Twine(".spill.addr"));
+ Builder.CreateStore(CurrentValue, G);
+ }
+ }
+
+ // If we have not seen the use block, generate a reload in it.
+ if (CurrentBlock != E.userBlock()) {
+ CurrentBlock = E.userBlock();
+ CurrentReload = CreateReload(&*CurrentBlock->getFirstInsertionPt());
+ }
+
+ // If we have a single edge PHINode, remove it and replace it with a reload
+ // from the coroutine frame. (We already took care of multi edge PHINodes
+ // by rewriting them in the rewritePHIs function).
+ if (auto *PN = dyn_cast<PHINode>(E.user())) {
+ assert(PN->getNumIncomingValues() == 1 && "unexpected number of incoming "
+ "values in the PHINode");
+ PN->replaceAllUsesWith(CurrentReload);
+ PN->eraseFromParent();
+ continue;
+ }
+
+ // Replace all uses of CurrentValue in the current instruction with reload.
+ E.user()->replaceUsesOfWith(CurrentValue, CurrentReload);
+ }
+
+ BasicBlock *FramePtrBB = FramePtr->getParent();
+ Shape.AllocaSpillBlock =
+ FramePtrBB->splitBasicBlock(FramePtr->getNextNode(), "AllocaSpillBB");
+ Shape.AllocaSpillBlock->splitBasicBlock(&Shape.AllocaSpillBlock->front(),
+ "PostSpill");
+
+ Builder.SetInsertPoint(&Shape.AllocaSpillBlock->front());
+ // If we found any allocas, replace all of their remaining uses with Geps.
+ for (auto &P : Allocas) {
+ auto *G =
+ Builder.CreateConstInBoundsGEP2_32(FrameTy, FramePtr, 0, P.second);
+ // We are not using ReplaceInstWithInst(P.first, cast<Instruction>(G)) here,
+ // as we are changing the location of the instruction.
+ G->takeName(P.first);
+ P.first->replaceAllUsesWith(G);
+ P.first->eraseFromParent();
+ }
+ return FramePtr;
+}
+
+static void rewritePHIs(BasicBlock &BB) {
+ // For every incoming edge we will create a block holding all
+ // incoming values in single-edge PHI nodes.
+ //
+ // loop:
+ // %n.val = phi i32[%n, %entry], [%inc, %loop]
+ //
+ // It will create:
+ //
+ // loop.from.entry:
+ // %n.loop.pre = phi i32 [%n, %entry]
+ // br %label loop
+ // loop.from.loop:
+ // %inc.loop.pre = phi i32 [%inc, %loop]
+ // br %label loop
+ //
+ // After this rewrite, further analysis will ignore any phi nodes with more
+ // than one incoming edge.
+
+ // TODO: Simplify PHINodes in the basic block to remove duplicate
+ // predecessors.
+
+ SmallVector<BasicBlock *, 8> Preds(pred_begin(&BB), pred_end(&BB));
+ for (BasicBlock *Pred : Preds) {
+ auto *IncomingBB = SplitEdge(Pred, &BB);
+ IncomingBB->setName(BB.getName() + Twine(".from.") + Pred->getName());
+ auto *PN = cast<PHINode>(&BB.front());
+ do {
+ int Index = PN->getBasicBlockIndex(IncomingBB);
+ Value *V = PN->getIncomingValue(Index);
+ PHINode *InputV = PHINode::Create(
+ V->getType(), 1, V->getName() + Twine(".") + BB.getName(),
+ &IncomingBB->front());
+ InputV->addIncoming(V, Pred);
+ PN->setIncomingValue(Index, InputV);
+ PN = dyn_cast<PHINode>(PN->getNextNode());
+ } while (PN);
+ }
+}
+
+static void rewritePHIs(Function &F) {
+ SmallVector<BasicBlock *, 8> WorkList;
+
+ for (BasicBlock &BB : F)
+ if (auto *PN = dyn_cast<PHINode>(&BB.front()))
+ if (PN->getNumIncomingValues() > 1)
+ WorkList.push_back(&BB);
+
+ for (BasicBlock *BB : WorkList)
+ rewritePHIs(*BB);
+}
+
+// Check for instructions that we can recreate on resume, as opposed to
+// spilling the result into the coroutine frame.
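+//
+// For example (added for exposition), a GEP such as
+//   %p = getelementptr inbounds i32, i32* %base, i64 4
+// that is used across a suspend point can be cloned into the resume path
+// instead of occupying a slot in the coroutine frame.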
+static bool materializable(Instruction &V) {
+ return isa<CastInst>(&V) || isa<GetElementPtrInst>(&V) ||
+ isa<BinaryOperator>(&V) || isa<CmpInst>(&V) || isa<SelectInst>(&V);
+}
+
+// Check for structural coroutine intrinsics that should not be spilled into
+// the coroutine frame.
+static bool isCoroutineStructureIntrinsic(Instruction &I) {
+ return isa<CoroIdInst>(&I) || isa<CoroBeginInst>(&I) ||
+ isa<CoroSaveInst>(&I) || isa<CoroSuspendInst>(&I);
+}
+
+// For every use of the value that is across a suspend point, recreate that
+// value after the suspend point.
+static void rewriteMaterializableInstructions(IRBuilder<> &IRB,
+ SpillInfo const &Spills) {
+ BasicBlock *CurrentBlock = nullptr;
+ Instruction *CurrentMaterialization = nullptr;
+ Instruction *CurrentDef = nullptr;
+
+ for (auto const &E : Spills) {
+ // If it is a new definition, update CurrentXXX variables.
+ if (CurrentDef != E.def()) {
+ CurrentDef = cast<Instruction>(E.def());
+ CurrentBlock = nullptr;
+ CurrentMaterialization = nullptr;
+ }
+
+ // If we have not seen this block, materialize the value.
+ if (CurrentBlock != E.userBlock()) {
+ CurrentBlock = E.userBlock();
+ CurrentMaterialization = cast<Instruction>(CurrentDef)->clone();
+ CurrentMaterialization->setName(CurrentDef->getName());
+ CurrentMaterialization->insertBefore(
+ &*CurrentBlock->getFirstInsertionPt());
+ }
+
+ if (auto *PN = dyn_cast<PHINode>(E.user())) {
+ assert(PN->getNumIncomingValues() == 1 && "unexpected number of incoming "
+ "values in the PHINode");
+ PN->replaceAllUsesWith(CurrentMaterialization);
+ PN->eraseFromParent();
+ continue;
+ }
+
+ // Replace all uses of CurrentDef in the current instruction with the
+ // CurrentMaterialization for the block.
+ E.user()->replaceUsesOfWith(CurrentDef, CurrentMaterialization);
+ }
+}
+
+// Move early uses of a spilled variable after CoroBegin.
+// For example, if a parameter had its address taken, we may end up with code
+// like:
+// define @f(i32 %n) {
+// %n.addr = alloca i32
+// store %n, %n.addr
+// ...
+// call @coro.begin
+// we need to move the store after coro.begin
+static void moveSpillUsesAfterCoroBegin(Function &F, SpillInfo const &Spills,
+ CoroBeginInst *CoroBegin) {
+ DominatorTree DT(F);
+ SmallVector<Instruction *, 8> NeedsMoving;
+
+ Value *CurrentValue = nullptr;
+
+ for (auto const &E : Spills) {
+ if (CurrentValue == E.def())
+ continue;
+
+ CurrentValue = E.def();
+
+ for (User *U : CurrentValue->users()) {
+ Instruction *I = cast<Instruction>(U);
+ if (!DT.dominates(CoroBegin, I)) {
+ // TODO: Make this more robust. Currently, if we run into a situation
+ // where a simple instruction move won't work, we panic and call
+ // report_fatal_error.
+ for (User *UI : I->users()) {
+ if (!DT.dominates(CoroBegin, cast<Instruction>(UI)))
+ report_fatal_error("cannot move instruction since its users are not"
+ " dominated by CoroBegin");
+ }
+
+ DEBUG(dbgs() << "will move: " << *I << "\n");
+ NeedsMoving.push_back(I);
+ }
+ }
+ }
+
+ Instruction *InsertPt = CoroBegin->getNextNode();
+ for (Instruction *I : NeedsMoving)
+ I->moveBefore(InsertPt);
+}
+
+// Splits the block at a particular instruction unless it is the first
+// instruction in the block with a single predecessor.
+static BasicBlock *splitBlockIfNotFirst(Instruction *I, const Twine &Name) {
+ auto *BB = I->getParent();
+ if (&BB->front() == I) {
+ if (BB->getSinglePredecessor()) {
+ BB->setName(Name);
+ return BB;
+ }
+ }
+ return BB->splitBasicBlock(I, Name);
+}
+
+// Split above and below a particular instruction so that it
+// will be all alone by itself in a block.
+static void splitAround(Instruction *I, const Twine &Name) {
+ splitBlockIfNotFirst(I, Name);
+ splitBlockIfNotFirst(I->getNextNode(), "After" + Name);
+}
+
+void coro::buildCoroutineFrame(Function &F, Shape &Shape) {
+ // Lower coro.dbg.declare to coro.dbg.value, since we are going to rewrite
+ // accesses to local variables.
+ LowerDbgDeclare(F);
+
+ Shape.PromiseAlloca = Shape.CoroBegin->getId()->getPromise();
+ if (Shape.PromiseAlloca) {
+ Shape.CoroBegin->getId()->clearPromise();
+ }
+
+ // Make sure that all coro.save, coro.suspend and the fallthrough coro.end
+ // intrinsics are in their own blocks to simplify the logic of building up
+ // SuspendCrossing data.
+ for (CoroSuspendInst *CSI : Shape.CoroSuspends) {
+ splitAround(CSI->getCoroSave(), "CoroSave");
+ splitAround(CSI, "CoroSuspend");
+ }
+
+ // Put fallthrough CoroEnd into its own block. Note: Shape::buildFrom places
+ // the fallthrough coro.end as the first element of CoroEnds array.
+ splitAround(Shape.CoroEnds.front(), "CoroEnd");
+
+ // Transform multi-edge PHI nodes, so that any value feeding into a PHI will
+ // never have its definition separated from the PHI by a suspend point.
+ rewritePHIs(F);
+
+ // Build suspend crossing info.
+ SuspendCrossingInfo Checker(F, Shape);
+
+ IRBuilder<> Builder(F.getContext());
+ SpillInfo Spills;
+
+ // See if there are materializable instructions across suspend points.
+ for (Instruction &I : instructions(F))
+ if (materializable(I))
+ for (User *U : I.users())
+ if (Checker.isDefinitionAcrossSuspend(I, U))
+ Spills.emplace_back(&I, U);
+
+ // Rewrite materializable instructions to be materialized at the use point.
+ std::sort(Spills.begin(), Spills.end());
+ DEBUG(dump("Materializations", Spills));
+ rewriteMaterializableInstructions(Builder, Spills);
+
+ // Collect the spills for arguments and other non-materializable values.
+ Spills.clear();
+ for (Argument &A : F.getArgumentList())
+ for (User *U : A.users())
+ if (Checker.isDefinitionAcrossSuspend(A, U))
+ Spills.emplace_back(&A, U);
+
+ for (Instruction &I : instructions(F)) {
+ // Values returned from coroutine structure intrinsics should not be part
+ // of the Coroutine Frame.
+ if (isCoroutineStructureIntrinsic(I))
+ continue;
+ // The coroutine promise is always included in the coroutine frame; no need
+ // to check for suspend crossing.
+ if (Shape.PromiseAlloca == &I)
+ continue;
+
+ for (User *U : I.users())
+ if (Checker.isDefinitionAcrossSuspend(I, U)) {
+ // We cannot spill a token.
+ if (I.getType()->isTokenTy())
+ report_fatal_error(
+ "token definition is separated from the use by a suspend point");
+ assert(!materializable(I) &&
+ "rewriteMaterializable did not do its job");
+ Spills.emplace_back(&I, U);
+ }
+ }
+ std::sort(Spills.begin(), Spills.end());
+ DEBUG(dump("Spills", Spills));
+ moveSpillUsesAfterCoroBegin(F, Spills, Shape.CoroBegin);
+ Shape.FrameTy = buildFrameType(F, Shape, Spills);
+ Shape.FramePtr = insertSpills(Spills, Shape);
+}
diff --git a/contrib/llvm/lib/Transforms/Coroutines/CoroInstr.h b/contrib/llvm/lib/Transforms/Coroutines/CoroInstr.h
new file mode 100644
index 0000000..e03cef4
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Coroutines/CoroInstr.h
@@ -0,0 +1,318 @@
+//===-- CoroInstr.h - Coroutine Intrinsics Instruction Wrappers -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file defines classes that make it really easy to deal with intrinsic
+// functions with the isa/dyncast family of functions. In particular, this
+// allows you to do things like:
+//
+// if (auto *SF = dyn_cast<CoroSubFnInst>(Inst))
+// ... SF->getFrame() ...
+//
+// All intrinsic function calls are instances of the call instruction, so these
+// are all subclasses of the CallInst class. Note that none of these classes
+// has state or virtual methods, which is an important part of this gross/neat
+// hack working.
+//
+// The helpful comment above is borrowed from llvm/IntrinsicInst.h, we keep
+// coroutine intrinsic wrappers here since they are only used by the passes in
+// the Coroutine library.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IntrinsicInst.h"
+
+namespace llvm {
+
+/// This class represents the llvm.coro.subfn.addr instruction.
+class LLVM_LIBRARY_VISIBILITY CoroSubFnInst : public IntrinsicInst {
+ enum { FrameArg, IndexArg };
+
+public:
+ enum ResumeKind {
+ RestartTrigger = -1,
+ ResumeIndex,
+ DestroyIndex,
+ CleanupIndex,
+ IndexLast,
+ IndexFirst = RestartTrigger
+ };
+
+ Value *getFrame() const { return getArgOperand(FrameArg); }
+ ResumeKind getIndex() const {
+ int64_t Index = getRawIndex()->getValue().getSExtValue();
+ assert(Index >= IndexFirst && Index < IndexLast &&
+ "unexpected CoroSubFnInst index argument");
+ return static_cast<ResumeKind>(Index);
+ }
+
+ ConstantInt *getRawIndex() const {
+ return cast<ConstantInt>(getArgOperand(IndexArg));
+ }
+
+ // Methods to support type inquiry through isa, cast, and dyn_cast:
+ static inline bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::coro_subfn_addr;
+ }
+ static inline bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+};
+
+/// This represents the llvm.coro.alloc instruction.
+class LLVM_LIBRARY_VISIBILITY CoroAllocInst : public IntrinsicInst {
+public:
+ // Methods to support type inquiry through isa, cast, and dyn_cast:
+ static inline bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::coro_alloc;
+ }
+ static inline bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+};
+
+/// This represents the llvm.coro.id instruction.
+class LLVM_LIBRARY_VISIBILITY CoroIdInst : public IntrinsicInst {
+ enum { AlignArg, PromiseArg, CoroutineArg, InfoArg };
+
+public:
+ CoroAllocInst *getCoroAlloc() {
+ for (User *U : users())
+ if (auto *CA = dyn_cast<CoroAllocInst>(U))
+ return CA;
+ return nullptr;
+ }
+
+ IntrinsicInst *getCoroBegin() {
+ for (User *U : users())
+ if (auto *II = dyn_cast<IntrinsicInst>(U))
+ if (II->getIntrinsicID() == Intrinsic::coro_begin)
+ return II;
+ llvm_unreachable("no coro.begin associated with coro.id");
+ }
+
+ AllocaInst *getPromise() const {
+ Value *Arg = getArgOperand(PromiseArg);
+ return isa<ConstantPointerNull>(Arg)
+ ? nullptr
+ : cast<AllocaInst>(Arg->stripPointerCasts());
+ }
+
+ void clearPromise() {
+ Value *Arg = getArgOperand(PromiseArg);
+ setArgOperand(PromiseArg,
+ ConstantPointerNull::get(Type::getInt8PtrTy(getContext())));
+ if (isa<AllocaInst>(Arg))
+ return;
+ assert((isa<BitCastInst>(Arg) || isa<GetElementPtrInst>(Arg)) &&
+ "unexpected instruction designating the promise");
+ // TODO: Add a check that any remaining users of Inst are after coro.begin
+ // or add code to move the users after coro.begin.
+ auto *Inst = cast<Instruction>(Arg);
+ if (Inst->use_empty()) {
+ Inst->eraseFromParent();
+ return;
+ }
+ Inst->moveBefore(getCoroBegin()->getNextNode());
+ }
+
+ // Info argument of coro.id is
+ // fresh out of the frontend: null ;
+ // outlined : {Init, Return, Susp1, Susp2, ...} ;
+ // postsplit : [resume, destroy, cleanup] ;
+ //
+ // If parts of the coroutine were outlined to protect against undesirable
+ // code motion, these functions will be stored in a struct literal referred
+ // to by the Info parameter. Note: this is only needed before the coroutine
+ // is split.
+ //
+ // After the coroutine is split, the resume functions are stored in an array
+ // referred to by this parameter.
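+ //
+ // For illustration only (the names below are hypothetical), after split the
+ // Info argument typically points at a constant such as:
+ //   @f.resumers = internal constant [3 x void (%f.frame*)*]
+ //     [void (%f.frame*)* @f.resume, void (%f.frame*)* @f.destroy,
+ //      void (%f.frame*)* @f.cleanup]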
+
+ struct Info {
+ ConstantStruct *OutlinedParts = nullptr;
+ ConstantArray *Resumers = nullptr;
+
+ bool hasOutlinedParts() const { return OutlinedParts != nullptr; }
+ bool isPostSplit() const { return Resumers != nullptr; }
+ bool isPreSplit() const { return !isPostSplit(); }
+ };
+ Info getInfo() const {
+ Info Result;
+ auto *GV = dyn_cast<GlobalVariable>(getRawInfo());
+ if (!GV)
+ return Result;
+
+ assert(GV->isConstant() && GV->hasDefinitiveInitializer());
+ Constant *Initializer = GV->getInitializer();
+ if ((Result.OutlinedParts = dyn_cast<ConstantStruct>(Initializer)))
+ return Result;
+
+ Result.Resumers = cast<ConstantArray>(Initializer);
+ return Result;
+ }
+ Constant *getRawInfo() const {
+ return cast<Constant>(getArgOperand(InfoArg)->stripPointerCasts());
+ }
+
+ void setInfo(Constant *C) { setArgOperand(InfoArg, C); }
+
+ Function *getCoroutine() const {
+ return cast<Function>(getArgOperand(CoroutineArg)->stripPointerCasts());
+ }
+ void setCoroutineSelf() {
+ assert(isa<ConstantPointerNull>(getArgOperand(CoroutineArg)) &&
+ "Coroutine argument is already assigned");
+ auto *const Int8PtrTy = Type::getInt8PtrTy(getContext());
+ setArgOperand(CoroutineArg,
+ ConstantExpr::getBitCast(getFunction(), Int8PtrTy));
+ }
+
+ // Methods to support type inquiry through isa, cast, and dyn_cast:
+ static inline bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::coro_id;
+ }
+ static inline bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+};
+
+/// This represents the llvm.coro.frame instruction.
+class LLVM_LIBRARY_VISIBILITY CoroFrameInst : public IntrinsicInst {
+public:
+ // Methods to support type inquiry through isa, cast, and dyn_cast:
+ static inline bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::coro_frame;
+ }
+ static inline bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+};
+
+/// This represents the llvm.coro.free instruction.
+class LLVM_LIBRARY_VISIBILITY CoroFreeInst : public IntrinsicInst {
+ enum { IdArg, FrameArg };
+
+public:
+ Value *getFrame() const { return getArgOperand(FrameArg); }
+
+ // Methods to support type inquiry through isa, cast, and dyn_cast:
+ static inline bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::coro_free;
+ }
+ static inline bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+};
+
+/// This class represents the llvm.coro.begin instruction.
+class LLVM_LIBRARY_VISIBILITY CoroBeginInst : public IntrinsicInst {
+ enum { IdArg, MemArg };
+
+public:
+ CoroIdInst *getId() const { return cast<CoroIdInst>(getArgOperand(IdArg)); }
+
+ Value *getMem() const { return getArgOperand(MemArg); }
+
+ // Methods to support type inquiry through isa, cast, and dyn_cast:
+ static inline bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::coro_begin;
+ }
+ static inline bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+};
+
+/// This represents the llvm.coro.save instruction.
+class LLVM_LIBRARY_VISIBILITY CoroSaveInst : public IntrinsicInst {
+public:
+ // Methods to support type inquiry through isa, cast, and dyn_cast:
+ static inline bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::coro_save;
+ }
+ static inline bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+};
+
+/// This represents the llvm.coro.promise instruction.
+class LLVM_LIBRARY_VISIBILITY CoroPromiseInst : public IntrinsicInst {
+ enum { FrameArg, AlignArg, FromArg };
+
+public:
+ bool isFromPromise() const {
+ return cast<Constant>(getArgOperand(FromArg))->isOneValue();
+ }
+ unsigned getAlignment() const {
+ return cast<ConstantInt>(getArgOperand(AlignArg))->getZExtValue();
+ }
+
+ // Methods to support type inquiry through isa, cast, and dyn_cast:
+ static inline bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::coro_promise;
+ }
+ static inline bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+};
+
+/// This represents the llvm.coro.suspend instruction.
+class LLVM_LIBRARY_VISIBILITY CoroSuspendInst : public IntrinsicInst {
+ enum { SaveArg, FinalArg };
+
+public:
+ CoroSaveInst *getCoroSave() const {
+ Value *Arg = getArgOperand(SaveArg);
+ if (auto *SI = dyn_cast<CoroSaveInst>(Arg))
+ return SI;
+ assert(isa<ConstantTokenNone>(Arg));
+ return nullptr;
+ }
+ bool isFinal() const {
+ return cast<Constant>(getArgOperand(FinalArg))->isOneValue();
+ }
+
+ // Methods to support type inquiry through isa, cast, and dyn_cast:
+ static inline bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::coro_suspend;
+ }
+ static inline bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+};
+
+/// This represents the llvm.coro.size instruction.
+class LLVM_LIBRARY_VISIBILITY CoroSizeInst : public IntrinsicInst {
+public:
+ // Methods to support type inquiry through isa, cast, and dyn_cast:
+ static inline bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::coro_size;
+ }
+ static inline bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+};
+
+/// This represents the llvm.coro.end instruction.
+class LLVM_LIBRARY_VISIBILITY CoroEndInst : public IntrinsicInst {
+ enum { FrameArg, UnwindArg };
+
+public:
+ bool isFallthrough() const { return !isUnwind(); }
+ bool isUnwind() const {
+ return cast<Constant>(getArgOperand(UnwindArg))->isOneValue();
+ }
+
+ // Methods to support type inquiry through isa, cast, and dyn_cast:
+ static inline bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::coro_end;
+ }
+ static inline bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+};
+
+} // End namespace llvm.
diff --git a/contrib/llvm/lib/Transforms/Coroutines/CoroInternal.h b/contrib/llvm/lib/Transforms/Coroutines/CoroInternal.h
new file mode 100644
index 0000000..1eac88d
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Coroutines/CoroInternal.h
@@ -0,0 +1,107 @@
+//===- CoroInternal.h - Internal Coroutine interfaces ---------*- C++ -*---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Common definitions/declarations used internally by coroutine lowering passes.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_COROUTINES_COROINTERNAL_H
+#define LLVM_LIB_TRANSFORMS_COROUTINES_COROINTERNAL_H
+
+#include "CoroInstr.h"
+#include "llvm/Transforms/Coroutines.h"
+
+namespace llvm {
+
+class CallGraph;
+class CallGraphSCC;
+class PassRegistry;
+
+void initializeCoroEarlyPass(PassRegistry &);
+void initializeCoroSplitPass(PassRegistry &);
+void initializeCoroElidePass(PassRegistry &);
+void initializeCoroCleanupPass(PassRegistry &);
+
+// The CoroEarly pass marks every function that contains coro.begin with the
+// string attribute "coroutine.presplit"="0". The CoroSplit pass processes each
+// coroutine twice. First, it lets the coroutine go through the complete IPO
+// optimization pipeline as a single function, and forces a restart of the
+// pipeline by inserting an indirect call to an empty function
+// "coro.devirt.trigger"; when the CoroElide pass devirtualizes that call, the
+// CGPassManager restarts the pipeline. When the CoroSplit pass sees the same
+// coroutine the second time, it splits it up and adds the coroutine
+// subfunctions to the SCC so that they are processed by the IPO pipeline.
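+//
+// For illustration only, a not-yet-split coroutine therefore carries the
+// attribute roughly like this (function and attribute-group names are
+// hypothetical):
+//   define void @f() #0 { ... }
+//   attributes #0 = { "coroutine.presplit"="0" }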
+
+#define CORO_PRESPLIT_ATTR "coroutine.presplit"
+#define UNPREPARED_FOR_SPLIT "0"
+#define PREPARED_FOR_SPLIT "1"
+
+#define CORO_DEVIRT_TRIGGER_FN "coro.devirt.trigger"
+
+namespace coro {
+
+bool declaresIntrinsics(Module &M, std::initializer_list<StringRef>);
+void replaceAllCoroAllocs(CoroBeginInst *CB, bool Replacement);
+void replaceAllCoroFrees(CoroBeginInst *CB, Value *Replacement);
+void replaceCoroFree(CoroIdInst *CoroId, bool Elide);
+void updateCallGraph(Function &Caller, ArrayRef<Function *> Funcs,
+ CallGraph &CG, CallGraphSCC &SCC);
+
+// Keeps data and helper functions for lowering coroutine intrinsics.
+struct LowererBase {
+ Module &TheModule;
+ LLVMContext &Context;
+ PointerType *const Int8Ptr;
+ FunctionType *const ResumeFnType;
+ ConstantPointerNull *const NullPtr;
+
+ LowererBase(Module &M);
+ Value *makeSubFnCall(Value *Arg, int Index, Instruction *InsertPt);
+};
+
+// Holds the structural coroutine intrinsics for a particular function and
+// other values used during the CoroSplit pass.
+struct LLVM_LIBRARY_VISIBILITY Shape {
+ CoroBeginInst *CoroBegin;
+ SmallVector<CoroEndInst *, 4> CoroEnds;
+ SmallVector<CoroSizeInst *, 2> CoroSizes;
+ SmallVector<CoroSuspendInst *, 4> CoroSuspends;
+
+ // Field Indexes for known coroutine frame fields.
+ enum {
+ ResumeField,
+ DestroyField,
+ PromiseField,
+ IndexField,
+ LastKnownField = IndexField
+ };
+
+ StructType *FrameTy;
+ Instruction *FramePtr;
+ BasicBlock *AllocaSpillBlock;
+ SwitchInst *ResumeSwitch;
+ AllocaInst *PromiseAlloca;
+ bool HasFinalSuspend;
+
+ IntegerType *getIndexType() const {
+ assert(FrameTy && "frame type not assigned");
+ return cast<IntegerType>(FrameTy->getElementType(IndexField));
+ }
+ ConstantInt *getIndex(uint64_t Value) const {
+ return ConstantInt::get(getIndexType(), Value);
+ }
+
+ Shape() = default;
+ explicit Shape(Function &F) { buildFrom(F); }
+ void buildFrom(Function &F);
+};
+
+void buildCoroutineFrame(Function &F, Shape &Shape);
+
+} // End namespace coro.
+} // End namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/contrib/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
new file mode 100644
index 0000000..7a3f4f6
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -0,0 +1,640 @@
+//===- CoroSplit.cpp - Converts a coroutine into a state machine ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass builds the coroutine frame and outlines resume and destroy parts
+// of the coroutine into separate functions.
+//
+// We present a coroutine to LLVM as an ordinary function with suspension
+// points marked up with intrinsics. We let the optimizer party on the
+// coroutine as a single function for as long as possible. Shortly before the
+// coroutine is eligible to be inlined into its callers, we split up the
+// coroutine into parts corresponding to the initial, resume and destroy
+// invocations of the coroutine, add them to the current SCC, and restart the
+// IPO pipeline to optimize the coroutine subfunctions we extracted before
+// proceeding to the caller of the coroutine.
+//===----------------------------------------------------------------------===//
+
+#include "CoroInternal.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "coro-split"
+
+// Create an entry block for a resume function with a switch that will jump to
+// suspend points.
+static BasicBlock *createResumeEntryBlock(Function &F, coro::Shape &Shape) {
+ LLVMContext &C = F.getContext();
+
+ // resume.entry:
+ // %index.addr = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0,
+ // i32 2
+ // % index = load i32, i32* %index.addr
+ // switch i32 %index, label %unreachable [
+ // i32 0, label %resume.0
+ // i32 1, label %resume.1
+ // ...
+ // ]
+
+ auto *NewEntry = BasicBlock::Create(C, "resume.entry", &F);
+ auto *UnreachBB = BasicBlock::Create(C, "unreachable", &F);
+
+ IRBuilder<> Builder(NewEntry);
+ auto *FramePtr = Shape.FramePtr;
+ auto *FrameTy = Shape.FrameTy;
+ auto *GepIndex = Builder.CreateConstInBoundsGEP2_32(
+ FrameTy, FramePtr, 0, coro::Shape::IndexField, "index.addr");
+ auto *Index = Builder.CreateLoad(GepIndex, "index");
+ auto *Switch =
+ Builder.CreateSwitch(Index, UnreachBB, Shape.CoroSuspends.size());
+ Shape.ResumeSwitch = Switch;
+
+ size_t SuspendIndex = 0;
+ for (CoroSuspendInst *S : Shape.CoroSuspends) {
+ ConstantInt *IndexVal = Shape.getIndex(SuspendIndex);
+
+ // Replace CoroSave with a store to Index:
+ // %index.addr = getelementptr %f.frame... (index field number)
+ // store i32 0, i32* %index.addr1
+ auto *Save = S->getCoroSave();
+ Builder.SetInsertPoint(Save);
+ if (S->isFinal()) {
+ // Final suspend point is represented by storing zero in ResumeFnAddr.
+ auto *GepIndex = Builder.CreateConstInBoundsGEP2_32(FrameTy, FramePtr, 0,
+ 0, "ResumeFn.addr");
+ auto *NullPtr = ConstantPointerNull::get(cast<PointerType>(
+ cast<PointerType>(GepIndex->getType())->getElementType()));
+ Builder.CreateStore(NullPtr, GepIndex);
+ } else {
+ auto *GepIndex = Builder.CreateConstInBoundsGEP2_32(
+ FrameTy, FramePtr, 0, coro::Shape::IndexField, "index.addr");
+ Builder.CreateStore(IndexVal, GepIndex);
+ }
+ Save->replaceAllUsesWith(ConstantTokenNone::get(C));
+ Save->eraseFromParent();
+
+ // Split block before and after coro.suspend and add a jump from an entry
+ // switch:
+ //
+ // whateverBB:
+ // whatever
+ // %0 = call i8 @llvm.coro.suspend(token none, i1 false)
+ // switch i8 %0, label %suspend[i8 0, label %resume
+ // i8 1, label %cleanup]
+ // becomes:
+ //
+ // whateverBB:
+ // whatever
+ // br label %resume.0.landing
+ //
+ // resume.0: ; <--- jump from the switch in the resume.entry
+ // %0 = tail call i8 @llvm.coro.suspend(token none, i1 false)
+ // br label %resume.0.landing
+ //
+ // resume.0.landing:
+ // %1 = phi i8[-1, %whateverBB], [%0, %resume.0]
+ // switch i8 % 1, label %suspend [i8 0, label %resume
+ // i8 1, label %cleanup]
+
+ auto *SuspendBB = S->getParent();
+ auto *ResumeBB =
+ SuspendBB->splitBasicBlock(S, "resume." + Twine(SuspendIndex));
+ auto *LandingBB = ResumeBB->splitBasicBlock(
+ S->getNextNode(), ResumeBB->getName() + Twine(".landing"));
+ Switch->addCase(IndexVal, ResumeBB);
+
+ cast<BranchInst>(SuspendBB->getTerminator())->setSuccessor(0, LandingBB);
+ auto *PN = PHINode::Create(Builder.getInt8Ty(), 2, "", &LandingBB->front());
+ S->replaceAllUsesWith(PN);
+ PN->addIncoming(Builder.getInt8(-1), SuspendBB);
+ PN->addIncoming(S, ResumeBB);
+
+ ++SuspendIndex;
+ }
+
+ Builder.SetInsertPoint(UnreachBB);
+ Builder.CreateUnreachable();
+
+ return NewEntry;
+}
+
+// In Resumers, we replace fallthrough coro.end with ret void and delete the
+// rest of the block.
+static void replaceFallthroughCoroEnd(IntrinsicInst *End,
+ ValueToValueMapTy &VMap) {
+ auto *NewE = cast<IntrinsicInst>(VMap[End]);
+ ReturnInst::Create(NewE->getContext(), nullptr, NewE);
+
+ // Remove the rest of the block, by splitting it into an unreachable block.
+ auto *BB = NewE->getParent();
+ BB->splitBasicBlock(NewE);
+ BB->getTerminator()->eraseFromParent();
+}
+
+// Rewrite final suspend point handling. We do not use suspend index to
+// represent the final suspend point. Instead we zero-out ResumeFnAddr in the
+// coroutine frame, since it is undefined behavior to resume a coroutine
+// suspended at the final suspend point. Thus, in the resume function, we can
+// simply remove the last case (when coro::Shape is built, the final suspend
+// point (if present) is always the last element of CoroSuspends array).
+// In the destroy function, we add a code sequence to check whether
+// ResumeFnAddr is null and, if so, jump to the appropriate label to handle
+// cleanup from the final suspend point.
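+//
+// The check inserted into the destroy clone looks roughly like this
+// (illustrative IR sketch only; value and label names are made up):
+//   %resume.addr = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 0
+//   %resume.fn = load void (%f.Frame*)*, void (%f.Frame*)** %resume.addr
+//   %is.final = icmp eq void (%f.Frame*)* %resume.fn, null
+//   br i1 %is.final, label %final.cleanup, label %Switch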
+static void handleFinalSuspend(IRBuilder<> &Builder, Value *FramePtr,
+ coro::Shape &Shape, SwitchInst *Switch,
+ bool IsDestroy) {
+ assert(Shape.HasFinalSuspend);
+ auto FinalCase = --Switch->case_end();
+ BasicBlock *ResumeBB = FinalCase.getCaseSuccessor();
+ Switch->removeCase(FinalCase);
+ if (IsDestroy) {
+ BasicBlock *OldSwitchBB = Switch->getParent();
+ auto *NewSwitchBB = OldSwitchBB->splitBasicBlock(Switch, "Switch");
+ Builder.SetInsertPoint(OldSwitchBB->getTerminator());
+ auto *GepIndex = Builder.CreateConstInBoundsGEP2_32(Shape.FrameTy, FramePtr,
+ 0, 0, "ResumeFn.addr");
+ auto *Load = Builder.CreateLoad(GepIndex);
+ auto *NullPtr =
+ ConstantPointerNull::get(cast<PointerType>(Load->getType()));
+ auto *Cond = Builder.CreateICmpEQ(Load, NullPtr);
+ Builder.CreateCondBr(Cond, ResumeBB, NewSwitchBB);
+ OldSwitchBB->getTerminator()->eraseFromParent();
+ }
+}
+
+// Create a resume clone by cloning the body of the original function, setting
+// a new entry block, and replacing coro.suspend with an appropriate value to
+// force control down the resume or the cleanup path at every suspend point.
+static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape,
+ BasicBlock *ResumeEntry, int8_t FnIndex) {
+ Module *M = F.getParent();
+ auto *FrameTy = Shape.FrameTy;
+ auto *FnPtrTy = cast<PointerType>(FrameTy->getElementType(0));
+ auto *FnTy = cast<FunctionType>(FnPtrTy->getElementType());
+
+ Function *NewF =
+ Function::Create(FnTy, GlobalValue::LinkageTypes::InternalLinkage,
+ F.getName() + Suffix, M);
+ NewF->addAttribute(1, Attribute::NonNull);
+ NewF->addAttribute(1, Attribute::NoAlias);
+
+ ValueToValueMapTy VMap;
+ // Replace all args with undefs. The buildCoroutineFrame algorithm has already
+ // rewritten accesses to the args that occur after suspend points into loads
+ // and stores to/from the coroutine frame.
+ for (Argument &A : F.getArgumentList())
+ VMap[&A] = UndefValue::get(A.getType());
+
+ SmallVector<ReturnInst *, 4> Returns;
+
+ if (DISubprogram *SP = F.getSubprogram()) {
+ // If we have debug info, add mappings for the metadata nodes that should
+ // not be cloned by CloneFunctionInto.
+ auto &MD = VMap.MD();
+ MD[SP->getUnit()].reset(SP->getUnit());
+ MD[SP->getType()].reset(SP->getType());
+ MD[SP->getFile()].reset(SP->getFile());
+ }
+ CloneFunctionInto(NewF, &F, VMap, /*ModuleLevelChanges=*/true, Returns);
+
+ // Remove old returns.
+ for (ReturnInst *Return : Returns)
+ changeToUnreachable(Return, /*UseLLVMTrap=*/false);
+
+ // Remove old return attributes.
+ NewF->removeAttributes(
+ AttributeSet::ReturnIndex,
+ AttributeSet::get(
+ NewF->getContext(), AttributeSet::ReturnIndex,
+ AttributeFuncs::typeIncompatible(NewF->getReturnType())));
+
+ // Make AllocaSpillBlock the new entry block.
+ auto *SwitchBB = cast<BasicBlock>(VMap[ResumeEntry]);
+ auto *Entry = cast<BasicBlock>(VMap[Shape.AllocaSpillBlock]);
+ Entry->moveBefore(&NewF->getEntryBlock());
+ Entry->getTerminator()->eraseFromParent();
+ BranchInst::Create(SwitchBB, Entry);
+ Entry->setName("entry" + Suffix);
+
+ // Clear all predecessors of the new entry block.
+ auto *Switch = cast<SwitchInst>(VMap[Shape.ResumeSwitch]);
+ Entry->replaceAllUsesWith(Switch->getDefaultDest());
+
+ IRBuilder<> Builder(&NewF->getEntryBlock().front());
+
+ // Remap frame pointer.
+ Argument *NewFramePtr = &NewF->getArgumentList().front();
+ Value *OldFramePtr = cast<Value>(VMap[Shape.FramePtr]);
+ NewFramePtr->takeName(OldFramePtr);
+ OldFramePtr->replaceAllUsesWith(NewFramePtr);
+
+ // Remap vFrame pointer.
+ auto *NewVFrame = Builder.CreateBitCast(
+ NewFramePtr, Type::getInt8PtrTy(Builder.getContext()), "vFrame");
+ Value *OldVFrame = cast<Value>(VMap[Shape.CoroBegin]);
+ OldVFrame->replaceAllUsesWith(NewVFrame);
+
+ // Rewrite final suspend handling, as it is not done via the switch (this
+ // allows removing the final case from the switch, since it is undefined
+ // behavior to resume a coroutine suspended at the final suspend point).
+ if (Shape.HasFinalSuspend) {
+ auto *Switch = cast<SwitchInst>(VMap[Shape.ResumeSwitch]);
+ bool IsDestroy = FnIndex != 0;
+ handleFinalSuspend(Builder, NewFramePtr, Shape, Switch, IsDestroy);
+ }
+
+ // Replace coro.suspend with the appropriate resume index.
+ // Replacing coro.suspend with (0) will result in control flow proceeding to
+ // the resume label associated with a suspend point; replacing it with (1)
+ // will result in control flow proceeding to the cleanup label associated
+ // with this suspend point.
+ auto *NewValue = Builder.getInt8(FnIndex ? 1 : 0);
+ for (CoroSuspendInst *CS : Shape.CoroSuspends) {
+ auto *MappedCS = cast<CoroSuspendInst>(VMap[CS]);
+ MappedCS->replaceAllUsesWith(NewValue);
+ MappedCS->eraseFromParent();
+ }
+
+ // Remove coro.end intrinsics.
+ replaceFallthroughCoroEnd(Shape.CoroEnds.front(), VMap);
+ // FIXME: coming in upcoming patches:
+ // replaceUnwindCoroEnds(Shape.CoroEnds, VMap);
+
+ // Eliminate coro.free from the clones, replacing it with 'null' in cleanup,
+ // to suppress deallocation code.
+ coro::replaceCoroFree(cast<CoroIdInst>(VMap[Shape.CoroBegin->getId()]),
+ /*Elide=*/FnIndex == 2);
+
+ NewF->setCallingConv(CallingConv::Fast);
+
+ return NewF;
+}
+
+static void removeCoroEnds(coro::Shape &Shape) {
+ for (CoroEndInst *CE : Shape.CoroEnds)
+ CE->eraseFromParent();
+}
+
+static void replaceFrameSize(coro::Shape &Shape) {
+ if (Shape.CoroSizes.empty())
+ return;
+
+ // In the same function all coro.sizes should have the same result type.
+ auto *SizeIntrin = Shape.CoroSizes.back();
+ Module *M = SizeIntrin->getModule();
+ const DataLayout &DL = M->getDataLayout();
+ auto Size = DL.getTypeAllocSize(Shape.FrameTy);
+ auto *SizeConstant = ConstantInt::get(SizeIntrin->getType(), Size);
+
+ for (CoroSizeInst *CS : Shape.CoroSizes) {
+ CS->replaceAllUsesWith(SizeConstant);
+ CS->eraseFromParent();
+ }
+}
+
+// Create a global constant array containing pointers to the provided functions
+// and set the Info parameter of CoroBegin to point at this constant. Example:
+//
+// @f.resumers = internal constant [2 x void(%f.frame*)*]
+// [void(%f.frame*)* @f.resume, void(%f.frame*)* @f.destroy]
+// define void @f() {
+// ...
+// call i8* @llvm.coro.begin(i8* null, i32 0, i8* null,
+// i8* bitcast([2 x void(%f.frame*)*] * @f.resumers to i8*))
+//
+// Assumes that all the functions have the same signature.
+static void setCoroInfo(Function &F, CoroBeginInst *CoroBegin,
+ std::initializer_list<Function *> Fns) {
+
+ SmallVector<Constant *, 4> Args(Fns.begin(), Fns.end());
+ assert(!Args.empty());
+ Function *Part = *Fns.begin();
+ Module *M = Part->getParent();
+ auto *ArrTy = ArrayType::get(Part->getType(), Args.size());
+
+ auto *ConstVal = ConstantArray::get(ArrTy, Args);
+ auto *GV = new GlobalVariable(*M, ConstVal->getType(), /*isConstant=*/true,
+ GlobalVariable::PrivateLinkage, ConstVal,
+ F.getName() + Twine(".resumers"));
+
+ // Update coro.begin instruction to refer to this constant.
+ LLVMContext &C = F.getContext();
+ auto *BC = ConstantExpr::getPointerCast(GV, Type::getInt8PtrTy(C));
+ CoroBegin->getId()->setInfo(BC);
+}
+
+// Store addresses of Resume/Destroy/Cleanup functions in the coroutine frame.
+static void updateCoroFrame(coro::Shape &Shape, Function *ResumeFn,
+ Function *DestroyFn, Function *CleanupFn) {
+
+ IRBuilder<> Builder(Shape.FramePtr->getNextNode());
+ auto *ResumeAddr = Builder.CreateConstInBoundsGEP2_32(
+ Shape.FrameTy, Shape.FramePtr, 0, coro::Shape::ResumeField,
+ "resume.addr");
+ Builder.CreateStore(ResumeFn, ResumeAddr);
+
+ Value *DestroyOrCleanupFn = DestroyFn;
+
+ CoroIdInst *CoroId = Shape.CoroBegin->getId();
+ if (CoroAllocInst *CA = CoroId->getCoroAlloc()) {
+ // If there is a CoroAlloc and it returns false (meaning we elide the
+ // allocation), use CleanupFn instead of DestroyFn.
+ DestroyOrCleanupFn = Builder.CreateSelect(CA, DestroyFn, CleanupFn);
+ }
+
+ auto *DestroyAddr = Builder.CreateConstInBoundsGEP2_32(
+ Shape.FrameTy, Shape.FramePtr, 0, coro::Shape::DestroyField,
+ "destroy.addr");
+ Builder.CreateStore(DestroyOrCleanupFn, DestroyAddr);
+}
+
+static void postSplitCleanup(Function &F) {
+ removeUnreachableBlocks(F);
+ llvm::legacy::FunctionPassManager FPM(F.getParent());
+
+ FPM.add(createVerifierPass());
+ FPM.add(createSCCPPass());
+ FPM.add(createCFGSimplificationPass());
+ FPM.add(createEarlyCSEPass());
+ FPM.add(createCFGSimplificationPass());
+
+ FPM.doInitialization();
+ FPM.run(F);
+ FPM.doFinalization();
+}
+
+// Coroutine has no suspend points. Remove heap allocation for the coroutine
+// frame if possible.
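+//
+// Roughly (illustrative sketch only): when coro.alloc is present, the dynamic
+// allocation it guards is replaced with a stack slot,
+//   %Frame = alloca %f.Frame
+//   %vFrame = bitcast %f.Frame* %Frame to i8*
+// coro.alloc is folded to 'false', and coro.begin is replaced by %vFrame.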
+static void handleNoSuspendCoroutine(CoroBeginInst *CoroBegin, Type *FrameTy) {
+ auto *CoroId = CoroBegin->getId();
+ auto *AllocInst = CoroId->getCoroAlloc();
+ coro::replaceCoroFree(CoroId, /*Elide=*/AllocInst != nullptr);
+ if (AllocInst) {
+ IRBuilder<> Builder(AllocInst);
+ // FIXME: Need to handle overaligned members.
+ auto *Frame = Builder.CreateAlloca(FrameTy);
+ auto *VFrame = Builder.CreateBitCast(Frame, Builder.getInt8PtrTy());
+ AllocInst->replaceAllUsesWith(Builder.getFalse());
+ AllocInst->eraseFromParent();
+ CoroBegin->replaceAllUsesWith(VFrame);
+ } else {
+ CoroBegin->replaceAllUsesWith(CoroBegin->getMem());
+ }
+ CoroBegin->eraseFromParent();
+}
+
+// Look for a very simple pattern:
+// coro.save
+// no other calls
+// resume or destroy call
+// coro.suspend
+//
+// If there are other calls between coro.save and coro.suspend, they can
+// potentially resume or destroy the coroutine, so it is unsafe to eliminate a
+// suspend point.
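+//
+// A sketch of the pattern in IR (illustrative only; value names are made up):
+//   %save = call token @llvm.coro.save(i8* %hdl)
+//   %addr = call i8* @llvm.coro.subfn.addr(i8* %hdl, i8 0)
+//   %fn = bitcast i8* %addr to void (i8*)*
+//   call void %fn(i8* %hdl)
+//   %suspend = call i8 @llvm.coro.suspend(token %save, i1 false)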
+static bool simplifySuspendPoint(CoroSuspendInst *Suspend,
+ CoroBeginInst *CoroBegin) {
+ auto *Save = Suspend->getCoroSave();
+ auto *BB = Suspend->getParent();
+ if (BB != Save->getParent())
+ return false;
+
+ CallSite SingleCallSite;
+
+ // Check that we have only one CallSite.
+ for (Instruction *I = Save->getNextNode(); I != Suspend;
+ I = I->getNextNode()) {
+ if (isa<CoroFrameInst>(I))
+ continue;
+ if (isa<CoroSubFnInst>(I))
+ continue;
+ if (CallSite CS = CallSite(I)) {
+ if (SingleCallSite)
+ return false;
+ else
+ SingleCallSite = CS;
+ }
+ }
+ auto *CallInstr = SingleCallSite.getInstruction();
+ if (!CallInstr)
+ return false;
+
+ auto *Callee = SingleCallSite.getCalledValue()->stripPointerCasts();
+
+ // See if the callsite is for resumption or destruction of the coroutine.
+ auto *SubFn = dyn_cast<CoroSubFnInst>(Callee);
+ if (!SubFn)
+ return false;
+
+ // If it does not refer to the current coroutine, we cannot do anything with
+ // it.
+ if (SubFn->getFrame() != CoroBegin)
+ return false;
+
+ // Replace llvm.coro.suspend with the value that steers control down the
+ // resume or the cleanup path.
+ Suspend->replaceAllUsesWith(SubFn->getRawIndex());
+ Suspend->eraseFromParent();
+ Save->eraseFromParent();
+
+ // No longer need a call to coro.resume or coro.destroy.
+ CallInstr->eraseFromParent();
+
+ if (SubFn->user_empty())
+ SubFn->eraseFromParent();
+
+ return true;
+}
+
+// Remove suspend points that are simplified.
+static void simplifySuspendPoints(coro::Shape &Shape) {
+ auto &S = Shape.CoroSuspends;
+ size_t I = 0, N = S.size();
+ if (N == 0)
+ return;
+ for (;;) {
+ if (simplifySuspendPoint(S[I], Shape.CoroBegin)) {
+ if (--N == I)
+ break;
+ std::swap(S[I], S[N]);
+ continue;
+ }
+ if (++I == N)
+ break;
+ }
+ S.resize(N);
+}
+
+static void splitCoroutine(Function &F, CallGraph &CG, CallGraphSCC &SCC) {
+ coro::Shape Shape(F);
+ if (!Shape.CoroBegin)
+ return;
+
+ simplifySuspendPoints(Shape);
+ buildCoroutineFrame(F, Shape);
+ replaceFrameSize(Shape);
+
+ // If there are no suspend points, no split is required; just remove
+ // the allocation and deallocation blocks, as they are not needed.
+ if (Shape.CoroSuspends.empty()) {
+ handleNoSuspendCoroutine(Shape.CoroBegin, Shape.FrameTy);
+ removeCoroEnds(Shape);
+ postSplitCleanup(F);
+ coro::updateCallGraph(F, {}, CG, SCC);
+ return;
+ }
+
+ auto *ResumeEntry = createResumeEntryBlock(F, Shape);
+ auto ResumeClone = createClone(F, ".resume", Shape, ResumeEntry, 0);
+ auto DestroyClone = createClone(F, ".destroy", Shape, ResumeEntry, 1);
+ auto CleanupClone = createClone(F, ".cleanup", Shape, ResumeEntry, 2);
+
+ // We no longer need coro.end in F.
+ removeCoroEnds(Shape);
+
+ postSplitCleanup(F);
+ postSplitCleanup(*ResumeClone);
+ postSplitCleanup(*DestroyClone);
+ postSplitCleanup(*CleanupClone);
+
+ // Store the addresses of the resume/destroy/cleanup functions in the
+ // coroutine frame.
+ updateCoroFrame(Shape, ResumeClone, DestroyClone, CleanupClone);
+
+ // Create a constant array referring to the resume/destroy/cleanup functions,
+ // pointed to by the Info argument of llvm.coro.id, so that the CoroElide pass
+ // can determine the correct function to call.
+ setCoroInfo(F, Shape.CoroBegin, {ResumeClone, DestroyClone, CleanupClone});
+
+ // Update call graph and add the functions we created to the SCC.
+ coro::updateCallGraph(F, {ResumeClone, DestroyClone, CleanupClone}, CG, SCC);
+}
+
+// When we see the coroutine for the first time, we insert an indirect call to
+// a devirt trigger function and mark the coroutine as now being ready for
+// split.
+static void prepareForSplit(Function &F, CallGraph &CG) {
+ Module &M = *F.getParent();
+#ifndef NDEBUG
+ Function *DevirtFn = M.getFunction(CORO_DEVIRT_TRIGGER_FN);
+ assert(DevirtFn && "coro.devirt.trigger function not found");
+#endif
+
+ F.addFnAttr(CORO_PRESPLIT_ATTR, PREPARED_FOR_SPLIT);
+
+ // Insert an indirect call sequence that will be devirtualized by CoroElide
+ // pass:
+ // %0 = call i8* @llvm.coro.subfn.addr(i8* null, i8 -1)
+ // %1 = bitcast i8* %0 to void(i8*)*
+ // call void %1(i8* null)
+ coro::LowererBase Lowerer(M);
+ Instruction *InsertPt = F.getEntryBlock().getTerminator();
+ auto *Null = ConstantPointerNull::get(Type::getInt8PtrTy(F.getContext()));
+ auto *DevirtFnAddr =
+ Lowerer.makeSubFnCall(Null, CoroSubFnInst::RestartTrigger, InsertPt);
+ auto *IndirectCall = CallInst::Create(DevirtFnAddr, Null, "", InsertPt);
+
+ // Update the call graph with the indirect call we just added.
+ CG[&F]->addCalledFunction(IndirectCall, CG.getCallsExternalNode());
+}
+
+// Make sure that there is a devirtualization trigger function that the
+// CoroSplit pass uses to force a restart of the CGSCC pipeline. If the devirt
+// trigger function is not found, we will create one and add it to the current
+// SCC.
+static void createDevirtTriggerFunc(CallGraph &CG, CallGraphSCC &SCC) {
+ Module &M = CG.getModule();
+ if (M.getFunction(CORO_DEVIRT_TRIGGER_FN))
+ return;
+
+ LLVMContext &C = M.getContext();
+ auto *FnTy = FunctionType::get(Type::getVoidTy(C), Type::getInt8PtrTy(C),
+ /*IsVarArgs=*/false);
+ Function *DevirtFn =
+ Function::Create(FnTy, GlobalValue::LinkageTypes::PrivateLinkage,
+ CORO_DEVIRT_TRIGGER_FN, &M);
+ DevirtFn->addFnAttr(Attribute::AlwaysInline);
+ auto *Entry = BasicBlock::Create(C, "entry", DevirtFn);
+ ReturnInst::Create(C, Entry);
+
+ auto *Node = CG.getOrInsertFunction(DevirtFn);
+
+ SmallVector<CallGraphNode *, 8> Nodes(SCC.begin(), SCC.end());
+ Nodes.push_back(Node);
+ SCC.initialize(Nodes);
+}
+
+//===----------------------------------------------------------------------===//
+// Top Level Driver
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+struct CoroSplit : public CallGraphSCCPass {
+ static char ID; // Pass identification, replacement for typeid
+ CoroSplit() : CallGraphSCCPass(ID) {}
+
+ bool Run = false;
+
+ // A coroutine is identified by the presence of the coro.begin intrinsic; if
+ // we don't have any, this pass has nothing to do.
+ bool doInitialization(CallGraph &CG) override {
+ Run = coro::declaresIntrinsics(CG.getModule(), {"llvm.coro.begin"});
+ return CallGraphSCCPass::doInitialization(CG);
+ }
+
+ bool runOnSCC(CallGraphSCC &SCC) override {
+ if (!Run)
+ return false;
+
+ // Find coroutines for processing.
+ SmallVector<Function *, 4> Coroutines;
+ for (CallGraphNode *CGN : SCC)
+ if (auto *F = CGN->getFunction())
+ if (F->hasFnAttribute(CORO_PRESPLIT_ATTR))
+ Coroutines.push_back(F);
+
+ if (Coroutines.empty())
+ return false;
+
+ CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+ createDevirtTriggerFunc(CG, SCC);
+
+ for (Function *F : Coroutines) {
+ Attribute Attr = F->getFnAttribute(CORO_PRESPLIT_ATTR);
+ StringRef Value = Attr.getValueAsString();
+ DEBUG(dbgs() << "CoroSplit: Processing coroutine '" << F->getName()
+ << "' state: " << Value << "\n");
+ if (Value == UNPREPARED_FOR_SPLIT) {
+ prepareForSplit(*F, CG);
+ continue;
+ }
+ F->removeFnAttr(CORO_PRESPLIT_ATTR);
+ splitCoroutine(*F, CG, SCC);
+ }
+ return true;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ CallGraphSCCPass::getAnalysisUsage(AU);
+ }
+};
+}
+
+char CoroSplit::ID = 0;
+INITIALIZE_PASS(
+ CoroSplit, "coro-split",
+ "Split coroutine into a set of functions driving its state machine", false,
+ false)
+
+Pass *llvm::createCoroSplitPass() { return new CoroSplit(); }
diff --git a/contrib/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/contrib/llvm/lib/Transforms/Coroutines/Coroutines.cpp
new file mode 100644
index 0000000..877ec34
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Coroutines/Coroutines.cpp
@@ -0,0 +1,314 @@
+//===-- Coroutines.cpp ----------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file implements the common infrastructure for Coroutine Passes.
+//===----------------------------------------------------------------------===//
+
+#include "CoroInternal.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+
+void llvm::initializeCoroutines(PassRegistry &Registry) {
+ initializeCoroEarlyPass(Registry);
+ initializeCoroSplitPass(Registry);
+ initializeCoroElidePass(Registry);
+ initializeCoroCleanupPass(Registry);
+}
+
+static void addCoroutineOpt0Passes(const PassManagerBuilder &Builder,
+ legacy::PassManagerBase &PM) {
+ PM.add(createCoroSplitPass());
+ PM.add(createCoroElidePass());
+
+ PM.add(createBarrierNoopPass());
+ PM.add(createCoroCleanupPass());
+}
+
+static void addCoroutineEarlyPasses(const PassManagerBuilder &Builder,
+ legacy::PassManagerBase &PM) {
+ PM.add(createCoroEarlyPass());
+}
+
+static void addCoroutineScalarOptimizerPasses(const PassManagerBuilder &Builder,
+ legacy::PassManagerBase &PM) {
+ PM.add(createCoroElidePass());
+}
+
+static void addCoroutineSCCPasses(const PassManagerBuilder &Builder,
+ legacy::PassManagerBase &PM) {
+ PM.add(createCoroSplitPass());
+}
+
+static void addCoroutineOptimizerLastPasses(const PassManagerBuilder &Builder,
+ legacy::PassManagerBase &PM) {
+ PM.add(createCoroCleanupPass());
+}
+
+void llvm::addCoroutinePassesToExtensionPoints(PassManagerBuilder &Builder) {
+ Builder.addExtension(PassManagerBuilder::EP_EarlyAsPossible,
+ addCoroutineEarlyPasses);
+ Builder.addExtension(PassManagerBuilder::EP_EnabledOnOptLevel0,
+ addCoroutineOpt0Passes);
+ Builder.addExtension(PassManagerBuilder::EP_CGSCCOptimizerLate,
+ addCoroutineSCCPasses);
+ Builder.addExtension(PassManagerBuilder::EP_ScalarOptimizerLate,
+ addCoroutineScalarOptimizerPasses);
+ Builder.addExtension(PassManagerBuilder::EP_OptimizerLast,
+ addCoroutineOptimizerLastPasses);
+}
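+
+// A frontend driving the legacy pass manager would typically register these
+// extensions once while configuring its PassManagerBuilder, roughly as in the
+// sketch below (illustrative only):
+//
+//   PassManagerBuilder PMB;
+//   addCoroutinePassesToExtensionPoints(PMB);
+//   legacy::PassManager PM;
+//   PMB.populateModulePassManager(PM);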
+
+// Construct the lowerer base class and initialize its members.
+coro::LowererBase::LowererBase(Module &M)
+ : TheModule(M), Context(M.getContext()),
+ Int8Ptr(Type::getInt8PtrTy(Context)),
+ ResumeFnType(FunctionType::get(Type::getVoidTy(Context), Int8Ptr,
+ /*isVarArg=*/false)),
+ NullPtr(ConstantPointerNull::get(Int8Ptr)) {}
+
+// Creates a sequence of instructions to obtain a resume function address using
+// llvm.coro.subfn.addr. It generates the following sequence:
+//
+// call i8* @llvm.coro.subfn.addr(i8* %Arg, i8 %index)
+// bitcast i8* %2 to void(i8*)*
+
+Value *coro::LowererBase::makeSubFnCall(Value *Arg, int Index,
+ Instruction *InsertPt) {
+ auto *IndexVal = ConstantInt::get(Type::getInt8Ty(Context), Index);
+ auto *Fn = Intrinsic::getDeclaration(&TheModule, Intrinsic::coro_subfn_addr);
+
+ assert(Index >= CoroSubFnInst::IndexFirst &&
+ Index < CoroSubFnInst::IndexLast &&
+ "makeSubFnCall: Index value out of range");
+ auto *Call = CallInst::Create(Fn, {Arg, IndexVal}, "", InsertPt);
+
+ auto *Bitcast =
+ new BitCastInst(Call, ResumeFnType->getPointerTo(), "", InsertPt);
+ return Bitcast;
+}
+
+#ifndef NDEBUG
+static bool isCoroutineIntrinsicName(StringRef Name) {
+ // NOTE: Must be sorted!
+ static const char *const CoroIntrinsics[] = {
+ "llvm.coro.alloc", "llvm.coro.begin", "llvm.coro.destroy",
+ "llvm.coro.done", "llvm.coro.end", "llvm.coro.frame",
+ "llvm.coro.free", "llvm.coro.id", "llvm.coro.param",
+ "llvm.coro.promise", "llvm.coro.resume", "llvm.coro.save",
+ "llvm.coro.size", "llvm.coro.subfn.addr", "llvm.coro.suspend",
+ };
+ return Intrinsic::lookupLLVMIntrinsicByName(CoroIntrinsics, Name) != -1;
+}
+#endif
+
+// Verifies whether a module declares any of the listed named values. In debug
+// mode, also verifies that the names are coroutine intrinsic names.
+bool coro::declaresIntrinsics(Module &M,
+ std::initializer_list<StringRef> List) {
+
+ for (StringRef Name : List) {
+ assert(isCoroutineIntrinsicName(Name) && "not a coroutine intrinsic");
+ if (M.getNamedValue(Name))
+ return true;
+ }
+
+ return false;
+}
+
+// Replace all coro.frees associated with the provided CoroId either with
+// 'null', if Elide is true, or with their frame parameter otherwise.
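+//
+// For example (illustrative only; names are made up), with Elide == true:
+//   %mem = call i8* @llvm.coro.free(token %id, i8* %frame)
+// is replaced by a null i8* constant, which lets subsequent optimizations
+// remove the deallocation code guarded by it.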
+void coro::replaceCoroFree(CoroIdInst *CoroId, bool Elide) {
+ SmallVector<CoroFreeInst *, 4> CoroFrees;
+ for (User *U : CoroId->users())
+ if (auto CF = dyn_cast<CoroFreeInst>(U))
+ CoroFrees.push_back(CF);
+
+ if (CoroFrees.empty())
+ return;
+
+ Value *Replacement =
+ Elide ? ConstantPointerNull::get(Type::getInt8PtrTy(CoroId->getContext()))
+ : CoroFrees.front()->getFrame();
+
+ for (CoroFreeInst *CF : CoroFrees) {
+ CF->replaceAllUsesWith(Replacement);
+ CF->eraseFromParent();
+ }
+}
+
+// FIXME: This code is stolen from CallGraph::addToCallGraph(Function *F), which
+// happens to be private. It would be better for this functionality to be
+// exposed by CallGraph.
+static void buildCGN(CallGraph &CG, CallGraphNode *Node) {
+ Function *F = Node->getFunction();
+
+ // Look for calls by this function.
+ for (Instruction &I : instructions(F))
+ if (CallSite CS = CallSite(cast<Value>(&I))) {
+ const Function *Callee = CS.getCalledFunction();
+ if (!Callee || !Intrinsic::isLeaf(Callee->getIntrinsicID()))
+ // Indirect calls of intrinsics are not allowed so no need to check.
+ // We can be more precise here by using TargetArg returned by
+ // Intrinsic::isLeaf.
+ Node->addCalledFunction(CS, CG.getCallsExternalNode());
+ else if (!Callee->isIntrinsic())
+ Node->addCalledFunction(CS, CG.getOrInsertFunction(Callee));
+ }
+}
+
+// Rebuild CGN after we extracted parts of the code from ParentFunc into
+// NewFuncs. Builds CGNs for the NewFuncs and adds them to the current SCC.
+void coro::updateCallGraph(Function &ParentFunc, ArrayRef<Function *> NewFuncs,
+ CallGraph &CG, CallGraphSCC &SCC) {
+ // Rebuild CGN from scratch for the ParentFunc
+ auto *ParentNode = CG[&ParentFunc];
+ ParentNode->removeAllCalledFunctions();
+ buildCGN(CG, ParentNode);
+
+ SmallVector<CallGraphNode *, 8> Nodes(SCC.begin(), SCC.end());
+
+ for (Function *F : NewFuncs) {
+ CallGraphNode *Callee = CG.getOrInsertFunction(F);
+ Nodes.push_back(Callee);
+ buildCGN(CG, Callee);
+ }
+
+ SCC.initialize(Nodes);
+}
+
+static void clear(coro::Shape &Shape) {
+ Shape.CoroBegin = nullptr;
+ Shape.CoroEnds.clear();
+ Shape.CoroSizes.clear();
+ Shape.CoroSuspends.clear();
+
+ Shape.FrameTy = nullptr;
+ Shape.FramePtr = nullptr;
+ Shape.AllocaSpillBlock = nullptr;
+ Shape.ResumeSwitch = nullptr;
+ Shape.PromiseAlloca = nullptr;
+ Shape.HasFinalSuspend = false;
+}
+
+static CoroSaveInst *createCoroSave(CoroBeginInst *CoroBegin,
+ CoroSuspendInst *SuspendInst) {
+ Module *M = SuspendInst->getModule();
+ auto *Fn = Intrinsic::getDeclaration(M, Intrinsic::coro_save);
+ auto *SaveInst =
+ cast<CoroSaveInst>(CallInst::Create(Fn, CoroBegin, "", SuspendInst));
+ assert(!SuspendInst->getCoroSave());
+ SuspendInst->setArgOperand(0, SaveInst);
+ return SaveInst;
+}
+
+// Collect "interesting" coroutine intrinsics.
+void coro::Shape::buildFrom(Function &F) {
+ size_t FinalSuspendIndex = 0;
+ clear(*this);
+ SmallVector<CoroFrameInst *, 8> CoroFrames;
+ for (Instruction &I : instructions(F)) {
+ if (auto II = dyn_cast<IntrinsicInst>(&I)) {
+ switch (II->getIntrinsicID()) {
+ default:
+ continue;
+ case Intrinsic::coro_size:
+ CoroSizes.push_back(cast<CoroSizeInst>(II));
+ break;
+ case Intrinsic::coro_frame:
+ CoroFrames.push_back(cast<CoroFrameInst>(II));
+ break;
+ case Intrinsic::coro_suspend:
+ CoroSuspends.push_back(cast<CoroSuspendInst>(II));
+ if (CoroSuspends.back()->isFinal()) {
+ if (HasFinalSuspend)
+ report_fatal_error(
+ "Only one suspend point can be marked as final");
+ HasFinalSuspend = true;
+ FinalSuspendIndex = CoroSuspends.size() - 1;
+ }
+ break;
+ case Intrinsic::coro_begin: {
+ auto CB = cast<CoroBeginInst>(II);
+ if (CB->getId()->getInfo().isPreSplit()) {
+ if (CoroBegin)
+ report_fatal_error(
+ "coroutine should have exactly one defining @llvm.coro.begin");
+ CB->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
+ CB->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias);
+ CB->removeAttribute(AttributeSet::FunctionIndex,
+ Attribute::NoDuplicate);
+ CoroBegin = CB;
+ }
+ break;
+ }
+ case Intrinsic::coro_end:
+ CoroEnds.push_back(cast<CoroEndInst>(II));
+ if (CoroEnds.back()->isFallthrough()) {
+ // Make sure that the fallthrough coro.end is the first element in the
+ // CoroEnds vector.
+ if (CoroEnds.size() > 1) {
+ if (CoroEnds.front()->isFallthrough())
+ report_fatal_error(
+ "Only one coro.end can be marked as fallthrough");
+ std::swap(CoroEnds.front(), CoroEnds.back());
+ }
+ }
+ break;
+ }
+ }
+ }
+
+ // If, for some reason, we were not able to find coro.begin, bail out.
+ if (!CoroBegin) {
+ // Replace coro.frame instructions, which are supposed to be lowered to the
+ // result of coro.begin, with undef.
+ auto *Undef = UndefValue::get(Type::getInt8PtrTy(F.getContext()));
+ for (CoroFrameInst *CF : CoroFrames) {
+ CF->replaceAllUsesWith(Undef);
+ CF->eraseFromParent();
+ }
+
+ // Replace all coro.suspend with undef and remove related coro.saves if
+ // present.
+ for (CoroSuspendInst *CS : CoroSuspends) {
+ CS->replaceAllUsesWith(UndefValue::get(CS->getType()));
+ CS->eraseFromParent();
+ if (auto *CoroSave = CS->getCoroSave())
+ CoroSave->eraseFromParent();
+ }
+
+ // Replace all coro.ends with unreachable instruction.
+ for (CoroEndInst *CE : CoroEnds)
+ changeToUnreachable(CE, /*UseLLVMTrap=*/false);
+
+ return;
+ }
+
+ // The coro.frame intrinsic is always lowered to the result of coro.begin.
+ for (CoroFrameInst *CF : CoroFrames) {
+ CF->replaceAllUsesWith(CoroBegin);
+ CF->eraseFromParent();
+ }
+
+ // Canonicalize coro.suspend by inserting a coro.save if needed.
+ for (CoroSuspendInst *CS : CoroSuspends)
+ if (!CS->getCoroSave())
+ createCoroSave(CoroBegin, CS);
+
+ // Move final suspend to be the last element in the CoroSuspends vector.
+ if (HasFinalSuspend &&
+ FinalSuspendIndex != CoroSuspends.size() - 1)
+ std::swap(CoroSuspends[FinalSuspendIndex], CoroSuspends.back());
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/AlwaysInliner.cpp b/contrib/llvm/lib/Transforms/IPO/AlwaysInliner.cpp
new file mode 100644
index 0000000..b7d9600
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/AlwaysInliner.cpp
@@ -0,0 +1,158 @@
+//===- InlineAlways.cpp - Code to inline always_inline functions ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a custom inliner that handles only functions that
+// are marked as "always inline".
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/AlwaysInliner.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/Inliner.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "inline"
+
+PreservedAnalyses AlwaysInlinerPass::run(Module &M, ModuleAnalysisManager &) {
+ InlineFunctionInfo IFI;
+ SmallSetVector<CallSite, 16> Calls;
+ bool Changed = false;
+ SmallVector<Function *, 16> InlinedFunctions;
+ for (Function &F : M)
+ if (!F.isDeclaration() && F.hasFnAttribute(Attribute::AlwaysInline) &&
+ isInlineViable(F)) {
+ Calls.clear();
+
+ for (User *U : F.users())
+ if (auto CS = CallSite(U))
+ if (CS.getCalledFunction() == &F)
+ Calls.insert(CS);
+
+ for (CallSite CS : Calls)
+ // FIXME: We really shouldn't be able to fail to inline at this point!
+ // We should do something to log or check the inline failures here.
+ Changed |= InlineFunction(CS, IFI);
+
+ // Remember to try and delete this function afterward. This both avoids
+ // re-walking the rest of the module and avoids dealing with any iterator
+ // invalidation issues while deleting functions.
+ InlinedFunctions.push_back(&F);
+ }
+
+ // Drop any functions that are still live from the list of candidates.
+ erase_if(InlinedFunctions, [&](Function *F) {
+ F->removeDeadConstantUsers();
+ return !F->isDefTriviallyDead();
+ });
+
+ // Delete the non-comdat ones from the module and also from our vector.
+ auto NonComdatBegin = partition(
+ InlinedFunctions, [&](Function *F) { return F->hasComdat(); });
+ for (Function *F : make_range(NonComdatBegin, InlinedFunctions.end()))
+ M.getFunctionList().erase(F);
+ InlinedFunctions.erase(NonComdatBegin, InlinedFunctions.end());
+
+ if (!InlinedFunctions.empty()) {
+ // Now we just have the comdat functions. Filter out the ones whose comdats
+ // are not actually dead.
+ filterDeadComdatFunctions(M, InlinedFunctions);
+ // The remaining functions are actually dead.
+ for (Function *F : InlinedFunctions)
+ M.getFunctionList().erase(F);
+ }
+
+ return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
+
+namespace {
+
+/// Inliner pass which only handles "always inline" functions.
+///
+/// Unlike the \c AlwaysInlinerPass, this uses the more heavyweight
+/// \c LegacyInlinerBase base class to provide several facilities such as
+/// array alloca merging.
+class AlwaysInlinerLegacyPass : public LegacyInlinerBase {
+
+public:
+ AlwaysInlinerLegacyPass() : LegacyInlinerBase(ID, /*InsertLifetime*/ true) {
+ initializeAlwaysInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ AlwaysInlinerLegacyPass(bool InsertLifetime)
+ : LegacyInlinerBase(ID, InsertLifetime) {
+ initializeAlwaysInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ /// Main run interface method. We override here to avoid calling skipSCC().
+ bool runOnSCC(CallGraphSCC &SCC) override { return inlineCalls(SCC); }
+
+ static char ID; // Pass identification, replacement for typeid
+
+ InlineCost getInlineCost(CallSite CS) override;
+
+ using llvm::Pass::doFinalization;
+ bool doFinalization(CallGraph &CG) override {
+ return removeDeadFunctions(CG, /*AlwaysInlineOnly=*/true);
+ }
+};
+}
+
+char AlwaysInlinerLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(AlwaysInlinerLegacyPass, "always-inline",
+ "Inliner for always_inline functions", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(AlwaysInlinerLegacyPass, "always-inline",
+ "Inliner for always_inline functions", false, false)
+
+Pass *llvm::createAlwaysInlinerLegacyPass(bool InsertLifetime) {
+ return new AlwaysInlinerLegacyPass(InsertLifetime);
+}
+
+/// \brief Get the inline cost for the always-inliner.
+///
+/// The always inliner *only* handles functions which are marked with the
+/// attribute to force inlining. As such, it is dramatically simpler and avoids
+/// using the powerful (but expensive) inline cost analysis. Instead it uses
+/// a very simple and boring direct walk of the instructions looking for
+/// impossible-to-inline constructs.
+///
+/// Note, it would be possible to go to some lengths to cache the information
+/// computed here, but as we only expect to do this for relatively few and
+/// small functions which have the explicit attribute to force inlining, it is
+/// likely not worth it in practice.
+InlineCost AlwaysInlinerLegacyPass::getInlineCost(CallSite CS) {
+ Function *Callee = CS.getCalledFunction();
+
+ // Only inline direct calls to functions with always-inline attributes
+ // that are viable for inlining. FIXME: We shouldn't even get here for
+ // declarations.
+ if (Callee && !Callee->isDeclaration() &&
+ CS.hasFnAttr(Attribute::AlwaysInline) && isInlineViable(*Callee))
+ return InlineCost::getAlways();
+
+ return InlineCost::getNever();
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
index 0716a3a..65b7bad 100644
--- a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -40,7 +40,6 @@
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
@@ -177,8 +176,7 @@ static bool isDenselyPacked(Type *type, const DataLayout &DL) {
// For homogenous sequential types, check for padding within members.
if (SequentialType *seqTy = dyn_cast<SequentialType>(type))
- return isa<PointerType>(seqTy) ||
- isDenselyPacked(seqTy->getElementType(), DL);
+ return isDenselyPacked(seqTy->getElementType(), DL);
// Check for padding within and between elements of a struct.
StructType *StructTy = cast<StructType>(type);
@@ -375,8 +373,8 @@ static bool AllCallersPassInValidPointerForArgument(Argument *Arg) {
unsigned ArgNo = Arg->getArgNo();
- // Look at all call sites of the function. At this pointer we know we only
- // have direct callees.
+ // Look at all call sites of the function. At this point we know we only have
+ // direct callees.
for (User *U : Callee->users()) {
CallSite CS(U);
assert(CS && "Should only have direct calls!");
@@ -600,7 +598,7 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
// Because there could be several/many load instructions, remember which
// blocks we know to be transparent to the load.
- SmallPtrSet<BasicBlock*, 16> TranspBlocks;
+ df_iterator_default_set<BasicBlock*, 16> TranspBlocks;
for (LoadInst *Load : Loads) {
// Check to see if the load is invalidated from the start of the block to
@@ -836,7 +834,10 @@ DoPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
Type::getInt64Ty(F->getContext()));
Ops.push_back(ConstantInt::get(IdxTy, II));
// Keep track of the type we're currently indexing.
- ElTy = cast<CompositeType>(ElTy)->getTypeAtIndex(II);
+ if (auto *ElPTy = dyn_cast<PointerType>(ElTy))
+ ElTy = ElPTy->getElementType();
+ else
+ ElTy = cast<CompositeType>(ElTy)->getTypeAtIndex(II);
}
// And create a GEP to extract those indices.
V = GetElementPtrInst::Create(ArgIndex.first, V, Ops,
@@ -886,8 +887,8 @@ DoPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
cast<CallInst>(New)->setCallingConv(CS.getCallingConv());
cast<CallInst>(New)->setAttributes(AttributeSet::get(New->getContext(),
AttributesVec));
- if (cast<CallInst>(Call)->isTailCall())
- cast<CallInst>(New)->setTailCall();
+ cast<CallInst>(New)->setTailCallKind(
+ cast<CallInst>(Call)->getTailCallKind());
}
New->setDebugLoc(Call->getDebugLoc());
Args.clear();
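
The tail-call hunk above (and its twin in DeadArgumentElimination below) follows one pattern: when a call is rewritten, copy the whole tail call kind rather than only the boolean flag, so musttail and notail markers survive. A minimal sketch with hypothetical OldCall/NewCall values:

    // Preserve none/tail/musttail/notail across the call rewrite.
    if (auto *OldCI = dyn_cast<CallInst>(OldCall))
      cast<CallInst>(NewCall)->setTailCallKind(OldCI->getTailCallKind());
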
diff --git a/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp b/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp
index 58731ea..ba2e60d 100644
--- a/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp
@@ -155,7 +155,7 @@ bool CrossDSOCFI::runOnModule(Module &M) {
return true;
}
-PreservedAnalyses CrossDSOCFIPass::run(Module &M, AnalysisManager<Module> &AM) {
+PreservedAnalyses CrossDSOCFIPass::run(Module &M, ModuleAnalysisManager &AM) {
CrossDSOCFI Impl;
bool Changed = Impl.runOnModule(M);
if (!Changed)
diff --git a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
index c8c895b..1a5ed46 100644
--- a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -190,8 +190,8 @@ bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) {
New = CallInst::Create(NF, Args, OpBundles, "", Call);
cast<CallInst>(New)->setCallingConv(CS.getCallingConv());
cast<CallInst>(New)->setAttributes(PAL);
- if (cast<CallInst>(Call)->isTailCall())
- cast<CallInst>(New)->setTailCall();
+ cast<CallInst>(New)->setTailCallKind(
+ cast<CallInst>(Call)->getTailCallKind());
}
New->setDebugLoc(Call->getDebugLoc());
@@ -270,7 +270,7 @@ bool DeadArgumentEliminationPass::RemoveDeadArgumentsFromCallers(Function &Fn) {
SmallVector<unsigned, 8> UnusedArgs;
for (Argument &Arg : Fn.args()) {
- if (Arg.use_empty() && !Arg.hasByValOrInAllocaAttr())
+ if (!Arg.hasSwiftErrorAttr() && Arg.use_empty() && !Arg.hasByValOrInAllocaAttr())
UnusedArgs.push_back(Arg.getArgNo());
}
@@ -896,8 +896,8 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
New = CallInst::Create(NF, Args, OpBundles, "", Call);
cast<CallInst>(New)->setCallingConv(CS.getCallingConv());
cast<CallInst>(New)->setAttributes(NewCallPAL);
- if (cast<CallInst>(Call)->isTailCall())
- cast<CallInst>(New)->setTailCall();
+ cast<CallInst>(New)->setTailCallKind(
+ cast<CallInst>(Call)->getTailCallKind());
}
New->setDebugLoc(Call->getDebugLoc());
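
The swifterror change above tightens the candidate test for dropping arguments at call sites; a compact restatement of that test as a predicate (hypothetical helper name):

    // An argument may be dropped from callers only when it is never read,
    // does not shape the ABI (byval/inalloca), and is not swifterror,
    // which must remain an explicit argument.
    static bool isRemovableFromCallers(const Argument &Arg) {
      return !Arg.hasSwiftErrorAttr() && Arg.use_empty() &&
             !Arg.hasByValOrInAllocaAttr();
    }
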
diff --git a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
index 787f434..402a665 100644
--- a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -42,6 +42,7 @@ using namespace llvm;
STATISTIC(NumReadNone, "Number of functions marked readnone");
STATISTIC(NumReadOnly, "Number of functions marked readonly");
STATISTIC(NumNoCapture, "Number of arguments marked nocapture");
+STATISTIC(NumReturned, "Number of arguments marked returned");
STATISTIC(NumReadNoneArg, "Number of arguments marked readnone");
STATISTIC(NumReadOnlyArg, "Number of arguments marked readonly");
STATISTIC(NumNoAlias, "Number of function returns marked noalias");
@@ -331,23 +332,16 @@ struct ArgumentUsesTracker : public CaptureTracker {
namespace llvm {
template <> struct GraphTraits<ArgumentGraphNode *> {
- typedef ArgumentGraphNode NodeType;
typedef ArgumentGraphNode *NodeRef;
typedef SmallVectorImpl<ArgumentGraphNode *>::iterator ChildIteratorType;
- static inline NodeType *getEntryNode(NodeType *A) { return A; }
- static inline ChildIteratorType child_begin(NodeType *N) {
- return N->Uses.begin();
- }
- static inline ChildIteratorType child_end(NodeType *N) {
- return N->Uses.end();
- }
+ static NodeRef getEntryNode(NodeRef A) { return A; }
+ static ChildIteratorType child_begin(NodeRef N) { return N->Uses.begin(); }
+ static ChildIteratorType child_end(NodeRef N) { return N->Uses.end(); }
};
template <>
struct GraphTraits<ArgumentGraph *> : public GraphTraits<ArgumentGraphNode *> {
- static NodeType *getEntryNode(ArgumentGraph *AG) {
- return AG->getEntryNode();
- }
+ static NodeRef getEntryNode(ArgumentGraph *AG) { return AG->getEntryNode(); }
static ChildIteratorType nodes_begin(ArgumentGraph *AG) {
return AG->begin();
}
@@ -447,8 +441,8 @@ determinePointerReadAttrs(Argument *A,
// to a operand bundle use, these cannot participate in the optimistic SCC
// analysis. Instead, we model the operand bundle uses as arguments in
// call to a function external to the SCC.
- if (!SCCNodes.count(&*std::next(F->arg_begin(), UseIndex)) ||
- IsOperandBundleUse) {
+ if (IsOperandBundleUse ||
+ !SCCNodes.count(&*std::next(F->arg_begin(), UseIndex))) {
// The accessors used on CallSite here do the right thing for calls and
// invokes with operand bundles.
@@ -484,6 +478,59 @@ determinePointerReadAttrs(Argument *A,
return IsRead ? Attribute::ReadOnly : Attribute::ReadNone;
}
+/// Deduce returned attributes for the SCC.
+static bool addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes) {
+ bool Changed = false;
+
+ AttrBuilder B;
+ B.addAttribute(Attribute::Returned);
+
+ // Check each function in turn, determining if an argument is always returned.
+ for (Function *F : SCCNodes) {
+ // We can infer and propagate function attributes only when we know that the
+ // definition we'll get at link time is *exactly* the definition we see now.
+ // For more details, see GlobalValue::mayBeDerefined.
+ if (!F->hasExactDefinition())
+ continue;
+
+ if (F->getReturnType()->isVoidTy())
+ continue;
+
+ // There is nothing to do if an argument is already marked as 'returned'.
+ if (any_of(F->args(),
+ [](const Argument &Arg) { return Arg.hasReturnedAttr(); }))
+ continue;
+
+ auto FindRetArg = [&]() -> Value * {
+ Value *RetArg = nullptr;
+ for (BasicBlock &BB : *F)
+ if (auto *Ret = dyn_cast<ReturnInst>(BB.getTerminator())) {
+ // Note that stripPointerCasts should look through functions with
+ // returned arguments.
+ Value *RetVal = Ret->getReturnValue()->stripPointerCasts();
+ if (!isa<Argument>(RetVal) || RetVal->getType() != F->getReturnType())
+ return nullptr;
+
+ if (!RetArg)
+ RetArg = RetVal;
+ else if (RetArg != RetVal)
+ return nullptr;
+ }
+
+ return RetArg;
+ };
+
+ if (Value *RetArg = FindRetArg()) {
+ auto *A = cast<Argument>(RetArg);
+ A->addAttr(AttributeSet::get(F->getContext(), A->getArgNo() + 1, B));
+ ++NumReturned;
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
/// Deduce nocapture attributes for the SCC.
static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
bool Changed = false;
@@ -726,7 +773,8 @@ static bool isFunctionMallocLike(Function *F, const SCCNodeSet &SCCNodes) {
break;
if (CS.getCalledFunction() && SCCNodes.count(CS.getCalledFunction()))
break;
- } // fall-through
+ LLVM_FALLTHROUGH;
+ }
default:
return false; // Did not come from an allocation.
}
@@ -986,9 +1034,11 @@ static bool addNoRecurseAttrs(const SCCNodeSet &SCCNodes) {
}
PreservedAnalyses PostOrderFunctionAttrsPass::run(LazyCallGraph::SCC &C,
- CGSCCAnalysisManager &AM) {
+ CGSCCAnalysisManager &AM,
+ LazyCallGraph &CG,
+ CGSCCUpdateResult &) {
FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C).getManager();
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
// We pass a lambda into functions to wire them up to the analysis manager
// for getting function analyses.
@@ -1025,6 +1075,7 @@ PreservedAnalyses PostOrderFunctionAttrsPass::run(LazyCallGraph::SCC &C,
}
bool Changed = false;
+ Changed |= addArgumentReturnedAttrs(SCCNodes);
Changed |= addReadAttrs(SCCNodes, AARGetter);
Changed |= addArgumentAttrs(SCCNodes);
@@ -1044,7 +1095,8 @@ namespace {
struct PostOrderFunctionAttrsLegacyPass : public CallGraphSCCPass {
static char ID; // Pass identification, replacement for typeid
PostOrderFunctionAttrsLegacyPass() : CallGraphSCCPass(ID) {
- initializePostOrderFunctionAttrsLegacyPassPass(*PassRegistry::getPassRegistry());
+ initializePostOrderFunctionAttrsLegacyPassPass(
+ *PassRegistry::getPassRegistry());
}
bool runOnSCC(CallGraphSCC &SCC) override;
@@ -1066,7 +1118,9 @@ INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
INITIALIZE_PASS_END(PostOrderFunctionAttrsLegacyPass, "functionattrs",
"Deduce function attributes", false, false)
-Pass *llvm::createPostOrderFunctionAttrsLegacyPass() { return new PostOrderFunctionAttrsLegacyPass(); }
+Pass *llvm::createPostOrderFunctionAttrsLegacyPass() {
+ return new PostOrderFunctionAttrsLegacyPass();
+}
template <typename AARGetterT>
static bool runImpl(CallGraphSCC &SCC, AARGetterT AARGetter) {
@@ -1090,6 +1144,7 @@ static bool runImpl(CallGraphSCC &SCC, AARGetterT AARGetter) {
SCCNodes.insert(F);
}
+ Changed |= addArgumentReturnedAttrs(SCCNodes);
Changed |= addReadAttrs(SCCNodes, AARGetter);
Changed |= addArgumentAttrs(SCCNodes);
@@ -1127,7 +1182,8 @@ namespace {
struct ReversePostOrderFunctionAttrsLegacyPass : public ModulePass {
static char ID; // Pass identification, replacement for typeid
ReversePostOrderFunctionAttrsLegacyPass() : ModulePass(ID) {
- initializeReversePostOrderFunctionAttrsLegacyPassPass(*PassRegistry::getPassRegistry());
+ initializeReversePostOrderFunctionAttrsLegacyPassPass(
+ *PassRegistry::getPassRegistry());
}
bool runOnModule(Module &M) override;
@@ -1216,10 +1272,17 @@ bool ReversePostOrderFunctionAttrsLegacyPass::runOnModule(Module &M) {
}
PreservedAnalyses
-ReversePostOrderFunctionAttrsPass::run(Module &M, AnalysisManager<Module> &AM) {
+ReversePostOrderFunctionAttrsPass::run(Module &M, ModuleAnalysisManager &AM) {
auto &CG = AM.getResult<CallGraphAnalysis>(M);
bool Changed = deduceFunctionAttributeInRPO(M, CG);
+
+ // CallGraphAnalysis holds AssertingVH and must be invalidated eagerly so
+ // that other passes don't delete stuff from under it.
+ // FIXME: We need to invalidate this to avoid PR28400. Is there a better
+ // solution?
+ AM.invalidate<CallGraphAnalysis>(M);
+
if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA;
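
For intuition about the new addArgumentReturnedAttrs() deduction earlier in this file: it fires when every return in a function yields the same argument (possibly through pointer casts) of the same type as the return value. An illustrative source-level example, not taken from the patch:

    // Every path returns the argument 'p', so after lowering the IR argument
    // can be marked 'returned'.
    int *bump(int *p, bool flip) {
      if (flip)
        *p += 1;
      return p;
    }

Once the attribute is in place, later passes may treat the call's result as an alias of that argument.
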
diff --git a/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp b/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp
index c9d075e..6b32f6c 100644
--- a/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -21,6 +21,7 @@
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Linker/Linker.h"
#include "llvm/Object/IRObjectFile.h"
@@ -35,7 +36,10 @@
using namespace llvm;
-STATISTIC(NumImported, "Number of functions imported");
+STATISTIC(NumImportedFunctions, "Number of functions imported");
+STATISTIC(NumImportedModules, "Number of modules imported from");
+STATISTIC(NumDeadSymbols, "Number of dead stripped symbols in index");
+STATISTIC(NumLiveSymbols, "Number of live symbols in index");
/// Limit on instruction count of imported functions.
static cl::opt<unsigned> ImportInstrLimit(
@@ -49,9 +53,28 @@ static cl::opt<float>
"`import-instr-limit` threshold by this factor "
"before processing newly imported functions"));
+static cl::opt<float> ImportHotInstrFactor(
+ "import-hot-evolution-factor", cl::init(1.0), cl::Hidden,
+ cl::value_desc("x"),
+ cl::desc("As we import functions called from hot callsite, multiply the "
+ "`import-instr-limit` threshold by this factor "
+ "before processing newly imported functions"));
+
+static cl::opt<float> ImportHotMultiplier(
+ "import-hot-multiplier", cl::init(3.0), cl::Hidden, cl::value_desc("x"),
+ cl::desc("Multiply the `import-instr-limit` threshold for hot callsites"));
+
+// FIXME: This multiplier was not really tuned up.
+static cl::opt<float> ImportColdMultiplier(
+ "import-cold-multiplier", cl::init(0), cl::Hidden, cl::value_desc("N"),
+ cl::desc("Multiply the `import-instr-limit` threshold for cold callsites"));
+
static cl::opt<bool> PrintImports("print-imports", cl::init(false), cl::Hidden,
cl::desc("Print imported functions"));
+static cl::opt<bool> ComputeDead("compute-dead", cl::init(true), cl::Hidden,
+ cl::desc("Compute dead symbols"));
+
// Temporary allows the function import pass to disable always linking
// referenced discardable symbols.
static cl::opt<bool>
@@ -88,69 +111,6 @@ static std::unique_ptr<Module> loadFile(const std::string &FileName,
namespace {
-// Return true if the Summary describes a GlobalValue that can be externally
-// referenced, i.e. it does not need renaming (linkage is not local) or renaming
-// is possible (does not have a section for instance).
-static bool canBeExternallyReferenced(const GlobalValueSummary &Summary) {
- if (!Summary.needsRenaming())
- return true;
-
- if (Summary.hasSection())
- // Can't rename a global that needs renaming if has a section.
- return false;
-
- return true;
-}
-
-// Return true if \p GUID describes a GlobalValue that can be externally
-// referenced, i.e. it does not need renaming (linkage is not local) or
-// renaming is possible (does not have a section for instance).
-static bool canBeExternallyReferenced(const ModuleSummaryIndex &Index,
- GlobalValue::GUID GUID) {
- auto Summaries = Index.findGlobalValueSummaryList(GUID);
- if (Summaries == Index.end())
- return true;
- if (Summaries->second.size() != 1)
- // If there are multiple globals with this GUID, then we know it is
- // not a local symbol, and it is necessarily externally referenced.
- return true;
-
- // We don't need to check for the module path, because if it can't be
- // externally referenced and we call it, it is necessarilly in the same
- // module
- return canBeExternallyReferenced(**Summaries->second.begin());
-}
-
-// Return true if the global described by \p Summary can be imported in another
-// module.
-static bool eligibleForImport(const ModuleSummaryIndex &Index,
- const GlobalValueSummary &Summary) {
- if (!canBeExternallyReferenced(Summary))
- // Can't import a global that needs renaming if has a section for instance.
- // FIXME: we may be able to import it by copying it without promotion.
- return false;
-
- // Check references (and potential calls) in the same module. If the current
- // value references a global that can't be externally referenced it is not
- // eligible for import.
- bool AllRefsCanBeExternallyReferenced =
- llvm::all_of(Summary.refs(), [&](const ValueInfo &VI) {
- return canBeExternallyReferenced(Index, VI.getGUID());
- });
- if (!AllRefsCanBeExternallyReferenced)
- return false;
-
- if (auto *FuncSummary = dyn_cast<FunctionSummary>(&Summary)) {
- bool AllCallsCanBeExternallyReferenced = llvm::all_of(
- FuncSummary->calls(), [&](const FunctionSummary::EdgeTy &Edge) {
- return canBeExternallyReferenced(Index, Edge.first.getGUID());
- });
- if (!AllCallsCanBeExternallyReferenced)
- return false;
- }
- return true;
-}
-
/// Given a list of possible callee implementation for a call site, select one
/// that fits the \p Threshold.
///
@@ -188,7 +148,7 @@ selectCallee(const ModuleSummaryIndex &Index,
if (Summary->instCount() > Threshold)
return false;
- if (!eligibleForImport(Index, *Summary))
+ if (Summary->notEligibleToImport())
return false;
return true;
@@ -210,63 +170,17 @@ static const GlobalValueSummary *selectCallee(GlobalValue::GUID GUID,
return selectCallee(Index, CalleeSummaryList->second, Threshold);
}
-/// Mark the global \p GUID as export by module \p ExportModulePath if found in
-/// this module. If it is a GlobalVariable, we also mark any referenced global
-/// in the current module as exported.
-static void exportGlobalInModule(const ModuleSummaryIndex &Index,
- StringRef ExportModulePath,
- GlobalValue::GUID GUID,
- FunctionImporter::ExportSetTy &ExportList) {
- auto FindGlobalSummaryInModule =
- [&](GlobalValue::GUID GUID) -> GlobalValueSummary *{
- auto SummaryList = Index.findGlobalValueSummaryList(GUID);
- if (SummaryList == Index.end())
- // This global does not have a summary, it is not part of the ThinLTO
- // process
- return nullptr;
- auto SummaryIter = llvm::find_if(
- SummaryList->second,
- [&](const std::unique_ptr<GlobalValueSummary> &Summary) {
- return Summary->modulePath() == ExportModulePath;
- });
- if (SummaryIter == SummaryList->second.end())
- return nullptr;
- return SummaryIter->get();
- };
-
- auto *Summary = FindGlobalSummaryInModule(GUID);
- if (!Summary)
- return;
- // We found it in the current module, mark as exported
- ExportList.insert(GUID);
-
- auto GVS = dyn_cast<GlobalVarSummary>(Summary);
- if (!GVS)
- return;
- // FunctionImportGlobalProcessing::doPromoteLocalToGlobal() will always
- // trigger importing the initializer for `constant unnamed addr` globals that
- // are referenced. We conservatively export all the referenced symbols for
- // every global to workaround this, so that the ExportList is accurate.
- // FIXME: with a "isConstant" flag in the summary we could be more targetted.
- for (auto &Ref : GVS->refs()) {
- auto GUID = Ref.getGUID();
- auto *RefSummary = FindGlobalSummaryInModule(GUID);
- if (RefSummary)
- // Found a ref in the current module, mark it as exported
- ExportList.insert(GUID);
- }
-}
-
-using EdgeInfo = std::pair<const FunctionSummary *, unsigned /* Threshold */>;
+using EdgeInfo = std::tuple<const FunctionSummary *, unsigned /* Threshold */,
+ GlobalValue::GUID>;
/// Compute the list of functions to import for a given caller. Mark these
/// imported functions and the symbols they reference in their source module as
/// exported from their source module.
static void computeImportForFunction(
const FunctionSummary &Summary, const ModuleSummaryIndex &Index,
- unsigned Threshold, const GVSummaryMapTy &DefinedGVSummaries,
+ const unsigned Threshold, const GVSummaryMapTy &DefinedGVSummaries,
SmallVectorImpl<EdgeInfo> &Worklist,
- FunctionImporter::ImportMapTy &ImportsForModule,
+ FunctionImporter::ImportMapTy &ImportList,
StringMap<FunctionImporter::ExportSetTy> *ExportLists = nullptr) {
for (auto &Edge : Summary.calls()) {
auto GUID = Edge.first.getGUID();
@@ -277,7 +191,18 @@ static void computeImportForFunction(
continue;
}
- auto *CalleeSummary = selectCallee(GUID, Threshold, Index);
+ auto GetBonusMultiplier = [](CalleeInfo::HotnessType Hotness) -> float {
+ if (Hotness == CalleeInfo::HotnessType::Hot)
+ return ImportHotMultiplier;
+ if (Hotness == CalleeInfo::HotnessType::Cold)
+ return ImportColdMultiplier;
+ return 1.0;
+ };
+
+ const auto NewThreshold =
+ Threshold * GetBonusMultiplier(Edge.second.Hotness);
+
+ auto *CalleeSummary = selectCallee(GUID, NewThreshold, Index);
if (!CalleeSummary) {
DEBUG(dbgs() << "ignored! No qualifying callee with summary found.\n");
continue;
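
The GetBonusMultiplier lambda above scales the import threshold per call edge by profile hotness; condensed, using the option defaults declared earlier in this file (3.0 for hot, 0 for cold):

    static float bonusMultiplier(CalleeInfo::HotnessType H) {
      if (H == CalleeInfo::HotnessType::Hot)  return 3.0f; // -import-hot-multiplier
      if (H == CalleeInfo::HotnessType::Cold) return 0.0f; // -import-cold-multiplier
      return 1.0f;                                         // unknown hotness
    }
    // NewThreshold = Threshold * bonusMultiplier(Edge.second.Hotness)
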
@@ -293,40 +218,59 @@ static void computeImportForFunction(
} else
ResolvedCalleeSummary = cast<FunctionSummary>(CalleeSummary);
- assert(ResolvedCalleeSummary->instCount() <= Threshold &&
+ assert(ResolvedCalleeSummary->instCount() <= NewThreshold &&
"selectCallee() didn't honor the threshold");
+ auto GetAdjustedThreshold = [](unsigned Threshold, bool IsHotCallsite) {
+ // Adjust the threshold for next level of imported functions.
+ // The threshold is different for hot callsites because we can then
+ // inline chains of hot calls.
+ if (IsHotCallsite)
+ return Threshold * ImportHotInstrFactor;
+ return Threshold * ImportInstrFactor;
+ };
+
+ bool IsHotCallsite = Edge.second.Hotness == CalleeInfo::HotnessType::Hot;
+ const auto AdjThreshold = GetAdjustedThreshold(Threshold, IsHotCallsite);
+
auto ExportModulePath = ResolvedCalleeSummary->modulePath();
- auto &ProcessedThreshold = ImportsForModule[ExportModulePath][GUID];
+ auto &ProcessedThreshold = ImportList[ExportModulePath][GUID];
/// Since the traversal of the call graph is DFS, we can revisit a function
/// a second time with a higher threshold. In this case, it is added back to
/// the worklist with the new threshold.
- if (ProcessedThreshold && ProcessedThreshold >= Threshold) {
+ if (ProcessedThreshold && ProcessedThreshold >= AdjThreshold) {
DEBUG(dbgs() << "ignored! Target was already seen with Threshold "
<< ProcessedThreshold << "\n");
continue;
}
+ bool PreviouslyImported = ProcessedThreshold != 0;
// Mark this function as imported in this module, with the current Threshold
- ProcessedThreshold = Threshold;
+ ProcessedThreshold = AdjThreshold;
// Make exports in the source module.
if (ExportLists) {
auto &ExportList = (*ExportLists)[ExportModulePath];
ExportList.insert(GUID);
- // Mark all functions and globals referenced by this function as exported
- // to the outside if they are defined in the same source module.
- for (auto &Edge : ResolvedCalleeSummary->calls()) {
- auto CalleeGUID = Edge.first.getGUID();
- exportGlobalInModule(Index, ExportModulePath, CalleeGUID, ExportList);
- }
- for (auto &Ref : ResolvedCalleeSummary->refs()) {
- auto GUID = Ref.getGUID();
- exportGlobalInModule(Index, ExportModulePath, GUID, ExportList);
+ if (!PreviouslyImported) {
+ // This is the first time this function was exported from its source
+ // module, so mark all functions and globals it references as exported
+ // to the outside if they are defined in the same source module.
+ // For efficiency, we unconditionally add all the referenced GUIDs
+ // to the ExportList for this module, and will prune out any not
+ // defined in the module later in a single pass.
+ for (auto &Edge : ResolvedCalleeSummary->calls()) {
+ auto CalleeGUID = Edge.first.getGUID();
+ ExportList.insert(CalleeGUID);
+ }
+ for (auto &Ref : ResolvedCalleeSummary->refs()) {
+ auto GUID = Ref.getGUID();
+ ExportList.insert(GUID);
+ }
}
}
// Insert the newly imported function to the worklist.
- Worklist.push_back(std::make_pair(ResolvedCalleeSummary, Threshold));
+ Worklist.emplace_back(ResolvedCalleeSummary, AdjThreshold, GUID);
}
}
@@ -335,8 +279,9 @@ static void computeImportForFunction(
/// another module (that may require promotion).
static void ComputeImportForModule(
const GVSummaryMapTy &DefinedGVSummaries, const ModuleSummaryIndex &Index,
- FunctionImporter::ImportMapTy &ImportsForModule,
- StringMap<FunctionImporter::ExportSetTy> *ExportLists = nullptr) {
+ FunctionImporter::ImportMapTy &ImportList,
+ StringMap<FunctionImporter::ExportSetTy> *ExportLists = nullptr,
+ const DenseSet<GlobalValue::GUID> *DeadSymbols = nullptr) {
// Worklist contains the list of function imported in this module, for which
// we will analyse the callees and may import further down the callgraph.
SmallVector<EdgeInfo, 128> Worklist;
@@ -344,6 +289,10 @@ static void ComputeImportForModule(
// Populate the worklist with the import for the functions in the current
// module
for (auto &GVSummary : DefinedGVSummaries) {
+ if (DeadSymbols && DeadSymbols->count(GVSummary.first)) {
+ DEBUG(dbgs() << "Ignores Dead GUID: " << GVSummary.first << "\n");
+ continue;
+ }
auto *Summary = GVSummary.second;
if (auto *AS = dyn_cast<AliasSummary>(Summary))
Summary = &AS->getAliasee();
@@ -353,21 +302,26 @@ static void ComputeImportForModule(
continue;
DEBUG(dbgs() << "Initalize import for " << GVSummary.first << "\n");
computeImportForFunction(*FuncSummary, Index, ImportInstrLimit,
- DefinedGVSummaries, Worklist, ImportsForModule,
+ DefinedGVSummaries, Worklist, ImportList,
ExportLists);
}
+ // Process the newly imported functions and add callees to the worklist.
while (!Worklist.empty()) {
auto FuncInfo = Worklist.pop_back_val();
- auto *Summary = FuncInfo.first;
- auto Threshold = FuncInfo.second;
-
- // Process the newly imported functions and add callees to the worklist.
- // Adjust the threshold
- Threshold = Threshold * ImportInstrFactor;
+ auto *Summary = std::get<0>(FuncInfo);
+ auto Threshold = std::get<1>(FuncInfo);
+ auto GUID = std::get<2>(FuncInfo);
+
+ // Check if we later added this summary with a higher threshold.
+ // If so, skip this entry.
+ auto ExportModulePath = Summary->modulePath();
+ auto &LatestProcessedThreshold = ImportList[ExportModulePath][GUID];
+ if (LatestProcessedThreshold > Threshold)
+ continue;
computeImportForFunction(*Summary, Index, Threshold, DefinedGVSummaries,
- Worklist, ImportsForModule, ExportLists);
+ Worklist, ImportList, ExportLists);
}
}
@@ -378,14 +332,31 @@ void llvm::ComputeCrossModuleImport(
const ModuleSummaryIndex &Index,
const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
StringMap<FunctionImporter::ImportMapTy> &ImportLists,
- StringMap<FunctionImporter::ExportSetTy> &ExportLists) {
+ StringMap<FunctionImporter::ExportSetTy> &ExportLists,
+ const DenseSet<GlobalValue::GUID> *DeadSymbols) {
// For each module that has function defined, compute the import/export lists.
for (auto &DefinedGVSummaries : ModuleToDefinedGVSummaries) {
- auto &ImportsForModule = ImportLists[DefinedGVSummaries.first()];
+ auto &ImportList = ImportLists[DefinedGVSummaries.first()];
DEBUG(dbgs() << "Computing import for Module '"
<< DefinedGVSummaries.first() << "'\n");
- ComputeImportForModule(DefinedGVSummaries.second, Index, ImportsForModule,
- &ExportLists);
+ ComputeImportForModule(DefinedGVSummaries.second, Index, ImportList,
+ &ExportLists, DeadSymbols);
+ }
+
+ // When computing imports we added all GUIDs referenced by anything
+ // imported from the module to its ExportList. Now we prune each ExportList
+ // of any not defined in that module. This is more efficient than checking
+ // while computing imports because some of the summary lists may be long
+ // due to linkonce (comdat) copies.
+ for (auto &ELI : ExportLists) {
+ const auto &DefinedGVSummaries =
+ ModuleToDefinedGVSummaries.lookup(ELI.first());
+ for (auto EI = ELI.second.begin(); EI != ELI.second.end();) {
+ if (!DefinedGVSummaries.count(*EI))
+ EI = ELI.second.erase(EI);
+ else
+ ++EI;
+ }
}
#ifndef NDEBUG
@@ -431,45 +402,120 @@ void llvm::ComputeCrossModuleImportForModule(
#endif
}
+DenseSet<GlobalValue::GUID> llvm::computeDeadSymbols(
+ const ModuleSummaryIndex &Index,
+ const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols) {
+ if (!ComputeDead)
+ return DenseSet<GlobalValue::GUID>();
+ if (GUIDPreservedSymbols.empty())
+ // Don't do anything when nothing is live, this is friendly with tests.
+ return DenseSet<GlobalValue::GUID>();
+ DenseSet<GlobalValue::GUID> LiveSymbols = GUIDPreservedSymbols;
+ SmallVector<GlobalValue::GUID, 128> Worklist;
+ Worklist.reserve(LiveSymbols.size() * 2);
+ for (auto GUID : LiveSymbols) {
+ DEBUG(dbgs() << "Live root: " << GUID << "\n");
+ Worklist.push_back(GUID);
+ }
+ // Add values flagged in the index as live roots to the worklist.
+ for (const auto &Entry : Index) {
+ bool IsLiveRoot = llvm::any_of(
+ Entry.second,
+ [&](const std::unique_ptr<llvm::GlobalValueSummary> &Summary) {
+ return Summary->liveRoot();
+ });
+ if (!IsLiveRoot)
+ continue;
+ DEBUG(dbgs() << "Live root (summary): " << Entry.first << "\n");
+ Worklist.push_back(Entry.first);
+ }
+
+ while (!Worklist.empty()) {
+ auto GUID = Worklist.pop_back_val();
+ auto It = Index.findGlobalValueSummaryList(GUID);
+ if (It == Index.end()) {
+ DEBUG(dbgs() << "Not in index: " << GUID << "\n");
+ continue;
+ }
+
+ // FIXME: we should only make the prevailing copy live here
+ for (auto &Summary : It->second) {
+ for (auto Ref : Summary->refs()) {
+ auto RefGUID = Ref.getGUID();
+ if (LiveSymbols.insert(RefGUID).second) {
+ DEBUG(dbgs() << "Marking live (ref): " << RefGUID << "\n");
+ Worklist.push_back(RefGUID);
+ }
+ }
+ if (auto *FS = dyn_cast<FunctionSummary>(Summary.get())) {
+ for (auto Call : FS->calls()) {
+ auto CallGUID = Call.first.getGUID();
+ if (LiveSymbols.insert(CallGUID).second) {
+ DEBUG(dbgs() << "Marking live (call): " << CallGUID << "\n");
+ Worklist.push_back(CallGUID);
+ }
+ }
+ }
+ if (auto *AS = dyn_cast<AliasSummary>(Summary.get())) {
+ auto AliaseeGUID = AS->getAliasee().getOriginalName();
+ if (LiveSymbols.insert(AliaseeGUID).second) {
+ DEBUG(dbgs() << "Marking live (alias): " << AliaseeGUID << "\n");
+ Worklist.push_back(AliaseeGUID);
+ }
+ }
+ }
+ }
+ DenseSet<GlobalValue::GUID> DeadSymbols;
+ DeadSymbols.reserve(
+ std::min(Index.size(), Index.size() - LiveSymbols.size()));
+ for (auto &Entry : Index) {
+ auto GUID = Entry.first;
+ if (!LiveSymbols.count(GUID)) {
+ DEBUG(dbgs() << "Marking dead: " << GUID << "\n");
+ DeadSymbols.insert(GUID);
+ }
+ }
+ DEBUG(dbgs() << LiveSymbols.size() << " symbols Live, and "
+ << DeadSymbols.size() << " symbols Dead \n");
+ NumDeadSymbols += DeadSymbols.size();
+ NumLiveSymbols += LiveSymbols.size();
+ return DeadSymbols;
+}
+
/// Compute the set of summaries needed for a ThinLTO backend compilation of
/// \p ModulePath.
void llvm::gatherImportedSummariesForModule(
StringRef ModulePath,
const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
- const StringMap<FunctionImporter::ImportMapTy> &ImportLists,
+ const FunctionImporter::ImportMapTy &ImportList,
std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex) {
// Include all summaries from the importing module.
ModuleToSummariesForIndex[ModulePath] =
ModuleToDefinedGVSummaries.lookup(ModulePath);
- auto ModuleImports = ImportLists.find(ModulePath);
- if (ModuleImports != ImportLists.end()) {
- // Include summaries for imports.
- for (auto &ILI : ModuleImports->second) {
- auto &SummariesForIndex = ModuleToSummariesForIndex[ILI.first()];
- const auto &DefinedGVSummaries =
- ModuleToDefinedGVSummaries.lookup(ILI.first());
- for (auto &GI : ILI.second) {
- const auto &DS = DefinedGVSummaries.find(GI.first);
- assert(DS != DefinedGVSummaries.end() &&
- "Expected a defined summary for imported global value");
- SummariesForIndex[GI.first] = DS->second;
- }
+ // Include summaries for imports.
+ for (auto &ILI : ImportList) {
+ auto &SummariesForIndex = ModuleToSummariesForIndex[ILI.first()];
+ const auto &DefinedGVSummaries =
+ ModuleToDefinedGVSummaries.lookup(ILI.first());
+ for (auto &GI : ILI.second) {
+ const auto &DS = DefinedGVSummaries.find(GI.first);
+ assert(DS != DefinedGVSummaries.end() &&
+ "Expected a defined summary for imported global value");
+ SummariesForIndex[GI.first] = DS->second;
}
}
}
/// Emit the files \p ModulePath will import from into \p OutputFilename.
-std::error_code llvm::EmitImportsFiles(
- StringRef ModulePath, StringRef OutputFilename,
- const StringMap<FunctionImporter::ImportMapTy> &ImportLists) {
- auto ModuleImports = ImportLists.find(ModulePath);
+std::error_code
+llvm::EmitImportsFiles(StringRef ModulePath, StringRef OutputFilename,
+ const FunctionImporter::ImportMapTy &ModuleImports) {
std::error_code EC;
raw_fd_ostream ImportsOS(OutputFilename, EC, sys::fs::OpenFlags::F_None);
if (EC)
return EC;
- if (ModuleImports != ImportLists.end())
- for (auto &ILI : ModuleImports->second)
- ImportsOS << ILI.first() << "\n";
+ for (auto &ILI : ModuleImports)
+ ImportsOS << ILI.first() << "\n";
return std::error_code();
}
@@ -489,6 +535,15 @@ void llvm::thinLTOResolveWeakForLinkerModule(
DEBUG(dbgs() << "ODR fixing up linkage for `" << GV.getName() << "` from "
<< GV.getLinkage() << " to " << NewLinkage << "\n");
GV.setLinkage(NewLinkage);
+ // Remove functions converted to available_externally from comdats,
+ // as this is a declaration for the linker, and will be dropped eventually.
+ // It is illegal for comdats to contain declarations.
+ auto *GO = dyn_cast_or_null<GlobalObject>(&GV);
+ if (GO && GO->isDeclarationForLinker() && GO->hasComdat()) {
+ assert(GO->hasAvailableExternallyLinkage() &&
+ "Expected comdat on definition (possibly available external)");
+ GO->setComdat(nullptr);
+ }
};
// Process functions and global now
@@ -506,7 +561,7 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
// Parse inline ASM and collect the list of symbols that are not defined in
// the current module.
StringSet<> AsmUndefinedRefs;
- object::IRObjectFile::CollectAsmUndefinedRefs(
+ ModuleSymbolTable::CollectAsmSymbols(
Triple(TheModule.getTargetTriple()), TheModule.getModuleInlineAsm(),
[&AsmUndefinedRefs](StringRef Name, object::BasicSymbolRef::Flags Flags) {
if (Flags & object::BasicSymbolRef::SF_Undefined)
@@ -561,7 +616,7 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
// Automatically import functions in Module \p DestModule based on the summaries
// index.
//
-bool FunctionImporter::importFunctions(
+Expected<bool> FunctionImporter::importFunctions(
Module &DestModule, const FunctionImporter::ImportMapTy &ImportList,
bool ForceImportReferencedDiscardableSymbols) {
DEBUG(dbgs() << "Starting import for Module "
@@ -579,14 +634,17 @@ bool FunctionImporter::importFunctions(
// Get the module for the import
const auto &FunctionsToImportPerModule = ImportList.find(Name);
assert(FunctionsToImportPerModule != ImportList.end());
- std::unique_ptr<Module> SrcModule = ModuleLoader(Name);
+ Expected<std::unique_ptr<Module>> SrcModuleOrErr = ModuleLoader(Name);
+ if (!SrcModuleOrErr)
+ return SrcModuleOrErr.takeError();
+ std::unique_ptr<Module> SrcModule = std::move(*SrcModuleOrErr);
assert(&DestModule.getContext() == &SrcModule->getContext() &&
"Context mismatch");
// If modules were created with lazy metadata loading, materialize it
// now, before linking it (otherwise this will be a noop).
- SrcModule->materializeMetadata();
- UpgradeDebugInfo(*SrcModule);
+ if (Error Err = SrcModule->materializeMetadata())
+ return std::move(Err);
auto &ImportGUIDs = FunctionsToImportPerModule->second;
// Find the globals to import
@@ -600,7 +658,8 @@ bool FunctionImporter::importFunctions(
<< " " << F.getName() << " from "
<< SrcModule->getSourceFileName() << "\n");
if (Import) {
- F.materialize();
+ if (Error Err = F.materialize())
+ return std::move(Err);
if (EnableImportMetadata) {
// Add 'thinlto_src_module' metadata for statistics and debugging.
F.setMetadata(
@@ -622,7 +681,8 @@ bool FunctionImporter::importFunctions(
<< " " << GV.getName() << " from "
<< SrcModule->getSourceFileName() << "\n");
if (Import) {
- GV.materialize();
+ if (Error Err = GV.materialize())
+ return std::move(Err);
GlobalsToImport.insert(&GV);
}
}
@@ -648,13 +708,19 @@ bool FunctionImporter::importFunctions(
<< " " << GO->getName() << " from "
<< SrcModule->getSourceFileName() << "\n");
#endif
- GO->materialize();
+ if (Error Err = GO->materialize())
+ return std::move(Err);
GlobalsToImport.insert(GO);
- GA.materialize();
+ if (Error Err = GA.materialize())
+ return std::move(Err);
GlobalsToImport.insert(&GA);
}
}
+ // Upgrade debug info after we're done materializing all the globals and we
+ // have loaded all the required metadata!
+ UpgradeDebugInfo(*SrcModule);
+
// Link in the specified functions.
if (renameModuleForThinLTO(*SrcModule, Index, &GlobalsToImport))
return true;
@@ -674,9 +740,10 @@ bool FunctionImporter::importFunctions(
report_fatal_error("Function Import: link error");
ImportedCount += GlobalsToImport.size();
+ NumImportedModules++;
}
- NumImported += ImportedCount;
+ NumImportedFunctions += ImportedCount;
DEBUG(dbgs() << "Imported " << ImportedCount << " functions for Module "
<< DestModule.getModuleIdentifier() << "\n");
@@ -689,106 +756,94 @@ static cl::opt<std::string>
SummaryFile("summary-file",
cl::desc("The summary file to use for function importing."));
-static void diagnosticHandler(const DiagnosticInfo &DI) {
- raw_ostream &OS = errs();
- DiagnosticPrinterRawOStream DP(OS);
- DI.print(DP);
- OS << '\n';
-}
+static bool doImportingForModule(Module &M) {
+ if (SummaryFile.empty())
+ report_fatal_error("error: -function-import requires -summary-file\n");
+ Expected<std::unique_ptr<ModuleSummaryIndex>> IndexPtrOrErr =
+ getModuleSummaryIndexForFile(SummaryFile);
+ if (!IndexPtrOrErr) {
+ logAllUnhandledErrors(IndexPtrOrErr.takeError(), errs(),
+ "Error loading file '" + SummaryFile + "': ");
+ return false;
+ }
+ std::unique_ptr<ModuleSummaryIndex> Index = std::move(*IndexPtrOrErr);
+
+ // First step is collecting the import list.
+ FunctionImporter::ImportMapTy ImportList;
+ ComputeCrossModuleImportForModule(M.getModuleIdentifier(), *Index,
+ ImportList);
+
+ // Conservatively mark all internal values as promoted. This interface is
+ // only used when doing importing via the function importing pass. The pass
+ // is only enabled when testing importing via the 'opt' tool, which does
+ // not do the ThinLink that would normally determine what values to promote.
+ for (auto &I : *Index) {
+ for (auto &S : I.second) {
+ if (GlobalValue::isLocalLinkage(S->linkage()))
+ S->setLinkage(GlobalValue::ExternalLinkage);
+ }
+ }
-/// Parse the summary index out of an IR file and return the summary
-/// index object if found, or nullptr if not.
-static std::unique_ptr<ModuleSummaryIndex> getModuleSummaryIndexForFile(
- StringRef Path, std::string &Error,
- const DiagnosticHandlerFunction &DiagnosticHandler) {
- std::unique_ptr<MemoryBuffer> Buffer;
- ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
- MemoryBuffer::getFile(Path);
- if (std::error_code EC = BufferOrErr.getError()) {
- Error = EC.message();
- return nullptr;
+ // Next we need to promote to global scope and rename any local values that
+ // are potentially exported to other modules.
+ if (renameModuleForThinLTO(M, *Index, nullptr)) {
+ errs() << "Error renaming module\n";
+ return false;
}
- Buffer = std::move(BufferOrErr.get());
- ErrorOr<std::unique_ptr<object::ModuleSummaryIndexObjectFile>> ObjOrErr =
- object::ModuleSummaryIndexObjectFile::create(Buffer->getMemBufferRef(),
- DiagnosticHandler);
- if (std::error_code EC = ObjOrErr.getError()) {
- Error = EC.message();
- return nullptr;
+
+ // Perform the import now.
+ auto ModuleLoader = [&M](StringRef Identifier) {
+ return loadFile(Identifier, M.getContext());
+ };
+ FunctionImporter Importer(*Index, ModuleLoader);
+ Expected<bool> Result = Importer.importFunctions(
+ M, ImportList, !DontForceImportReferencedDiscardableSymbols);
+
+ // FIXME: Probably need to propagate Errors through the pass manager.
+ if (!Result) {
+ logAllUnhandledErrors(Result.takeError(), errs(),
+ "Error importing module: ");
+ return false;
}
- return (*ObjOrErr)->takeIndex();
+
+ return *Result;
}
namespace {
/// Pass that performs cross-module function import provided a summary file.
-class FunctionImportPass : public ModulePass {
- /// Optional module summary index to use for importing, otherwise
- /// the summary-file option must be specified.
- const ModuleSummaryIndex *Index;
-
+class FunctionImportLegacyPass : public ModulePass {
public:
/// Pass identification, replacement for typeid
static char ID;
/// Specify pass name for debug output
- const char *getPassName() const override { return "Function Importing"; }
+ StringRef getPassName() const override { return "Function Importing"; }
- explicit FunctionImportPass(const ModuleSummaryIndex *Index = nullptr)
- : ModulePass(ID), Index(Index) {}
+ explicit FunctionImportLegacyPass() : ModulePass(ID) {}
bool runOnModule(Module &M) override {
if (skipModule(M))
return false;
- if (SummaryFile.empty() && !Index)
- report_fatal_error("error: -function-import requires -summary-file or "
- "file from frontend\n");
- std::unique_ptr<ModuleSummaryIndex> IndexPtr;
- if (!SummaryFile.empty()) {
- if (Index)
- report_fatal_error("error: -summary-file and index from frontend\n");
- std::string Error;
- IndexPtr =
- getModuleSummaryIndexForFile(SummaryFile, Error, diagnosticHandler);
- if (!IndexPtr) {
- errs() << "Error loading file '" << SummaryFile << "': " << Error
- << "\n";
- return false;
- }
- Index = IndexPtr.get();
- }
-
- // First step is collecting the import list.
- FunctionImporter::ImportMapTy ImportList;
- ComputeCrossModuleImportForModule(M.getModuleIdentifier(), *Index,
- ImportList);
-
- // Next we need to promote to global scope and rename any local values that
- // are potentially exported to other modules.
- if (renameModuleForThinLTO(M, *Index, nullptr)) {
- errs() << "Error renaming module\n";
- return false;
- }
-
- // Perform the import now.
- auto ModuleLoader = [&M](StringRef Identifier) {
- return loadFile(Identifier, M.getContext());
- };
- FunctionImporter Importer(*Index, ModuleLoader);
- return Importer.importFunctions(
- M, ImportList, !DontForceImportReferencedDiscardableSymbols);
+ return doImportingForModule(M);
}
};
} // anonymous namespace
-char FunctionImportPass::ID = 0;
-INITIALIZE_PASS_BEGIN(FunctionImportPass, "function-import",
- "Summary Based Function Import", false, false)
-INITIALIZE_PASS_END(FunctionImportPass, "function-import",
- "Summary Based Function Import", false, false)
+PreservedAnalyses FunctionImportPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ if (!doImportingForModule(M))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+char FunctionImportLegacyPass::ID = 0;
+INITIALIZE_PASS(FunctionImportLegacyPass, "function-import",
+ "Summary Based Function Import", false, false)
namespace llvm {
-Pass *createFunctionImportPass(const ModuleSummaryIndex *Index = nullptr) {
- return new FunctionImportPass(Index);
+Pass *createFunctionImportPass() {
+ return new FunctionImportLegacyPass();
}
}
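
One cross-cutting change in this file is error handling: importFunctions() now returns Expected<bool>, and every materialization failure is propagated instead of being ignored. The recurring pattern, with SomeGV as a placeholder value:

    if (Error Err = SomeGV.materialize())
      return std::move(Err);   // surfaces to the caller through Expected<bool>

The testing-only driver then drains any error with logAllUnhandledErrors() and reports no change, since the legacy pass manager has no error channel of its own.
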
diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp
index 4c74698..7a04de3 100644
--- a/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp
@@ -162,45 +162,29 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &) {
GIF.setResolver(nullptr);
}
- if (!DeadFunctions.empty()) {
- // Now that all interferences have been dropped, delete the actual objects
- // themselves.
- for (Function *F : DeadFunctions) {
- RemoveUnusedGlobalValue(*F);
- M.getFunctionList().erase(F);
- }
- NumFunctions += DeadFunctions.size();
+ // Now that all interferences have been dropped, delete the actual objects
+ // themselves.
+ auto EraseUnusedGlobalValue = [&](GlobalValue *GV) {
+ RemoveUnusedGlobalValue(*GV);
+ GV->eraseFromParent();
Changed = true;
- }
+ };
- if (!DeadGlobalVars.empty()) {
- for (GlobalVariable *GV : DeadGlobalVars) {
- RemoveUnusedGlobalValue(*GV);
- M.getGlobalList().erase(GV);
- }
- NumVariables += DeadGlobalVars.size();
- Changed = true;
- }
+ NumFunctions += DeadFunctions.size();
+ for (Function *F : DeadFunctions)
+ EraseUnusedGlobalValue(F);
- // Now delete any dead aliases.
- if (!DeadAliases.empty()) {
- for (GlobalAlias *GA : DeadAliases) {
- RemoveUnusedGlobalValue(*GA);
- M.getAliasList().erase(GA);
- }
- NumAliases += DeadAliases.size();
- Changed = true;
- }
+ NumVariables += DeadGlobalVars.size();
+ for (GlobalVariable *GV : DeadGlobalVars)
+ EraseUnusedGlobalValue(GV);
- // Now delete any dead aliases.
- if (!DeadIFuncs.empty()) {
- for (GlobalIFunc *GIF : DeadIFuncs) {
- RemoveUnusedGlobalValue(*GIF);
- M.getIFuncList().erase(GIF);
- }
- NumIFuncs += DeadIFuncs.size();
- Changed = true;
- }
+ NumAliases += DeadAliases.size();
+ for (GlobalAlias *GA : DeadAliases)
+ EraseUnusedGlobalValue(GA);
+
+ NumIFuncs += DeadIFuncs.size();
+ for (GlobalIFunc *GIF : DeadIFuncs)
+ EraseUnusedGlobalValue(GIF);
// Make sure that all memory is released
AliveGlobals.clear();
diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 99b12d4..5b0d5e3 100644
--- a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -371,14 +371,14 @@ static bool IsUserOfGlobalSafeForSRA(User *U, GlobalValue *GV) {
++GEPI; // Skip over the pointer index.
// If this is a use of an array allocation, do a bit more checking for sanity.
- if (ArrayType *AT = dyn_cast<ArrayType>(*GEPI)) {
- uint64_t NumElements = AT->getNumElements();
+ if (GEPI.isSequential()) {
ConstantInt *Idx = cast<ConstantInt>(U->getOperand(2));
// Check to make sure that index falls within the array. If not,
// something funny is going on, so we won't do the optimization.
//
- if (Idx->getZExtValue() >= NumElements)
+ if (GEPI.isBoundedSequential() &&
+ Idx->getZExtValue() >= GEPI.getSequentialNumElements())
return false;
// We cannot scalar repl this level of the array unless any array
@@ -391,19 +391,13 @@ static bool IsUserOfGlobalSafeForSRA(User *U, GlobalValue *GV) {
for (++GEPI; // Skip array index.
GEPI != E;
++GEPI) {
- uint64_t NumElements;
- if (ArrayType *SubArrayTy = dyn_cast<ArrayType>(*GEPI))
- NumElements = SubArrayTy->getNumElements();
- else if (VectorType *SubVectorTy = dyn_cast<VectorType>(*GEPI))
- NumElements = SubVectorTy->getNumElements();
- else {
- assert((*GEPI)->isStructTy() &&
- "Indexed GEP type is not array, vector, or struct!");
+ if (GEPI.isStruct())
continue;
- }
ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPI.getOperand());
- if (!IdxVal || IdxVal->getZExtValue() >= NumElements)
+ if (!IdxVal ||
+ (GEPI.isBoundedSequential() &&
+ IdxVal->getZExtValue() >= GEPI.getSequentialNumElements()))
return false;
}
}
@@ -473,12 +467,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
NGV->setAlignment(NewAlign);
}
} else if (SequentialType *STy = dyn_cast<SequentialType>(Ty)) {
- unsigned NumElements = 0;
- if (ArrayType *ATy = dyn_cast<ArrayType>(STy))
- NumElements = ATy->getNumElements();
- else
- NumElements = cast<VectorType>(STy)->getNumElements();
-
+ unsigned NumElements = STy->getNumElements();
if (NumElements > 16 && GV->hasNUsesOrMore(16))
return nullptr; // It's not worth it.
NewGlobals.reserve(NumElements);
@@ -1653,7 +1642,7 @@ static bool deleteIfDead(GlobalValue &GV,
SmallSet<const Comdat *, 8> &NotDiscardableComdats) {
GV.removeDeadConstantUsers();
- if (!GV.isDiscardableIfUnused())
+ if (!GV.isDiscardableIfUnused() && !GV.isDeclaration())
return false;
if (const Comdat *C = GV.getComdat())
@@ -1662,7 +1651,7 @@ static bool deleteIfDead(GlobalValue &GV,
bool Dead;
if (auto *F = dyn_cast<Function>(&GV))
- Dead = F->isDefTriviallyDead();
+ Dead = (F->isDeclaration() && F->use_empty()) || F->isDefTriviallyDead();
else
Dead = GV.use_empty();
if (!Dead)
@@ -1737,7 +1726,7 @@ static bool isPointerValueDeadOnEntryToFunction(
for (auto *L : Loads) {
auto *LTy = L->getType();
- if (!std::any_of(Stores.begin(), Stores.end(), [&](StoreInst *S) {
+ if (none_of(Stores, [&](const StoreInst *S) {
auto *STy = S->getValueOperand()->getType();
// The load is only dominated by the store if DomTree says so
// and the number of bits loaded in L is less than or equal to
@@ -2079,10 +2068,10 @@ OptimizeGlobalVars(Module &M, TargetLibraryInfo *TLI,
GV->setLinkage(GlobalValue::InternalLinkage);
// Simplify the initializer.
if (GV->hasInitializer())
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(GV->getInitializer())) {
+ if (auto *C = dyn_cast<Constant>(GV->getInitializer())) {
auto &DL = M.getDataLayout();
- Constant *New = ConstantFoldConstantExpression(CE, DL, TLI);
- if (New && New != CE)
+ Constant *New = ConstantFoldConstant(C, DL, TLI);
+ if (New && New != C)
GV->setInitializer(New);
}
@@ -2125,12 +2114,7 @@ static Constant *EvaluateStoreInto(Constant *Init, Constant *Val,
ConstantInt *CI = cast<ConstantInt>(Addr->getOperand(OpNo));
SequentialType *InitTy = cast<SequentialType>(Init->getType());
-
- uint64_t NumElts;
- if (ArrayType *ATy = dyn_cast<ArrayType>(InitTy))
- NumElts = ATy->getNumElements();
- else
- NumElts = InitTy->getVectorNumElements();
+ uint64_t NumElts = InitTy->getNumElements();
// Break up the array into elements.
for (uint64_t i = 0, e = NumElts; i != e; ++i)
@@ -2565,7 +2549,7 @@ static bool optimizeGlobalsInModule(
return Changed;
}
-PreservedAnalyses GlobalOptPass::run(Module &M, AnalysisManager<Module> &AM) {
+PreservedAnalyses GlobalOptPass::run(Module &M, ModuleAnalysisManager &AM) {
auto &DL = M.getDataLayout();
auto &TLI = AM.getResult<TargetLibraryAnalysis>(M);
auto &FAM =
diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalSplit.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalSplit.cpp
new file mode 100644
index 0000000..bbbd096
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/GlobalSplit.cpp
@@ -0,0 +1,171 @@
+//===- GlobalSplit.cpp - global variable splitter -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass uses inrange annotations on GEP indices to split globals where
+// beneficial. Clang currently attaches these annotations to references to
+// virtual table globals under the Itanium ABI for the benefit of the
+// whole-program virtual call optimization and control flow integrity passes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/GlobalSplit.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+
+#include <set>
+
+using namespace llvm;
+
+namespace {
+
+bool splitGlobal(GlobalVariable &GV) {
+ // If the address of the global is taken outside of the module, we cannot
+ // apply this transformation.
+ if (!GV.hasLocalLinkage())
+ return false;
+
+ // We currently only know how to split ConstantStructs.
+ auto *Init = dyn_cast_or_null<ConstantStruct>(GV.getInitializer());
+ if (!Init)
+ return false;
+
+ // Verify that each user of the global is an inrange getelementptr constant.
+ // From this it follows that any loads from or stores to that global must use
+ // a pointer derived from an inrange getelementptr constant, which is
+ // sufficient to allow us to apply the splitting transform.
+ for (User *U : GV.users()) {
+ if (!isa<Constant>(U))
+ return false;
+
+ auto *GEP = dyn_cast<GEPOperator>(U);
+ if (!GEP || !GEP->getInRangeIndex() || *GEP->getInRangeIndex() != 1 ||
+ !isa<ConstantInt>(GEP->getOperand(1)) ||
+ !cast<ConstantInt>(GEP->getOperand(1))->isZero() ||
+ !isa<ConstantInt>(GEP->getOperand(2)))
+ return false;
+ }
+
+ SmallVector<MDNode *, 2> Types;
+ GV.getMetadata(LLVMContext::MD_type, Types);
+
+ const DataLayout &DL = GV.getParent()->getDataLayout();
+ const StructLayout *SL = DL.getStructLayout(Init->getType());
+
+ IntegerType *Int32Ty = Type::getInt32Ty(GV.getContext());
+
+ std::vector<GlobalVariable *> SplitGlobals(Init->getNumOperands());
+ for (unsigned I = 0; I != Init->getNumOperands(); ++I) {
+ // Build a global representing this split piece.
+ auto *SplitGV =
+ new GlobalVariable(*GV.getParent(), Init->getOperand(I)->getType(),
+ GV.isConstant(), GlobalValue::PrivateLinkage,
+ Init->getOperand(I), GV.getName() + "." + utostr(I));
+ SplitGlobals[I] = SplitGV;
+
+ unsigned SplitBegin = SL->getElementOffset(I);
+ unsigned SplitEnd = (I == Init->getNumOperands() - 1)
+ ? SL->getSizeInBytes()
+ : SL->getElementOffset(I + 1);
+
+ // Rebuild type metadata, adjusting by the split offset.
+ // FIXME: See if we can use DW_OP_piece to preserve debug metadata here.
+ for (MDNode *Type : Types) {
+ uint64_t ByteOffset = cast<ConstantInt>(
+ cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
+ ->getZExtValue();
+ if (ByteOffset < SplitBegin || ByteOffset >= SplitEnd)
+ continue;
+ SplitGV->addMetadata(
+ LLVMContext::MD_type,
+ *MDNode::get(GV.getContext(),
+ {ConstantAsMetadata::get(
+ ConstantInt::get(Int32Ty, ByteOffset - SplitBegin)),
+ Type->getOperand(1)}));
+ }
+ }
+
+ for (User *U : GV.users()) {
+ auto *GEP = cast<GEPOperator>(U);
+ unsigned I = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue();
+ if (I >= SplitGlobals.size())
+ continue;
+
+ SmallVector<Value *, 4> Ops;
+ Ops.push_back(ConstantInt::get(Int32Ty, 0));
+ for (unsigned I = 3; I != GEP->getNumOperands(); ++I)
+ Ops.push_back(GEP->getOperand(I));
+
+ auto *NewGEP = ConstantExpr::getGetElementPtr(
+ SplitGlobals[I]->getInitializer()->getType(), SplitGlobals[I], Ops,
+ GEP->isInBounds());
+ GEP->replaceAllUsesWith(NewGEP);
+ }
+
+ // Finally, remove the original global. Any remaining uses refer to invalid
+ // elements of the global, so replace with undef.
+ if (!GV.use_empty())
+ GV.replaceAllUsesWith(UndefValue::get(GV.getType()));
+ GV.eraseFromParent();
+ return true;
+}
+
+bool splitGlobals(Module &M) {
+ // First, see if the module uses either of the llvm.type.test or
+ // llvm.type.checked.load intrinsics, which indicates that splitting globals
+ // may be beneficial.
+ Function *TypeTestFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::type_test));
+ Function *TypeCheckedLoadFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load));
+ if ((!TypeTestFunc || TypeTestFunc->use_empty()) &&
+ (!TypeCheckedLoadFunc || TypeCheckedLoadFunc->use_empty()))
+ return false;
+
+ bool Changed = false;
+ for (auto I = M.global_begin(); I != M.global_end();) {
+ GlobalVariable &GV = *I;
+ ++I;
+ Changed |= splitGlobal(GV);
+ }
+ return Changed;
+}
+
+struct GlobalSplit : public ModulePass {
+ static char ID;
+ GlobalSplit() : ModulePass(ID) {
+ initializeGlobalSplitPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ return splitGlobals(M);
+ }
+};
+
+}
+
+INITIALIZE_PASS(GlobalSplit, "globalsplit", "Global splitter", false, false)
+char GlobalSplit::ID = 0;
+
+ModulePass *llvm::createGlobalSplitPass() {
+ return new GlobalSplit;
+}
+
+PreservedAnalyses GlobalSplitPass::run(Module &M, ModuleAnalysisManager &AM) {
+ if (!splitGlobals(M))
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
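
The globals targeted by the new GlobalSplit pass are typically the combined vtable groups Clang emits under the Itanium ABI, where every reference is an inrange getelementptr constant. An illustrative C++ input (assumed, not from the patch) whose single vtable global for C holds two sub-vtables that splitGlobal() can separate:

    struct A { virtual void f(); };
    struct B { virtual void g(); };
    // C's vtable group is one constant struct containing a vtable for the
    // A-in-C subobject and another for the B-in-C subobject; splitting it
    // yields two private globals, each keeping its own !type metadata.
    struct C : A, B {
      void f() override;
      void g() override;
    };
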
diff --git a/contrib/llvm/lib/Transforms/IPO/IPO.cpp b/contrib/llvm/lib/Transforms/IPO/IPO.cpp
index 3507eba..89518f3 100644
--- a/contrib/llvm/lib/Transforms/IPO/IPO.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/IPO.cpp
@@ -18,6 +18,7 @@
#include "llvm/InitializePasses.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/FunctionAttrs.h"
using namespace llvm;
@@ -31,8 +32,9 @@ void llvm::initializeIPO(PassRegistry &Registry) {
initializeForceFunctionAttrsLegacyPassPass(Registry);
initializeGlobalDCELegacyPassPass(Registry);
initializeGlobalOptLegacyPassPass(Registry);
+ initializeGlobalSplitPass(Registry);
initializeIPCPPass(Registry);
- initializeAlwaysInlinerPass(Registry);
+ initializeAlwaysInlinerLegacyPassPass(Registry);
initializeSimpleInlinerPass(Registry);
initializeInferFunctionAttrsLegacyPassPass(Registry);
initializeInternalizeLegacyPassPass(Registry);
@@ -53,7 +55,7 @@ void llvm::initializeIPO(PassRegistry &Registry) {
initializeBarrierNoopPass(Registry);
initializeEliminateAvailableExternallyLegacyPassPass(Registry);
initializeSampleProfileLoaderLegacyPassPass(Registry);
- initializeFunctionImportPassPass(Registry);
+ initializeFunctionImportLegacyPassPass(Registry);
initializeWholeProgramDevirtPass(Registry);
}
@@ -82,7 +84,7 @@ void LLVMAddFunctionInliningPass(LLVMPassManagerRef PM) {
}
void LLVMAddAlwaysInlinerPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(llvm::createAlwaysInlinerPass());
+ unwrap(PM)->add(llvm::createAlwaysInlinerLegacyPass());
}
void LLVMAddGlobalDCEPass(LLVMPassManagerRef PM) {
diff --git a/contrib/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp b/contrib/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp
index ab2d2bd..2ef299d 100644
--- a/contrib/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp
@@ -34,7 +34,7 @@ static bool inferAllPrototypeAttributes(Module &M,
}
PreservedAnalyses InferFunctionAttrsPass::run(Module &M,
- AnalysisManager<Module> &AM) {
+ ModuleAnalysisManager &AM) {
auto &TLI = AM.getResult<TargetLibraryAnalysis>(M);
if (!inferAllPrototypeAttributes(M, TLI))
diff --git a/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp b/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp
deleted file mode 100644
index cb1ab95..0000000
--- a/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp
+++ /dev/null
@@ -1,103 +0,0 @@
-//===- InlineAlways.cpp - Code to inline always_inline functions ----------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a custom inliner that handles only functions that
-// are marked as "always inline".
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Analysis/InlineCost.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/CallSite.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/Transforms/IPO/InlinerPass.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "inline"
-
-namespace {
-
-/// \brief Inliner pass which only handles "always inline" functions.
-class AlwaysInliner : public Inliner {
-
-public:
- AlwaysInliner() : Inliner(ID, /*InsertLifetime*/ true) {
- initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry());
- }
-
- AlwaysInliner(bool InsertLifetime) : Inliner(ID, InsertLifetime) {
- initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry());
- }
-
- /// Main run interface method. We override here to avoid calling skipSCC().
- bool runOnSCC(CallGraphSCC &SCC) override { return inlineCalls(SCC); }
-
- static char ID; // Pass identification, replacement for typeid
-
- InlineCost getInlineCost(CallSite CS) override;
-
- using llvm::Pass::doFinalization;
- bool doFinalization(CallGraph &CG) override {
- return removeDeadFunctions(CG, /*AlwaysInlineOnly=*/ true);
- }
-};
-
-}
-
-char AlwaysInliner::ID = 0;
-INITIALIZE_PASS_BEGIN(AlwaysInliner, "always-inline",
- "Inliner for always_inline functions", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(AlwaysInliner, "always-inline",
- "Inliner for always_inline functions", false, false)
-
-Pass *llvm::createAlwaysInlinerPass() { return new AlwaysInliner(); }
-
-Pass *llvm::createAlwaysInlinerPass(bool InsertLifetime) {
- return new AlwaysInliner(InsertLifetime);
-}
-
-/// \brief Get the inline cost for the always-inliner.
-///
-/// The always inliner *only* handles functions which are marked with the
-/// attribute to force inlining. As such, it is dramatically simpler and avoids
-/// using the powerful (but expensive) inline cost analysis. Instead it uses
-/// a very simple and boring direct walk of the instructions looking for
-/// impossible-to-inline constructs.
-///
-/// Note, it would be possible to go to some lengths to cache the information
-/// computed here, but as we only expect to do this for relatively few and
-/// small functions which have the explicit attribute to force inlining, it is
-/// likely not worth it in practice.
-InlineCost AlwaysInliner::getInlineCost(CallSite CS) {
- Function *Callee = CS.getCalledFunction();
-
- // Only inline direct calls to functions with always-inline attributes
- // that are viable for inlining. FIXME: We shouldn't even get here for
- // declarations.
- if (Callee && !Callee->isDeclaration() &&
- CS.hasFnAttr(Attribute::AlwaysInline) && isInlineViable(*Callee))
- return InlineCost::getAlways();
-
- return InlineCost::getNever();
-}
diff --git a/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp b/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp
index 2aa650b..1770445 100644
--- a/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp
@@ -25,7 +25,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/InlinerPass.h"
+#include "llvm/Transforms/IPO/Inliner.h"
using namespace llvm;
@@ -38,21 +38,17 @@ namespace {
/// The common implementation of the inlining logic is shared between this
/// inliner pass and the always inliner pass. The two passes use different cost
/// analyses to determine when to inline.
-class SimpleInliner : public Inliner {
- // This field is populated based on one of the following:
- // * optimization or size-optimization levels,
- // * the --inline-threshold flag, or
- // * a user specified value.
- int DefaultThreshold;
+class SimpleInliner : public LegacyInlinerBase {
+
+ InlineParams Params;
public:
- SimpleInliner()
- : Inliner(ID), DefaultThreshold(llvm::getDefaultInlineThreshold()) {
+ SimpleInliner() : LegacyInlinerBase(ID), Params(llvm::getInlineParams()) {
initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
}
- explicit SimpleInliner(int Threshold)
- : Inliner(ID), DefaultThreshold(Threshold) {
+ explicit SimpleInliner(InlineParams Params)
+ : LegacyInlinerBase(ID), Params(Params) {
initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
}
@@ -61,7 +57,11 @@ public:
InlineCost getInlineCost(CallSite CS) override {
Function *Callee = CS.getCalledFunction();
TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
- return llvm::getInlineCost(CS, DefaultThreshold, TTI, ACT, PSI);
+ std::function<AssumptionCache &(Function &)> GetAssumptionCache =
+ [&](Function &F) -> AssumptionCache & {
+ return ACT->getAssumptionCache(F);
+ };
+ return llvm::getInlineCost(CS, Params, TTI, GetAssumptionCache, PSI);
}
bool runOnSCC(CallGraphSCC &SCC) override;
@@ -69,39 +69,43 @@ public:
private:
TargetTransformInfoWrapperPass *TTIWP;
+
};
} // end anonymous namespace
char SimpleInliner::ID = 0;
-INITIALIZE_PASS_BEGIN(SimpleInliner, "inline",
- "Function Integration/Inlining", false, false)
+INITIALIZE_PASS_BEGIN(SimpleInliner, "inline", "Function Integration/Inlining",
+ false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(SimpleInliner, "inline",
- "Function Integration/Inlining", false, false)
+INITIALIZE_PASS_END(SimpleInliner, "inline", "Function Integration/Inlining",
+ false, false)
Pass *llvm::createFunctionInliningPass() { return new SimpleInliner(); }
Pass *llvm::createFunctionInliningPass(int Threshold) {
- return new SimpleInliner(Threshold);
+ return new SimpleInliner(llvm::getInlineParams(Threshold));
}
Pass *llvm::createFunctionInliningPass(unsigned OptLevel,
unsigned SizeOptLevel) {
- return new SimpleInliner(
- llvm::computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
+ return new SimpleInliner(llvm::getInlineParams(OptLevel, SizeOptLevel));
+}
+
+Pass *llvm::createFunctionInliningPass(InlineParams &Params) {
+ return new SimpleInliner(Params);
}
bool SimpleInliner::runOnSCC(CallGraphSCC &SCC) {
TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
- return Inliner::runOnSCC(SCC);
+ return LegacyInlinerBase::runOnSCC(SCC);
}
void SimpleInliner::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<TargetTransformInfoWrapperPass>();
- Inliner::getAnalysisUsage(AU);
+ LegacyInlinerBase::getAnalysisUsage(AU);
}
diff --git a/contrib/llvm/lib/Transforms/IPO/Inliner.cpp b/contrib/llvm/lib/Transforms/IPO/Inliner.cpp
index 79535ca..3f4731c 100644
--- a/contrib/llvm/lib/Transforms/IPO/Inliner.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/Inliner.cpp
@@ -13,6 +13,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/IPO/Inliner.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -20,19 +21,21 @@
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO/InlinerPass.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
using namespace llvm;
#define DEBUG_TYPE "inline"
@@ -47,15 +50,44 @@ STATISTIC(NumMergedAllocas, "Number of allocas merged together");
// if those would be more profitable and blocked inline steps.
STATISTIC(NumCallerCallersAnalyzed, "Number of caller-callers analyzed");
-Inliner::Inliner(char &ID) : CallGraphSCCPass(ID), InsertLifetime(true) {}
-
-Inliner::Inliner(char &ID, bool InsertLifetime)
+/// Flag to disable manual alloca merging.
+///
+/// Merging of allocas was originally done as a stack-size saving technique
+/// prior to LLVM's code generator having support for stack coloring based on
+/// lifetime markers. It is now in the process of being removed. To experiment
+/// with disabling it and relying fully on lifetime marker based stack
+/// coloring, you can pass this flag to LLVM.
+static cl::opt<bool>
+ DisableInlinedAllocaMerging("disable-inlined-alloca-merging",
+ cl::init(false), cl::Hidden);
+
+namespace {
+enum class InlinerFunctionImportStatsOpts {
+ No = 0,
+ Basic = 1,
+ Verbose = 2,
+};
+
+cl::opt<InlinerFunctionImportStatsOpts> InlinerFunctionImportStats(
+ "inliner-function-import-stats",
+ cl::init(InlinerFunctionImportStatsOpts::No),
+ cl::values(clEnumValN(InlinerFunctionImportStatsOpts::Basic, "basic",
+ "basic statistics"),
+ clEnumValN(InlinerFunctionImportStatsOpts::Verbose, "verbose",
+ "printing of statistics for each inlined function")),
+ cl::Hidden, cl::desc("Enable inliner stats for imported functions"));
+} // namespace
+
+LegacyInlinerBase::LegacyInlinerBase(char &ID)
+ : CallGraphSCCPass(ID), InsertLifetime(true) {}
+
+LegacyInlinerBase::LegacyInlinerBase(char &ID, bool InsertLifetime)
: CallGraphSCCPass(ID), InsertLifetime(InsertLifetime) {}
/// For this class, we declare that we require and preserve the call graph.
/// If the derived class implements this method, it should
/// always explicitly call the implementation here.
-void Inliner::getAnalysisUsage(AnalysisUsage &AU) const {
+void LegacyInlinerBase::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<ProfileSummaryInfoWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
@@ -63,62 +95,33 @@ void Inliner::getAnalysisUsage(AnalysisUsage &AU) const {
CallGraphSCCPass::getAnalysisUsage(AU);
}
+typedef DenseMap<ArrayType *, std::vector<AllocaInst *>> InlinedArrayAllocasTy;
-typedef DenseMap<ArrayType*, std::vector<AllocaInst*> >
-InlinedArrayAllocasTy;
-
-/// If it is possible to inline the specified call site,
-/// do so and update the CallGraph for this operation.
+/// Look at all of the allocas that we inlined through this call site. If we
+/// have already inlined other allocas through other calls into this function,
+/// then we know that they have disjoint lifetimes and that we can merge them.
///
-/// This function also does some basic book-keeping to update the IR. The
-/// InlinedArrayAllocas map keeps track of any allocas that are already
-/// available from other functions inlined into the caller. If we are able to
-/// inline this call site we attempt to reuse already available allocas or add
-/// any new allocas to the set if not possible.
-static bool InlineCallIfPossible(Pass &P, CallSite CS, InlineFunctionInfo &IFI,
- InlinedArrayAllocasTy &InlinedArrayAllocas,
- int InlineHistory, bool InsertLifetime) {
- Function *Callee = CS.getCalledFunction();
- Function *Caller = CS.getCaller();
-
- // We need to manually construct BasicAA directly in order to disable
- // its use of other function analyses.
- BasicAAResult BAR(createLegacyPMBasicAAResult(P, *Callee));
-
- // Construct our own AA results for this function. We do this manually to
- // work around the limitations of the legacy pass manager.
- AAResults AAR(createLegacyPMAAResults(P, *Callee, BAR));
-
- // Try to inline the function. Get the list of static allocas that were
- // inlined.
- if (!InlineFunction(CS, IFI, &AAR, InsertLifetime))
- return false;
-
- AttributeFuncs::mergeAttributesForInlining(*Caller, *Callee);
+/// There are many heuristics possible for merging these allocas, and the
+/// different options have different tradeoffs. One thing that we *really*
+/// don't want to hurt is SRoA: once inlining happens, often allocas are no
+/// longer address taken and so they can be promoted.
+///
+/// Our "solution" for that is to only merge allocas whose outermost type is an
+/// array type. These are usually not promoted because someone is using a
+/// variable index into them. These are also often the most important ones to
+/// merge.
+///
+/// A better solution would be to have real memory lifetime markers in the IR
+/// and not have the inliner do any merging of allocas at all. This would
+/// allow the backend to do proper stack slot coloring of all allocas that
+/// *actually make it to the backend*, which is really what we want.
+///
+/// Because we don't have this information, we do this simple and useful hack.
+static void mergeInlinedArrayAllocas(
+ Function *Caller, InlineFunctionInfo &IFI,
+ InlinedArrayAllocasTy &InlinedArrayAllocas, int InlineHistory) {
+ SmallPtrSet<AllocaInst *, 16> UsedAllocas;
- // Look at all of the allocas that we inlined through this call site. If we
- // have already inlined other allocas through other calls into this function,
- // then we know that they have disjoint lifetimes and that we can merge them.
- //
- // There are many heuristics possible for merging these allocas, and the
- // different options have different tradeoffs. One thing that we *really*
- // don't want to hurt is SRoA: once inlining happens, often allocas are no
- // longer address taken and so they can be promoted.
- //
- // Our "solution" for that is to only merge allocas whose outermost type is an
- // array type. These are usually not promoted because someone is using a
- // variable index into them. These are also often the most important ones to
- // merge.
- //
- // A better solution would be to have real memory lifetime markers in the IR
- // and not have the inliner do any merging of allocas at all. This would
- // allow the backend to do proper stack slot coloring of all allocas that
- // *actually make it to the backend*, which is really what we want.
- //
- // Because we don't have this information, we do this simple and useful hack.
- //
- SmallPtrSet<AllocaInst*, 16> UsedAllocas;
-
// When processing our SCC, check to see if CS was inlined from some other
// call site. For example, if we're processing "A" in this code:
// A() { B() }
@@ -131,25 +134,25 @@ static bool InlineCallIfPossible(Pass &P, CallSite CS, InlineFunctionInfo &IFI,
// because their scopes are not disjoint. We could make this smarter by
// keeping track of the inline history for each alloca in the
// InlinedArrayAllocas but this isn't likely to be a significant win.
- if (InlineHistory != -1) // Only do merging for top-level call sites in SCC.
- return true;
-
+ if (InlineHistory != -1) // Only do merging for top-level call sites in SCC.
+ return;
+
// Loop over all the allocas we have so far and see if they can be merged with
// a previously inlined alloca. If not, remember that we had it.
- for (unsigned AllocaNo = 0, e = IFI.StaticAllocas.size();
- AllocaNo != e; ++AllocaNo) {
+ for (unsigned AllocaNo = 0, e = IFI.StaticAllocas.size(); AllocaNo != e;
+ ++AllocaNo) {
AllocaInst *AI = IFI.StaticAllocas[AllocaNo];
-
+
// Don't bother trying to merge array allocations (they will usually be
// canonicalized to be an allocation *of* an array), or allocations whose
// type is not itself an array (because we're afraid of pessimizing SRoA).
ArrayType *ATy = dyn_cast<ArrayType>(AI->getAllocatedType());
if (!ATy || AI->isArrayAllocation())
continue;
-
+
// Get the list of all available allocas for this array type.
- std::vector<AllocaInst*> &AllocasForType = InlinedArrayAllocas[ATy];
-
+ std::vector<AllocaInst *> &AllocasForType = InlinedArrayAllocas[ATy];
+
// Loop over the allocas in AllocasForType to see if we can reuse one. Note
// that we have to be careful not to reuse the same "available" alloca for
// multiple different allocas that we just inlined, we use the 'UsedAllocas'
@@ -160,24 +163,24 @@ static bool InlineCallIfPossible(Pass &P, CallSite CS, InlineFunctionInfo &IFI,
unsigned Align1 = AI->getAlignment(),
Align2 = AvailableAlloca->getAlignment();
-
+
// The available alloca has to be in the right function, not in some other
// function in this SCC.
if (AvailableAlloca->getParent() != AI->getParent())
continue;
-
+
// If the inlined function already uses this alloca then we can't reuse
// it.
if (!UsedAllocas.insert(AvailableAlloca).second)
continue;
-
+
// Otherwise, we *can* reuse it, RAUW AI into AvailableAlloca and declare
// success!
- DEBUG(dbgs() << " ***MERGED ALLOCA: " << *AI << "\n\t\tINTO: "
- << *AvailableAlloca << '\n');
-
+ DEBUG(dbgs() << " ***MERGED ALLOCA: " << *AI
+ << "\n\t\tINTO: " << *AvailableAlloca << '\n');
+
// Move affected dbg.declare calls immediately after the new alloca to
- // avoid the situation when a dbg.declare preceeds its alloca.
+ // avoid the situation when a dbg.declare precedes its alloca.
if (auto *L = LocalAsMetadata::getIfExists(AI))
if (auto *MDV = MetadataAsValue::getIfExists(AI->getContext(), L))
for (User *U : MDV->users())
@@ -209,7 +212,7 @@ static bool InlineCallIfPossible(Pass &P, CallSite CS, InlineFunctionInfo &IFI,
// If we already nuked the alloca, we're done with it.
if (MergedAwayAlloca)
continue;
-
+
// If we were unable to merge away the alloca either because there are no
// allocas of the right type available or because we reused them all
// already, remember that this alloca came from an inlined function and mark
@@ -218,19 +221,51 @@ static bool InlineCallIfPossible(Pass &P, CallSite CS, InlineFunctionInfo &IFI,
AllocasForType.push_back(AI);
UsedAllocas.insert(AI);
}
-
- return true;
}
-static void emitAnalysis(CallSite CS, const Twine &Msg) {
+/// If it is possible to inline the specified call site,
+/// do so and update the CallGraph for this operation.
+///
+/// This function also does some basic book-keeping to update the IR. The
+/// InlinedArrayAllocas map keeps track of any allocas that are already
+/// available from other functions inlined into the caller. If we are able to
+/// inline this call site we attempt to reuse already available allocas or add
+/// any new allocas to the set if not possible.
+static bool InlineCallIfPossible(
+ CallSite CS, InlineFunctionInfo &IFI,
+ InlinedArrayAllocasTy &InlinedArrayAllocas, int InlineHistory,
+ bool InsertLifetime, function_ref<AAResults &(Function &)> &AARGetter,
+ ImportedFunctionsInliningStatistics &ImportedFunctionsStats) {
+ Function *Callee = CS.getCalledFunction();
Function *Caller = CS.getCaller();
- LLVMContext &Ctx = Caller->getContext();
- DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
- emitOptimizationRemarkAnalysis(Ctx, DEBUG_TYPE, *Caller, DLoc, Msg);
+
+ AAResults &AAR = AARGetter(*Callee);
+
+ // Try to inline the function. Get the list of static allocas that were
+ // inlined.
+ if (!InlineFunction(CS, IFI, &AAR, InsertLifetime))
+ return false;
+
+ if (InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No)
+ ImportedFunctionsStats.recordInline(*Caller, *Callee);
+
+ AttributeFuncs::mergeAttributesForInlining(*Caller, *Callee);
+
+ if (!DisableInlinedAllocaMerging)
+ mergeInlinedArrayAllocas(Caller, IFI, InlinedArrayAllocas, InlineHistory);
+
+ return true;
}
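As a purely illustrative aside (not part of the patch), the array-alloca merging heuristic documented above is easiest to see at the source level: two inlined uses of a fixed-size array local have disjoint lifetimes in the caller, so the pass can reuse one stack slot for both, while scalar locals are left alone so SRoA can still promote them.

// Sketch of the situation mergeInlinedArrayAllocas targets. After both calls
// to fill_and_sum() are inlined into caller(), the two buf[64] allocas never
// live at the same time, so one may be RAUW'd into the other.
static int fill_and_sum(int seed) {
  int buf[64];
  for (int i = 0; i < 64; ++i) buf[i] = seed + i;
  int s = 0;
  for (int i = 0; i < 64; ++i) s += buf[i];
  return s;
}

int caller(int a, int b) {
  return fill_and_sum(a) + fill_and_sum(b); // two inlined array allocas, one slot
}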
-bool Inliner::shouldBeDeferred(Function *Caller, CallSite CS, InlineCost IC,
- int &TotalSecondaryCost) {
+/// Return true if inlining of CS can block the caller from being
+/// inlined, in cases where inlining the caller would be more beneficial.
+/// \p IC is the estimated inline cost associated with callsite \p CS.
+/// \p TotalSecondaryCost will be set to the estimated cost of inlining the
+/// caller if \p CS is suppressed for inlining.
+static bool
+shouldBeDeferred(Function *Caller, CallSite CS, InlineCost IC,
+ int &TotalSecondaryCost,
+ function_ref<InlineCost(CallSite CS)> GetInlineCost) {
// For now we only handle local or inline functions.
if (!Caller->hasLocalLinkage() && !Caller->hasLinkOnceODRLinkage())
@@ -269,7 +304,7 @@ bool Inliner::shouldBeDeferred(Function *Caller, CallSite CS, InlineCost IC,
continue;
}
- InlineCost IC2 = getInlineCost(CS2);
+ InlineCost IC2 = GetInlineCost(CS2);
++NumCallerCallersAnalyzed;
if (!IC2) {
callerWillBeRemoved = false;
@@ -278,7 +313,7 @@ bool Inliner::shouldBeDeferred(Function *Caller, CallSite CS, InlineCost IC,
if (IC2.isAlways())
continue;
- // See if inlining or original callsite would erase the cost delta of
+ // See if inlining of the original callsite would erase the cost delta of
// this callsite. We subtract off the penalty for the call instruction,
// which we would be deleting.
if (IC2.getCostDelta() <= CandidateCost) {
@@ -291,7 +326,7 @@ bool Inliner::shouldBeDeferred(Function *Caller, CallSite CS, InlineCost IC,
// be removed entirely. We did not account for this above unless there
// is only one caller of Caller.
if (callerWillBeRemoved && !Caller->use_empty())
- TotalSecondaryCost += InlineConstants::LastCallToStaticBonus;
+ TotalSecondaryCost -= InlineConstants::LastCallToStaticBonus;
if (inliningPreventsSomeOuterInline && TotalSecondaryCost < IC.getCost())
return true;
@@ -300,63 +335,73 @@ bool Inliner::shouldBeDeferred(Function *Caller, CallSite CS, InlineCost IC,
}
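A hypothetical scenario for the deferral logic above (illustrative only; all names are invented): inlining a moderately sized callee into a small, internal caller can push that caller over the threshold at its own call sites, so suppressing the inner inline may be cheaper overall.

// shouldBeDeferred-style situation: 'helper' is internal and cheap to inline
// into each hot call site, but only while it stays small. Inlining 'work'
// into it first could block those more valuable outer inlines.
static int work(int x) { /* sizable body */ return x * x + 3 * x + 7; }
static int helper(int x) { return work(x) + 1; } // candidate call site CS
int hot_a(int x) { return helper(x); }           // callers examined by the
int hot_b(int x) { return helper(x); }           // deferral heuristic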
/// Return true if the inliner should attempt to inline at the given CallSite.
-bool Inliner::shouldInline(CallSite CS) {
- InlineCost IC = getInlineCost(CS);
-
+static bool shouldInline(CallSite CS,
+ function_ref<InlineCost(CallSite CS)> GetInlineCost,
+ OptimizationRemarkEmitter &ORE) {
+ using namespace ore;
+ InlineCost IC = GetInlineCost(CS);
+ Instruction *Call = CS.getInstruction();
+ Function *Callee = CS.getCalledFunction();
+
if (IC.isAlways()) {
DEBUG(dbgs() << " Inlining: cost=always"
- << ", Call: " << *CS.getInstruction() << "\n");
- emitAnalysis(CS, Twine(CS.getCalledFunction()->getName()) +
- " should always be inlined (cost=always)");
+ << ", Call: " << *CS.getInstruction() << "\n");
+ ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
+ << NV("Callee", Callee)
+ << " should always be inlined (cost=always)");
return true;
}
-
+
if (IC.isNever()) {
DEBUG(dbgs() << " NOT Inlining: cost=never"
- << ", Call: " << *CS.getInstruction() << "\n");
- emitAnalysis(CS, Twine(CS.getCalledFunction()->getName() +
- " should never be inlined (cost=never)"));
+ << ", Call: " << *CS.getInstruction() << "\n");
+ ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "NeverInline", Call)
+ << NV("Callee", Callee)
+ << " should never be inlined (cost=never)");
return false;
}
-
+
Function *Caller = CS.getCaller();
if (!IC) {
DEBUG(dbgs() << " NOT Inlining: cost=" << IC.getCost()
- << ", thres=" << (IC.getCostDelta() + IC.getCost())
- << ", Call: " << *CS.getInstruction() << "\n");
- emitAnalysis(CS, Twine(CS.getCalledFunction()->getName() +
- " too costly to inline (cost=") +
- Twine(IC.getCost()) + ", threshold=" +
- Twine(IC.getCostDelta() + IC.getCost()) + ")");
+ << ", thres=" << (IC.getCostDelta() + IC.getCost())
+ << ", Call: " << *CS.getInstruction() << "\n");
+ ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
+ << NV("Callee", Callee) << " too costly to inline (cost="
+ << NV("Cost", IC.getCost()) << ", threshold="
+ << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
return false;
}
int TotalSecondaryCost = 0;
- if (shouldBeDeferred(Caller, CS, IC, TotalSecondaryCost)) {
+ if (shouldBeDeferred(Caller, CS, IC, TotalSecondaryCost, GetInlineCost)) {
DEBUG(dbgs() << " NOT Inlining: " << *CS.getInstruction()
- << " Cost = " << IC.getCost()
- << ", outer Cost = " << TotalSecondaryCost << '\n');
- emitAnalysis(CS, Twine("Not inlining. Cost of inlining " +
- CS.getCalledFunction()->getName() +
- " increases the cost of inlining " +
- CS.getCaller()->getName() + " in other contexts"));
+ << " Cost = " << IC.getCost()
+ << ", outer Cost = " << TotalSecondaryCost << '\n');
+ ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE,
+ "IncreaseCostInOtherContexts", Call)
+ << "Not inlining. Cost of inlining " << NV("Callee", Callee)
+ << " increases the cost of inlining " << NV("Caller", Caller)
+ << " in other contexts");
return false;
}
DEBUG(dbgs() << " Inlining: cost=" << IC.getCost()
- << ", thres=" << (IC.getCostDelta() + IC.getCost())
- << ", Call: " << *CS.getInstruction() << '\n');
- emitAnalysis(
- CS, CS.getCalledFunction()->getName() + Twine(" can be inlined into ") +
- CS.getCaller()->getName() + " with cost=" + Twine(IC.getCost()) +
- " (threshold=" + Twine(IC.getCostDelta() + IC.getCost()) + ")");
+ << ", thres=" << (IC.getCostDelta() + IC.getCost())
+ << ", Call: " << *CS.getInstruction() << '\n');
+ ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBeInlined", Call)
+ << NV("Callee", Callee) << " can be inlined into "
+ << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
+ << " (threshold="
+ << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
return true;
}
/// Return true if the specified inline history ID
/// indicates an inline history that includes the specified function.
-static bool InlineHistoryIncludes(Function *F, int InlineHistoryID,
- const SmallVectorImpl<std::pair<Function*, int> > &InlineHistory) {
+static bool InlineHistoryIncludes(
+ Function *F, int InlineHistoryID,
+ const SmallVectorImpl<std::pair<Function *, int>> &InlineHistory) {
while (InlineHistoryID != -1) {
assert(unsigned(InlineHistoryID) < InlineHistory.size() &&
"Invalid inline history ID");
@@ -367,23 +412,32 @@ static bool InlineHistoryIncludes(Function *F, int InlineHistoryID,
return false;
}
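For illustration (not from the patch), the inline-history check above is what keeps the inliner from unrolling mutual recursion indefinitely: once a callee already appears in the history chain recorded for a call site, that site is skipped.

// Inlining pong() into ping() exposes a new call to ping(); without the
// history ID recorded for that new call site, the inliner could keep
// re-inlining the same pair of functions forever.
int ping(int n);
int pong(int n) { return n > 0 ? ping(n - 1) : 0; }
int ping(int n) { return n > 0 ? pong(n - 1) : 1; }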
-bool Inliner::runOnSCC(CallGraphSCC &SCC) {
+bool LegacyInlinerBase::doInitialization(CallGraph &CG) {
+ if (InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No)
+ ImportedFunctionsStats.setModuleInfo(CG.getModule());
+ return false; // No changes to CallGraph.
+}
+
+bool LegacyInlinerBase::runOnSCC(CallGraphSCC &SCC) {
if (skipSCC(SCC))
return false;
return inlineCalls(SCC);
}
-bool Inliner::inlineCalls(CallGraphSCC &SCC) {
- CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
- ACT = &getAnalysis<AssumptionCacheTracker>();
- PSI = getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(CG.getModule());
- auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
-
- SmallPtrSet<Function*, 8> SCCFunctions;
+static bool
+inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
+ std::function<AssumptionCache &(Function &)> GetAssumptionCache,
+ ProfileSummaryInfo *PSI, TargetLibraryInfo &TLI,
+ bool InsertLifetime,
+ function_ref<InlineCost(CallSite CS)> GetInlineCost,
+ function_ref<AAResults &(Function &)> AARGetter,
+ ImportedFunctionsInliningStatistics &ImportedFunctionsStats) {
+ SmallPtrSet<Function *, 8> SCCFunctions;
DEBUG(dbgs() << "Inliner visiting SCC:");
for (CallGraphNode *Node : SCC) {
Function *F = Node->getFunction();
- if (F) SCCFunctions.insert(F);
+ if (F)
+ SCCFunctions.insert(F);
DEBUG(dbgs() << " " << (F ? F->getName() : "INDIRECTNODE"));
}
@@ -391,17 +445,19 @@ bool Inliner::inlineCalls(CallGraphSCC &SCC) {
// inline call sites in the original functions, not call sites that result
// from inlining other functions.
SmallVector<std::pair<CallSite, int>, 16> CallSites;
-
+
// When inlining a callee produces new call sites, we want to keep track of
// the fact that they were inlined from the callee. This allows us to avoid
// infinite inlining in some obscure cases. To represent this, we use an
// index into the InlineHistory vector.
- SmallVector<std::pair<Function*, int>, 8> InlineHistory;
+ SmallVector<std::pair<Function *, int>, 8> InlineHistory;
for (CallGraphNode *Node : SCC) {
Function *F = Node->getFunction();
- if (!F) continue;
-
+ if (!F || F->isDeclaration())
+ continue;
+
+ OptimizationRemarkEmitter ORE(F);
for (BasicBlock &BB : *F)
for (Instruction &I : BB) {
CallSite CS(cast<Value>(&I));
@@ -409,14 +465,21 @@ bool Inliner::inlineCalls(CallGraphSCC &SCC) {
// never be inlined.
if (!CS || isa<IntrinsicInst>(I))
continue;
-
+
// If this is a direct call to an external function, we can never inline
// it. If it is an indirect call, inlining may resolve it to be a
// direct call, so we keep it.
if (Function *Callee = CS.getCalledFunction())
- if (Callee->isDeclaration())
+ if (Callee->isDeclaration()) {
+ using namespace ore;
+ ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NoDefinition", &I)
+ << NV("Callee", Callee) << " will not be inlined into "
+ << NV("Caller", CS.getCaller())
+ << " because its definition is unavailable"
+ << setIsVerbose());
continue;
-
+ }
+
CallSites.push_back(std::make_pair(CS, -1));
}
}
@@ -435,9 +498,8 @@ bool Inliner::inlineCalls(CallGraphSCC &SCC) {
if (SCCFunctions.count(F))
std::swap(CallSites[i--], CallSites[--FirstCallInSCC]);
-
InlinedArrayAllocasTy InlinedArrayAllocas;
- InlineFunctionInfo InlineInfo(&CG, ACT);
+ InlineFunctionInfo InlineInfo(&CG, &GetAssumptionCache);
// Now that we have all of the call sites, loop over them and inline them if
// it looks profitable to do so.
@@ -450,7 +512,7 @@ bool Inliner::inlineCalls(CallGraphSCC &SCC) {
// CallSites may be modified inside so ranged for loop can not be used.
for (unsigned CSi = 0; CSi != CallSites.size(); ++CSi) {
CallSite CS = CallSites[CSi].first;
-
+
Function *Caller = CS.getCaller();
Function *Callee = CS.getCalledFunction();
@@ -459,16 +521,17 @@ bool Inliner::inlineCalls(CallGraphSCC &SCC) {
// size. This happens because IPSCCP propagates the result out of the
// call and then we're left with the dead call.
if (isInstructionTriviallyDead(CS.getInstruction(), &TLI)) {
- DEBUG(dbgs() << " -> Deleting dead call: "
- << *CS.getInstruction() << "\n");
+ DEBUG(dbgs() << " -> Deleting dead call: " << *CS.getInstruction()
+ << "\n");
// Update the call graph by deleting the edge from Callee to Caller.
CG[Caller]->removeCallEdgeFor(CS);
CS.getInstruction()->eraseFromParent();
++NumCallsDeleted;
} else {
// We can only inline direct calls to non-declarations.
- if (!Callee || Callee->isDeclaration()) continue;
-
+ if (!Callee || Callee->isDeclaration())
+ continue;
+
// If this call site was obtained by inlining another function, verify
// that the include path for the function did not include the callee
// itself. If so, we'd be recursively inlining the same function,
@@ -478,37 +541,42 @@ bool Inliner::inlineCalls(CallGraphSCC &SCC) {
if (InlineHistoryID != -1 &&
InlineHistoryIncludes(Callee, InlineHistoryID, InlineHistory))
continue;
-
- LLVMContext &CallerCtx = Caller->getContext();
// Get DebugLoc to report. CS will be invalid after Inliner.
DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
+ BasicBlock *Block = CS.getParent();
+ // FIXME for new PM: because of the old PM we currently generate ORE and
+ // in turn BFI on demand. With the new PM, the ORE dependency should
+ // just become a regular analysis dependency.
+ OptimizationRemarkEmitter ORE(Caller);
// If the policy determines that we should inline this function,
// try to do so.
- if (!shouldInline(CS)) {
- emitOptimizationRemarkMissed(CallerCtx, DEBUG_TYPE, *Caller, DLoc,
- Twine(Callee->getName() +
- " will not be inlined into " +
- Caller->getName()));
+ using namespace ore;
+ if (!shouldInline(CS, GetInlineCost, ORE)) {
+ ORE.emit(
+ OptimizationRemarkMissed(DEBUG_TYPE, "NotInlined", DLoc, Block)
+ << NV("Callee", Callee) << " will not be inlined into "
+ << NV("Caller", Caller));
continue;
}
// Attempt to inline the function.
- if (!InlineCallIfPossible(*this, CS, InlineInfo, InlinedArrayAllocas,
- InlineHistoryID, InsertLifetime)) {
- emitOptimizationRemarkMissed(CallerCtx, DEBUG_TYPE, *Caller, DLoc,
- Twine(Callee->getName() +
- " will not be inlined into " +
- Caller->getName()));
+ if (!InlineCallIfPossible(CS, InlineInfo, InlinedArrayAllocas,
+ InlineHistoryID, InsertLifetime, AARGetter,
+ ImportedFunctionsStats)) {
+ ORE.emit(
+ OptimizationRemarkMissed(DEBUG_TYPE, "NotInlined", DLoc, Block)
+ << NV("Callee", Callee) << " will not be inlined into "
+ << NV("Caller", Caller));
continue;
}
++NumInlined;
// Report the inline decision.
- emitOptimizationRemark(
- CallerCtx, DEBUG_TYPE, *Caller, DLoc,
- Twine(Callee->getName() + " inlined into " + Caller->getName()));
+ ORE.emit(OptimizationRemark(DEBUG_TYPE, "Inlined", DLoc, Block)
+ << NV("Callee", Callee) << " inlined into "
+ << NV("Caller", Caller));
// If inlining this function gave us any new call sites, throw them
// onto our worklist to process. They are useful inline candidates.
@@ -522,30 +590,30 @@ bool Inliner::inlineCalls(CallGraphSCC &SCC) {
CallSites.push_back(std::make_pair(CallSite(Ptr), NewHistoryID));
}
}
-
+
// If we inlined or deleted the last possible call site to the function,
// delete the function body now.
if (Callee && Callee->use_empty() && Callee->hasLocalLinkage() &&
// TODO: Can remove if in SCC now.
!SCCFunctions.count(Callee) &&
-
+
// The function may be apparently dead, but if there are indirect
// callgraph references to the node, we cannot delete it yet, this
// could invalidate the CGSCC iterator.
CG[Callee]->getNumReferences() == 0) {
- DEBUG(dbgs() << " -> Deleting dead function: "
- << Callee->getName() << "\n");
+ DEBUG(dbgs() << " -> Deleting dead function: " << Callee->getName()
+ << "\n");
CallGraphNode *CalleeNode = CG[Callee];
// Remove any call graph edges from the callee to its callees.
CalleeNode->removeAllCalledFunctions();
-
+
// Removing the node for callee from the call graph and delete it.
delete CG.removeFunctionFromModule(CalleeNode);
++NumDeleted;
}
- // Remove this call site from the list. If possible, use
+ // Remove this call site from the list. If possible, use
// swap/pop_back for efficiency, but do not use it if doing so would
// move a call site to a function in this SCC before the
// 'FirstCallInSCC' barrier.
@@ -553,7 +621,7 @@ bool Inliner::inlineCalls(CallGraphSCC &SCC) {
CallSites[CSi] = CallSites.back();
CallSites.pop_back();
} else {
- CallSites.erase(CallSites.begin()+CSi);
+ CallSites.erase(CallSites.begin() + CSi);
}
--CSi;
@@ -565,17 +633,43 @@ bool Inliner::inlineCalls(CallGraphSCC &SCC) {
return Changed;
}
+bool LegacyInlinerBase::inlineCalls(CallGraphSCC &SCC) {
+ CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+ ACT = &getAnalysis<AssumptionCacheTracker>();
+ PSI = getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ // We compute dedicated AA results for each function in the SCC as needed. We
+ // use a lambda referencing external objects so that they live long enough to
+ // be queried, but we re-use them each time.
+ Optional<BasicAAResult> BAR;
+ Optional<AAResults> AAR;
+ auto AARGetter = [&](Function &F) -> AAResults & {
+ BAR.emplace(createLegacyPMBasicAAResult(*this, F));
+ AAR.emplace(createLegacyPMAAResults(*this, F, *BAR));
+ return *AAR;
+ };
+ auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
+ return ACT->getAssumptionCache(F);
+ };
+ return inlineCallsImpl(SCC, CG, GetAssumptionCache, PSI, TLI, InsertLifetime,
+ [this](CallSite CS) { return getInlineCost(CS); },
+ AARGetter, ImportedFunctionsStats);
+}
+
/// Remove now-dead linkonce functions at the end of
/// processing to avoid breaking the SCC traversal.
-bool Inliner::doFinalization(CallGraph &CG) {
+bool LegacyInlinerBase::doFinalization(CallGraph &CG) {
+ if (InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No)
+ ImportedFunctionsStats.dump(InlinerFunctionImportStats ==
+ InlinerFunctionImportStatsOpts::Verbose);
return removeDeadFunctions(CG);
}
/// Remove dead functions that are not included in DNR (Do Not Remove) list.
-bool Inliner::removeDeadFunctions(CallGraph &CG, bool AlwaysInlineOnly) {
- SmallVector<CallGraphNode*, 16> FunctionsToRemove;
- SmallVector<CallGraphNode *, 16> DeadFunctionsInComdats;
- SmallDenseMap<const Comdat *, int, 16> ComdatEntriesAlive;
+bool LegacyInlinerBase::removeDeadFunctions(CallGraph &CG,
+ bool AlwaysInlineOnly) {
+ SmallVector<CallGraphNode *, 16> FunctionsToRemove;
+ SmallVector<Function *, 16> DeadFunctionsInComdats;
auto RemoveCGN = [&](CallGraphNode *CGN) {
// Remove any call graph edges from the function to its callees.
@@ -616,9 +710,8 @@ bool Inliner::removeDeadFunctions(CallGraph &CG, bool AlwaysInlineOnly) {
// The inliner doesn't visit non-function entities which are in COMDAT
// groups so it is unsafe to do so *unless* the linkage is local.
if (!F->hasLocalLinkage()) {
- if (const Comdat *C = F->getComdat()) {
- --ComdatEntriesAlive[C];
- DeadFunctionsInComdats.push_back(CGN);
+ if (F->hasComdat()) {
+ DeadFunctionsInComdats.push_back(F);
continue;
}
}
@@ -626,32 +719,11 @@ bool Inliner::removeDeadFunctions(CallGraph &CG, bool AlwaysInlineOnly) {
RemoveCGN(CGN);
}
if (!DeadFunctionsInComdats.empty()) {
- // Count up all the entities in COMDAT groups
- auto ComdatGroupReferenced = [&](const Comdat *C) {
- auto I = ComdatEntriesAlive.find(C);
- if (I != ComdatEntriesAlive.end())
- ++(I->getSecond());
- };
- for (const Function &F : CG.getModule())
- if (const Comdat *C = F.getComdat())
- ComdatGroupReferenced(C);
- for (const GlobalVariable &GV : CG.getModule().globals())
- if (const Comdat *C = GV.getComdat())
- ComdatGroupReferenced(C);
- for (const GlobalAlias &GA : CG.getModule().aliases())
- if (const Comdat *C = GA.getComdat())
- ComdatGroupReferenced(C);
- for (CallGraphNode *CGN : DeadFunctionsInComdats) {
- Function *F = CGN->getFunction();
- const Comdat *C = F->getComdat();
- int NumAlive = ComdatEntriesAlive[C];
- // We can remove functions in a COMDAT group if the entire group is dead.
- assert(NumAlive >= 0);
- if (NumAlive > 0)
- continue;
-
- RemoveCGN(CGN);
- }
+ // Filter out the functions whose comdats remain alive.
+ filterDeadComdatFunctions(CG.getModule(), DeadFunctionsInComdats);
+ // Remove the rest.
+ for (Function *F : DeadFunctionsInComdats)
+ RemoveCGN(CG[F]);
}
if (FunctionsToRemove.empty())
@@ -665,12 +737,201 @@ bool Inliner::removeDeadFunctions(CallGraph &CG, bool AlwaysInlineOnly) {
// here to do this, it doesn't matter which order the functions are deleted
// in.
array_pod_sort(FunctionsToRemove.begin(), FunctionsToRemove.end());
- FunctionsToRemove.erase(std::unique(FunctionsToRemove.begin(),
- FunctionsToRemove.end()),
- FunctionsToRemove.end());
+ FunctionsToRemove.erase(
+ std::unique(FunctionsToRemove.begin(), FunctionsToRemove.end()),
+ FunctionsToRemove.end());
for (CallGraphNode *CGN : FunctionsToRemove) {
delete CG.removeFunctionFromModule(CGN);
++NumDeleted;
}
return true;
}
+
+PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
+ CGSCCAnalysisManager &AM, LazyCallGraph &CG,
+ CGSCCUpdateResult &UR) {
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(InitialC, CG)
+ .getManager();
+ const ModuleAnalysisManager &MAM =
+ AM.getResult<ModuleAnalysisManagerCGSCCProxy>(InitialC, CG).getManager();
+ bool Changed = false;
+
+ assert(InitialC.size() > 0 && "Cannot handle an empty SCC!");
+ Module &M = *InitialC.begin()->getFunction().getParent();
+ ProfileSummaryInfo *PSI = MAM.getCachedResult<ProfileSummaryAnalysis>(M);
+
+ std::function<AssumptionCache &(Function &)> GetAssumptionCache =
+ [&](Function &F) -> AssumptionCache & {
+ return FAM.getResult<AssumptionAnalysis>(F);
+ };
+
+ // Set up the data structure used to plumb customization into the
+ // `InlineFunction` routine.
+ InlineFunctionInfo IFI(/*cg=*/nullptr, &GetAssumptionCache);
+
+ auto GetInlineCost = [&](CallSite CS) {
+ Function &Callee = *CS.getCalledFunction();
+ auto &CalleeTTI = FAM.getResult<TargetIRAnalysis>(Callee);
+ return getInlineCost(CS, Params, CalleeTTI, GetAssumptionCache, PSI);
+ };
+
+ // We use a worklist of nodes to process so that we can handle the case where
+ // the SCC structure changes and some nodes are no longer part of the current
+ // SCC. As a consequence, we also need an updatable pointer for the SCC.
+ SmallVector<LazyCallGraph::Node *, 16> Nodes;
+ for (auto &N : InitialC)
+ Nodes.push_back(&N);
+ auto *C = &InitialC;
+ auto *RC = &C->getOuterRefSCC();
+
+ // We also use a secondary worklist of call sites within a particular node to
+ // allow quickly continuing to inline through newly inlined call sites where
+ // possible.
+ SmallVector<std::pair<CallSite, int>, 16> Calls;
+
+ // When inlining a callee produces new call sites, we want to keep track of
+ // the fact that they were inlined from the callee. This allows us to avoid
+ // infinite inlining in some obscure cases. To represent this, we use an
+ // index into the InlineHistory vector.
+ SmallVector<std::pair<Function *, int>, 16> InlineHistory;
+
+ // Track a set vector of inlined callees so that we can augment the caller
+ // with all of their edges in the call graph before pruning out the ones that
+ // got simplified away.
+ SmallSetVector<Function *, 4> InlinedCallees;
+
+ // Track the dead functions to delete once finished with inlining calls. We
+ // defer deleting these to make it easier to handle the call graph updates.
+ SmallVector<Function *, 4> DeadFunctions;
+
+ do {
+ auto &N = *Nodes.pop_back_val();
+ if (CG.lookupSCC(N) != C)
+ continue;
+ Function &F = N.getFunction();
+ if (F.hasFnAttribute(Attribute::OptimizeNone))
+ continue;
+
+ // Get the remarks emission analysis for the caller.
+ auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
+ // We want to generally process call sites top-down in order for
+ // simplifications stemming from replacing the call with the returned value
+ // after inlining to be visible to subsequent inlining decisions. So we
+ // walk the function backwards and then process the back of the vector.
+ // FIXME: Using reverse is a really bad way to do this. Instead we should
+ // do an actual PO walk of the function body.
+ for (Instruction &I : reverse(instructions(F)))
+ if (auto CS = CallSite(&I))
+ if (Function *Callee = CS.getCalledFunction())
+ if (!Callee->isDeclaration())
+ Calls.push_back({CS, -1});
+
+ bool DidInline = false;
+ while (!Calls.empty()) {
+ int InlineHistoryID;
+ CallSite CS;
+ std::tie(CS, InlineHistoryID) = Calls.pop_back_val();
+ Function &Callee = *CS.getCalledFunction();
+
+ if (InlineHistoryID != -1 &&
+ InlineHistoryIncludes(&Callee, InlineHistoryID, InlineHistory))
+ continue;
+
+ // Check whether we want to inline this callsite.
+ if (!shouldInline(CS, GetInlineCost, ORE))
+ continue;
+
+ if (!InlineFunction(CS, IFI))
+ continue;
+ DidInline = true;
+ InlinedCallees.insert(&Callee);
+
+ // Add any new callsites to defined functions to the worklist.
+ if (!IFI.InlinedCallSites.empty()) {
+ int NewHistoryID = InlineHistory.size();
+ InlineHistory.push_back({&Callee, InlineHistoryID});
+ for (CallSite &CS : reverse(IFI.InlinedCallSites))
+ if (Function *NewCallee = CS.getCalledFunction())
+ if (!NewCallee->isDeclaration())
+ Calls.push_back({CS, NewHistoryID});
+ }
+
+ // Merge the attributes based on the inlining.
+ AttributeFuncs::mergeAttributesForInlining(F, Callee);
+
+ // For local functions, check whether this makes the callee trivially
+ // dead. In that case, we can drop the body of the function eagerly
+ // which may reduce the number of callers of other functions to one,
+ // changing inline cost thresholds.
+ if (Callee.hasLocalLinkage()) {
+ // To check this we also need to nuke any dead constant uses (perhaps
+ // made dead by this operation on other functions).
+ Callee.removeDeadConstantUsers();
+ if (Callee.use_empty()) {
+ // Clear the body and queue the function itself for deletion when we
+ // finish inlining and call graph updates.
+ // Note that after this point, it is an error to do anything other
+ // than use the callee's address or delete it.
+ Callee.dropAllReferences();
+ assert(find(DeadFunctions, &Callee) == DeadFunctions.end() &&
+ "Cannot put cause a function to become dead twice!");
+ DeadFunctions.push_back(&Callee);
+ }
+ }
+ }
+
+ if (!DidInline)
+ continue;
+ Changed = true;
+
+ // Add all the inlined callees' edges as ref edges to the caller. These are
+ // by definition trivial edges as we always have *some* transitive ref edge
+ // chain. While in some cases these edges are direct calls inside the
+ // callee, they have to be modeled in the inliner as reference edges as
+ // there may be a reference edge anywhere along the chain from the current
+ // caller to the callee that causes the whole thing to appear like
+ // a (transitive) reference edge that will require promotion to a call edge
+ // below.
+ for (Function *InlinedCallee : InlinedCallees) {
+ LazyCallGraph::Node &CalleeN = *CG.lookup(*InlinedCallee);
+ for (LazyCallGraph::Edge &E : CalleeN)
+ RC->insertTrivialRefEdge(N, *E.getNode());
+ }
+ InlinedCallees.clear();
+
+ // At this point, since we have made changes we have at least removed
+ // a call instruction. However, in the process we do some incremental
+ // simplification of the surrounding code. This simplification can
+ // essentially do all of the same things as a function pass and we can
+ // re-use the exact same logic for updating the call graph to reflect the
+ // change.
+ C = &updateCGAndAnalysisManagerForFunctionPass(CG, *C, N, AM, UR);
+ RC = &C->getOuterRefSCC();
+ } while (!Nodes.empty());
+
+ // Now that we've finished inlining all of the calls across this SCC, delete
+ // all of the trivially dead functions, updating the call graph and the CGSCC
+ // pass manager in the process.
+ //
+ // Note that this walks a pointer set which has non-deterministic order but
+ // that is OK as all we do is delete things and add pointers to unordered
+ // sets.
+ for (Function *DeadF : DeadFunctions) {
+ // Get the necessary information out of the call graph and nuke the
+ // function there.
+ auto &DeadC = *CG.lookupSCC(*CG.lookup(*DeadF));
+ auto &DeadRC = DeadC.getOuterRefSCC();
+ CG.removeDeadFunction(*DeadF);
+
+ // Mark the relevant parts of the call graph as invalid so we don't visit
+ // them.
+ UR.InvalidatedSCCs.insert(&DeadC);
+ UR.InvalidatedRefSCCs.insert(&DeadRC);
+
+ // And delete the actual function from the module.
+ M.getFunctionList().erase(DeadF);
+ }
+ return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
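A hedged sketch of how the new InlinerPass defined above might be scheduled (illustrative only; the InlinerPass constructor defaulting to getInlineParams() and the adaptor name are assumptions based on the headers this file includes, not shown in the patch):

#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/IPO/Inliner.h"
using namespace llvm;

static void addInlinerToPipeline(ModulePassManager &MPM) {
  CGSCCPassManager CGPM;
  CGPM.addPass(InlinerPass()); // assumed to default to getInlineParams()
  // Run the CGSCC pipeline bottom-up over the module's lazy call graph.
  MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
}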
diff --git a/contrib/llvm/lib/Transforms/IPO/Internalize.cpp b/contrib/llvm/lib/Transforms/IPO/Internalize.cpp
index 8c5c6f7..26db146 100644
--- a/contrib/llvm/lib/Transforms/IPO/Internalize.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/Internalize.cpp
@@ -239,7 +239,7 @@ bool InternalizePass::internalizeModule(Module &M, CallGraph *CG) {
InternalizePass::InternalizePass() : MustPreserveGV(PreserveAPIList()) {}
-PreservedAnalyses InternalizePass::run(Module &M, AnalysisManager<Module> &AM) {
+PreservedAnalyses InternalizePass::run(Module &M, ModuleAnalysisManager &AM) {
if (!internalizeModule(M, AM.getCachedResult<CallGraphAnalysis>(M)))
return PreservedAnalyses::all();
diff --git a/contrib/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/contrib/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index 36089f0..deb7e81 100644
--- a/contrib/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -13,8 +13,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/IPO/LowerTypeTests.h"
-#include "llvm/Transforms/IPO.h"
#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Triple.h"
#include "llvm/IR/Constant.h"
@@ -23,18 +23,27 @@
#include "llvm/IR/GlobalObject.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSummaryIndexYAML.h"
#include "llvm/IR/Operator.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/TrailingObjects.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
using namespace llvm;
using namespace lowertypetests;
+using SummaryAction = LowerTypeTestsSummaryAction;
+
#define DEBUG_TYPE "lowertypetests"
STATISTIC(ByteArraySizeBits, "Byte array size in bits");
@@ -48,6 +57,26 @@ static cl::opt<bool> AvoidReuse(
cl::desc("Try to avoid reuse of byte array addresses using aliases"),
cl::Hidden, cl::init(true));
+static cl::opt<SummaryAction> ClSummaryAction(
+ "lowertypetests-summary-action",
+ cl::desc("What to do with the summary when running this pass"),
+ cl::values(clEnumValN(SummaryAction::None, "none", "Do nothing"),
+ clEnumValN(SummaryAction::Import, "import",
+ "Import typeid resolutions from summary and globals"),
+ clEnumValN(SummaryAction::Export, "export",
+ "Export typeid resolutions to summary and globals")),
+ cl::Hidden);
+
+static cl::opt<std::string> ClReadSummary(
+ "lowertypetests-read-summary",
+ cl::desc("Read summary from given YAML file before running pass"),
+ cl::Hidden);
+
+static cl::opt<std::string> ClWriteSummary(
+ "lowertypetests-write-summary",
+ cl::desc("Write summary to given YAML file after running pass"),
+ cl::Hidden);
+
bool BitSetInfo::containsGlobalOffset(uint64_t Offset) const {
if (Offset < ByteOffset)
return false;
@@ -62,39 +91,6 @@ bool BitSetInfo::containsGlobalOffset(uint64_t Offset) const {
return Bits.count(BitOffset);
}
-bool BitSetInfo::containsValue(
- const DataLayout &DL,
- const DenseMap<GlobalObject *, uint64_t> &GlobalLayout, Value *V,
- uint64_t COffset) const {
- if (auto GV = dyn_cast<GlobalObject>(V)) {
- auto I = GlobalLayout.find(GV);
- if (I == GlobalLayout.end())
- return false;
- return containsGlobalOffset(I->second + COffset);
- }
-
- if (auto GEP = dyn_cast<GEPOperator>(V)) {
- APInt APOffset(DL.getPointerSizeInBits(0), 0);
- bool Result = GEP->accumulateConstantOffset(DL, APOffset);
- if (!Result)
- return false;
- COffset += APOffset.getZExtValue();
- return containsValue(DL, GlobalLayout, GEP->getPointerOperand(),
- COffset);
- }
-
- if (auto Op = dyn_cast<Operator>(V)) {
- if (Op->getOpcode() == Instruction::BitCast)
- return containsValue(DL, GlobalLayout, Op->getOperand(0), COffset);
-
- if (Op->getOpcode() == Instruction::Select)
- return containsValue(DL, GlobalLayout, Op->getOperand(1), COffset) &&
- containsValue(DL, GlobalLayout, Op->getOperand(2), COffset);
- }
-
- return false;
-}
-
void BitSetInfo::print(raw_ostream &OS) const {
OS << "offset " << ByteOffset << " size " << BitSize << " align "
<< (1 << AlignLog2);
@@ -201,59 +197,169 @@ struct ByteArrayInfo {
std::set<uint64_t> Bits;
uint64_t BitSize;
GlobalVariable *ByteArray;
- Constant *Mask;
+ GlobalVariable *MaskGlobal;
};
-struct LowerTypeTests : public ModulePass {
- static char ID;
- LowerTypeTests() : ModulePass(ID) {
- initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry());
+/// A POD-like structure that we use to store a global reference together with
+/// its metadata types. In this pass we frequently need to query the set of
+/// metadata types referenced by a global, which at the IR level is an expensive
+/// operation involving a map lookup; this data structure helps to reduce the
+/// number of times we need to do this lookup.
+class GlobalTypeMember final : TrailingObjects<GlobalTypeMember, MDNode *> {
+ GlobalObject *GO;
+ size_t NTypes;
+
+ friend TrailingObjects;
+ size_t numTrailingObjects(OverloadToken<MDNode *>) const { return NTypes; }
+
+public:
+ static GlobalTypeMember *create(BumpPtrAllocator &Alloc, GlobalObject *GO,
+ ArrayRef<MDNode *> Types) {
+ auto *GTM = static_cast<GlobalTypeMember *>(Alloc.Allocate(
+ totalSizeToAlloc<MDNode *>(Types.size()), alignof(GlobalTypeMember)));
+ GTM->GO = GO;
+ GTM->NTypes = Types.size();
+ std::uninitialized_copy(Types.begin(), Types.end(),
+ GTM->getTrailingObjects<MDNode *>());
+ return GTM;
}
+ GlobalObject *getGlobal() const {
+ return GO;
+ }
+ ArrayRef<MDNode *> types() const {
+ return makeArrayRef(getTrailingObjects<MDNode *>(), NTypes);
+ }
+};
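The GlobalTypeMember class above relies on LLVM's TrailingObjects to co-allocate the MDNode* array with the object itself. A minimal, self-contained sketch of the same idiom follows (illustrative only; IntList is an invented example, the real class is the one in the patch):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/TrailingObjects.h"
#include <memory>

// Header and payload live in one BumpPtrAllocator allocation; the trailing
// int array starts immediately after the IntList object.
class IntList final : llvm::TrailingObjects<IntList, int> {
  size_t N;

  friend TrailingObjects;
  size_t numTrailingObjects(OverloadToken<int>) const { return N; }

public:
  static IntList *create(llvm::BumpPtrAllocator &Alloc,
                         llvm::ArrayRef<int> Vals) {
    auto *IL = static_cast<IntList *>(
        Alloc.Allocate(totalSizeToAlloc<int>(Vals.size()), alignof(IntList)));
    IL->N = Vals.size();
    std::uninitialized_copy(Vals.begin(), Vals.end(),
                            IL->getTrailingObjects<int>());
    return IL;
  }
  llvm::ArrayRef<int> values() const {
    return llvm::makeArrayRef(getTrailingObjects<int>(), N);
  }
};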
- Module *M;
+class LowerTypeTestsModule {
+ Module &M;
+
+ SummaryAction Action;
+ ModuleSummaryIndex *Summary;
bool LinkerSubsectionsViaSymbols;
Triple::ArchType Arch;
+ Triple::OSType OS;
Triple::ObjectFormatType ObjectFormat;
- IntegerType *Int1Ty;
- IntegerType *Int8Ty;
- IntegerType *Int32Ty;
- Type *Int32PtrTy;
- IntegerType *Int64Ty;
- IntegerType *IntPtrTy;
+
+ IntegerType *Int1Ty = Type::getInt1Ty(M.getContext());
+ IntegerType *Int8Ty = Type::getInt8Ty(M.getContext());
+ PointerType *Int8PtrTy = Type::getInt8PtrTy(M.getContext());
+ IntegerType *Int32Ty = Type::getInt32Ty(M.getContext());
+ PointerType *Int32PtrTy = PointerType::getUnqual(Int32Ty);
+ IntegerType *Int64Ty = Type::getInt64Ty(M.getContext());
+ IntegerType *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext(), 0);
+
+ // Indirect function call index assignment counter for WebAssembly
+ uint64_t IndirectIndex = 1;
// Mapping from type identifiers to the call sites that test them.
DenseMap<Metadata *, std::vector<CallInst *>> TypeTestCallSites;
+ /// This structure describes how to lower type tests for a particular type
+ /// identifier. It is either built directly from the global analysis (during
+ /// regular LTO or the regular LTO phase of ThinLTO), or indirectly using type
+ /// identifier summaries and external symbol references (in ThinLTO backends).
+ struct TypeIdLowering {
+ TypeTestResolution::Kind TheKind;
+
+ /// All except Unsat: the start address within the combined global.
+ Constant *OffsetedGlobal;
+
+ /// ByteArray, Inline, AllOnes: log2 of the required global alignment
+ /// relative to the start address.
+ Constant *AlignLog2;
+
+ /// ByteArray, Inline, AllOnes: one less than the size of the memory region
+ /// covering members of this type identifier as a multiple of 2^AlignLog2.
+ Constant *SizeM1;
+
+ /// ByteArray, Inline, AllOnes: range of SizeM1 expressed as a bit width.
+ unsigned SizeM1BitWidth;
+
+ /// ByteArray: the byte array to test the address against.
+ Constant *TheByteArray;
+
+ /// ByteArray: the bit mask to apply to bytes loaded from the byte array.
+ Constant *BitMask;
+
+ /// Inline: the bit mask to test the address against.
+ Constant *InlineBits;
+ };
+
std::vector<ByteArrayInfo> ByteArrayInfos;
+ Function *WeakInitializerFn = nullptr;
+
BitSetInfo
buildBitSet(Metadata *TypeId,
- const DenseMap<GlobalObject *, uint64_t> &GlobalLayout);
+ const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout);
ByteArrayInfo *createByteArray(BitSetInfo &BSI);
void allocateByteArrays();
- Value *createBitSetTest(IRBuilder<> &B, BitSetInfo &BSI, ByteArrayInfo *&BAI,
+ Value *createBitSetTest(IRBuilder<> &B, const TypeIdLowering &TIL,
Value *BitOffset);
- void
- lowerTypeTestCalls(ArrayRef<Metadata *> TypeIds, Constant *CombinedGlobalAddr,
- const DenseMap<GlobalObject *, uint64_t> &GlobalLayout);
- Value *
- lowerBitSetCall(CallInst *CI, BitSetInfo &BSI, ByteArrayInfo *&BAI,
- Constant *CombinedGlobal,
- const DenseMap<GlobalObject *, uint64_t> &GlobalLayout);
+ void lowerTypeTestCalls(
+ ArrayRef<Metadata *> TypeIds, Constant *CombinedGlobalAddr,
+ const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout);
+ Value *lowerTypeTestCall(Metadata *TypeId, CallInst *CI,
+ const TypeIdLowering &TIL);
void buildBitSetsFromGlobalVariables(ArrayRef<Metadata *> TypeIds,
- ArrayRef<GlobalVariable *> Globals);
+ ArrayRef<GlobalTypeMember *> Globals);
unsigned getJumpTableEntrySize();
Type *getJumpTableEntryType();
- Constant *createJumpTableEntry(GlobalObject *Src, Function *Dest,
- unsigned Distance);
+ void createJumpTableEntry(raw_ostream &AsmOS, raw_ostream &ConstraintOS,
+ SmallVectorImpl<Value *> &AsmArgs, Function *Dest);
void verifyTypeMDNode(GlobalObject *GO, MDNode *Type);
void buildBitSetsFromFunctions(ArrayRef<Metadata *> TypeIds,
- ArrayRef<Function *> Functions);
+ ArrayRef<GlobalTypeMember *> Functions);
+ void buildBitSetsFromFunctionsNative(ArrayRef<Metadata *> TypeIds,
+ ArrayRef<GlobalTypeMember *> Functions);
+ void buildBitSetsFromFunctionsWASM(ArrayRef<Metadata *> TypeIds,
+ ArrayRef<GlobalTypeMember *> Functions);
void buildBitSetsFromDisjointSet(ArrayRef<Metadata *> TypeIds,
- ArrayRef<GlobalObject *> Globals);
+ ArrayRef<GlobalTypeMember *> Globals);
+
+ void replaceWeakDeclarationWithJumpTablePtr(Function *F, Constant *JT);
+ void moveInitializerToModuleConstructor(GlobalVariable *GV);
+ void findGlobalVariableUsersOf(Constant *C,
+ SmallSetVector<GlobalVariable *, 8> &Out);
+
+ void createJumpTable(Function *F, ArrayRef<GlobalTypeMember *> Functions);
+
+public:
+ LowerTypeTestsModule(Module &M, SummaryAction Action,
+ ModuleSummaryIndex *Summary);
bool lower();
- bool runOnModule(Module &M) override;
+
+ // Lower the module using the action and summary passed as command line
+ // arguments. For testing purposes only.
+ static bool runForTesting(Module &M);
+};
+
+struct LowerTypeTests : public ModulePass {
+ static char ID;
+
+ bool UseCommandLine = false;
+
+ SummaryAction Action;
+ ModuleSummaryIndex *Summary;
+
+ LowerTypeTests() : ModulePass(ID), UseCommandLine(true) {
+ initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry());
+ }
+
+ LowerTypeTests(SummaryAction Action, ModuleSummaryIndex *Summary)
+ : ModulePass(ID), Action(Action), Summary(Summary) {
+ initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+ if (UseCommandLine)
+ return LowerTypeTestsModule::runForTesting(M);
+ return LowerTypeTestsModule(M, Action, Summary).lower();
+ }
};
} // anonymous namespace
@@ -262,27 +368,28 @@ INITIALIZE_PASS(LowerTypeTests, "lowertypetests", "Lower type metadata", false,
false)
char LowerTypeTests::ID = 0;
-ModulePass *llvm::createLowerTypeTestsPass() { return new LowerTypeTests; }
+ModulePass *llvm::createLowerTypeTestsPass(SummaryAction Action,
+ ModuleSummaryIndex *Summary) {
+ return new LowerTypeTests(Action, Summary);
+}
/// Build a bit set for TypeId using the object layouts in
/// GlobalLayout.
-BitSetInfo LowerTypeTests::buildBitSet(
+BitSetInfo LowerTypeTestsModule::buildBitSet(
Metadata *TypeId,
- const DenseMap<GlobalObject *, uint64_t> &GlobalLayout) {
+ const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout) {
BitSetBuilder BSB;
// Compute the byte offset of each address associated with this type
// identifier.
- SmallVector<MDNode *, 2> Types;
for (auto &GlobalAndOffset : GlobalLayout) {
- Types.clear();
- GlobalAndOffset.first->getMetadata(LLVMContext::MD_type, Types);
- for (MDNode *Type : Types) {
+ for (MDNode *Type : GlobalAndOffset.first->types()) {
if (Type->getOperand(1) != TypeId)
continue;
uint64_t Offset =
- cast<ConstantInt>(cast<ConstantAsMetadata>(Type->getOperand(0))
- ->getValue())->getZExtValue();
+ cast<ConstantInt>(
+ cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
+ ->getZExtValue();
BSB.addOffset(GlobalAndOffset.second + Offset);
}
}
@@ -305,14 +412,14 @@ static Value *createMaskedBitTest(IRBuilder<> &B, Value *Bits,
return B.CreateICmpNE(MaskedBits, ConstantInt::get(BitsType, 0));
}
-ByteArrayInfo *LowerTypeTests::createByteArray(BitSetInfo &BSI) {
+ByteArrayInfo *LowerTypeTestsModule::createByteArray(BitSetInfo &BSI) {
// Create globals to stand in for byte arrays and masks. These never actually
// get initialized; we RAUW and erase them later in allocateByteArrays() once
// we know the offset and mask to use.
auto ByteArrayGlobal = new GlobalVariable(
- *M, Int8Ty, /*isConstant=*/true, GlobalValue::PrivateLinkage, nullptr);
- auto MaskGlobal = new GlobalVariable(
- *M, Int8Ty, /*isConstant=*/true, GlobalValue::PrivateLinkage, nullptr);
+ M, Int8Ty, /*isConstant=*/true, GlobalValue::PrivateLinkage, nullptr);
+ auto MaskGlobal = new GlobalVariable(M, Int8Ty, /*isConstant=*/true,
+ GlobalValue::PrivateLinkage, nullptr);
ByteArrayInfos.emplace_back();
ByteArrayInfo *BAI = &ByteArrayInfos.back();
@@ -320,11 +427,11 @@ ByteArrayInfo *LowerTypeTests::createByteArray(BitSetInfo &BSI) {
BAI->Bits = BSI.Bits;
BAI->BitSize = BSI.BitSize;
BAI->ByteArray = ByteArrayGlobal;
- BAI->Mask = ConstantExpr::getPtrToInt(MaskGlobal, Int8Ty);
+ BAI->MaskGlobal = MaskGlobal;
return BAI;
}
-void LowerTypeTests::allocateByteArrays() {
+void LowerTypeTestsModule::allocateByteArrays() {
std::stable_sort(ByteArrayInfos.begin(), ByteArrayInfos.end(),
[](const ByteArrayInfo &BAI1, const ByteArrayInfo &BAI2) {
return BAI1.BitSize > BAI2.BitSize;
@@ -339,13 +446,14 @@ void LowerTypeTests::allocateByteArrays() {
uint8_t Mask;
BAB.allocate(BAI->Bits, BAI->BitSize, ByteArrayOffsets[I], Mask);
- BAI->Mask->replaceAllUsesWith(ConstantInt::get(Int8Ty, Mask));
- cast<GlobalVariable>(BAI->Mask->getOperand(0))->eraseFromParent();
+ BAI->MaskGlobal->replaceAllUsesWith(
+ ConstantExpr::getIntToPtr(ConstantInt::get(Int8Ty, Mask), Int8PtrTy));
+ BAI->MaskGlobal->eraseFromParent();
}
- Constant *ByteArrayConst = ConstantDataArray::get(M->getContext(), BAB.Bytes);
+ Constant *ByteArrayConst = ConstantDataArray::get(M.getContext(), BAB.Bytes);
auto ByteArray =
- new GlobalVariable(*M, ByteArrayConst->getType(), /*isConstant=*/true,
+ new GlobalVariable(M, ByteArrayConst->getType(), /*isConstant=*/true,
GlobalValue::PrivateLinkage, ByteArrayConst);
for (unsigned I = 0; I != ByteArrayInfos.size(); ++I) {
@@ -363,7 +471,7 @@ void LowerTypeTests::allocateByteArrays() {
BAI->ByteArray->replaceAllUsesWith(GEP);
} else {
GlobalAlias *Alias = GlobalAlias::create(
- Int8Ty, 0, GlobalValue::PrivateLinkage, "bits", GEP, M);
+ Int8Ty, 0, GlobalValue::PrivateLinkage, "bits", GEP, &M);
BAI->ByteArray->replaceAllUsesWith(Alias);
}
BAI->ByteArray->eraseFromParent();
@@ -375,63 +483,84 @@ void LowerTypeTests::allocateByteArrays() {
ByteArraySizeBytes = BAB.Bytes.size();
}
-/// Build a test that bit BitOffset is set in BSI, where
-/// BitSetGlobal is a global containing the bits in BSI.
-Value *LowerTypeTests::createBitSetTest(IRBuilder<> &B, BitSetInfo &BSI,
- ByteArrayInfo *&BAI, Value *BitOffset) {
- if (BSI.BitSize <= 64) {
+/// Build a test that bit BitOffset is set in the type identifier that was
+/// lowered to TIL, which must be either an Inline or a ByteArray.
+Value *LowerTypeTestsModule::createBitSetTest(IRBuilder<> &B,
+ const TypeIdLowering &TIL,
+ Value *BitOffset) {
+ if (TIL.TheKind == TypeTestResolution::Inline) {
// If the bit set is sufficiently small, we can avoid a load by bit testing
// a constant.
- IntegerType *BitsTy;
- if (BSI.BitSize <= 32)
- BitsTy = Int32Ty;
- else
- BitsTy = Int64Ty;
-
- uint64_t Bits = 0;
- for (auto Bit : BSI.Bits)
- Bits |= uint64_t(1) << Bit;
- Constant *BitsConst = ConstantInt::get(BitsTy, Bits);
- return createMaskedBitTest(B, BitsConst, BitOffset);
+ return createMaskedBitTest(B, TIL.InlineBits, BitOffset);
} else {
- if (!BAI) {
- ++NumByteArraysCreated;
- BAI = createByteArray(BSI);
- }
-
- Constant *ByteArray = BAI->ByteArray;
- Type *Ty = BAI->ByteArray->getValueType();
+ Constant *ByteArray = TIL.TheByteArray;
if (!LinkerSubsectionsViaSymbols && AvoidReuse) {
// Each use of the byte array uses a different alias. This makes the
// backend less likely to reuse previously computed byte array addresses,
// improving the security of the CFI mechanism based on this pass.
- ByteArray = GlobalAlias::create(BAI->ByteArray->getValueType(), 0,
- GlobalValue::PrivateLinkage, "bits_use",
- ByteArray, M);
+ ByteArray = GlobalAlias::create(Int8Ty, 0, GlobalValue::PrivateLinkage,
+ "bits_use", ByteArray, &M);
}
- Value *ByteAddr = B.CreateGEP(Ty, ByteArray, BitOffset);
+ Value *ByteAddr = B.CreateGEP(Int8Ty, ByteArray, BitOffset);
Value *Byte = B.CreateLoad(ByteAddr);
- Value *ByteAndMask = B.CreateAnd(Byte, BAI->Mask);
+ Value *ByteAndMask =
+ B.CreateAnd(Byte, ConstantExpr::getPtrToInt(TIL.BitMask, Int8Ty));
return B.CreateICmpNE(ByteAndMask, ConstantInt::get(Int8Ty, 0));
}
}
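Stripped of the IR-building, the two tests emitted above amount to the following plain C++ (a sketch over already-materialized values; Bits, ByteArray and Mask stand for the constants and globals the pass would otherwise reference):

#include <cstdint>

// Inline kind: the whole bit set fits in a 32- or 64-bit constant, so the
// test is a masked bit test against that constant (cf. createMaskedBitTest).
bool inlineKindTest(uint64_t Bits, uint64_t BitOffset, unsigned BitWidth) {
  uint64_t Index = BitOffset & (BitWidth - 1);
  return (Bits & (uint64_t(1) << Index)) != 0;
}

// ByteArray kind: one byte per possible offset, shared between type ids by
// giving each its own one-bit mask, so the test is a load plus an AND.
bool byteArrayKindTest(const uint8_t *ByteArray, uint8_t Mask,
                       uint64_t BitOffset) {
  return (ByteArray[BitOffset] & Mask) != 0;
}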
+static bool isKnownTypeIdMember(Metadata *TypeId, const DataLayout &DL,
+ Value *V, uint64_t COffset) {
+ if (auto GV = dyn_cast<GlobalObject>(V)) {
+ SmallVector<MDNode *, 2> Types;
+ GV->getMetadata(LLVMContext::MD_type, Types);
+ for (MDNode *Type : Types) {
+ if (Type->getOperand(1) != TypeId)
+ continue;
+ uint64_t Offset =
+ cast<ConstantInt>(
+ cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
+ ->getZExtValue();
+ if (COffset == Offset)
+ return true;
+ }
+ return false;
+ }
+
+ if (auto GEP = dyn_cast<GEPOperator>(V)) {
+ APInt APOffset(DL.getPointerSizeInBits(0), 0);
+ bool Result = GEP->accumulateConstantOffset(DL, APOffset);
+ if (!Result)
+ return false;
+ COffset += APOffset.getZExtValue();
+ return isKnownTypeIdMember(TypeId, DL, GEP->getPointerOperand(), COffset);
+ }
+
+ if (auto Op = dyn_cast<Operator>(V)) {
+ if (Op->getOpcode() == Instruction::BitCast)
+ return isKnownTypeIdMember(TypeId, DL, Op->getOperand(0), COffset);
+
+ if (Op->getOpcode() == Instruction::Select)
+ return isKnownTypeIdMember(TypeId, DL, Op->getOperand(1), COffset) &&
+ isKnownTypeIdMember(TypeId, DL, Op->getOperand(2), COffset);
+ }
+
+ return false;
+}
+
/// Lower a llvm.type.test call to its implementation. Returns the value to
/// replace the call with.
-Value *LowerTypeTests::lowerBitSetCall(
- CallInst *CI, BitSetInfo &BSI, ByteArrayInfo *&BAI,
- Constant *CombinedGlobalIntAddr,
- const DenseMap<GlobalObject *, uint64_t> &GlobalLayout) {
- Value *Ptr = CI->getArgOperand(0);
- const DataLayout &DL = M->getDataLayout();
-
- if (BSI.containsValue(DL, GlobalLayout, Ptr))
- return ConstantInt::getTrue(M->getContext());
+Value *LowerTypeTestsModule::lowerTypeTestCall(Metadata *TypeId, CallInst *CI,
+ const TypeIdLowering &TIL) {
+ if (TIL.TheKind == TypeTestResolution::Unsat)
+ return ConstantInt::getFalse(M.getContext());
- Constant *OffsetedGlobalAsInt = ConstantExpr::getAdd(
- CombinedGlobalIntAddr, ConstantInt::get(IntPtrTy, BSI.ByteOffset));
+ Value *Ptr = CI->getArgOperand(0);
+ const DataLayout &DL = M.getDataLayout();
+ if (isKnownTypeIdMember(TypeId, DL, Ptr, 0))
+ return ConstantInt::getTrue(M.getContext());
BasicBlock *InitialBB = CI->getParent();
@@ -439,36 +568,36 @@ Value *LowerTypeTests::lowerBitSetCall(
Value *PtrAsInt = B.CreatePtrToInt(Ptr, IntPtrTy);
- if (BSI.isSingleOffset())
+ Constant *OffsetedGlobalAsInt =
+ ConstantExpr::getPtrToInt(TIL.OffsetedGlobal, IntPtrTy);
+ if (TIL.TheKind == TypeTestResolution::Single)
return B.CreateICmpEQ(PtrAsInt, OffsetedGlobalAsInt);
Value *PtrOffset = B.CreateSub(PtrAsInt, OffsetedGlobalAsInt);
- Value *BitOffset;
- if (BSI.AlignLog2 == 0) {
- BitOffset = PtrOffset;
- } else {
- // We need to check that the offset both falls within our range and is
- // suitably aligned. We can check both properties at the same time by
- // performing a right rotate by log2(alignment) followed by an integer
- // comparison against the bitset size. The rotate will move the lower
- // order bits that need to be zero into the higher order bits of the
- // result, causing the comparison to fail if they are nonzero. The rotate
- // also conveniently gives us a bit offset to use during the load from
- // the bitset.
- Value *OffsetSHR =
- B.CreateLShr(PtrOffset, ConstantInt::get(IntPtrTy, BSI.AlignLog2));
- Value *OffsetSHL = B.CreateShl(
- PtrOffset,
- ConstantInt::get(IntPtrTy, DL.getPointerSizeInBits(0) - BSI.AlignLog2));
- BitOffset = B.CreateOr(OffsetSHR, OffsetSHL);
- }
-
- Constant *BitSizeConst = ConstantInt::get(IntPtrTy, BSI.BitSize);
- Value *OffsetInRange = B.CreateICmpULT(BitOffset, BitSizeConst);
+ // We need to check that the offset both falls within our range and is
+ // suitably aligned. We can check both properties at the same time by
+ // performing a right rotate by log2(alignment) followed by an integer
+ // comparison against the bitset size. The rotate will move the lower
+ // order bits that need to be zero into the higher order bits of the
+ // result, causing the comparison to fail if they are nonzero. The rotate
+ // also conveniently gives us a bit offset to use during the load from
+ // the bitset.
+ Value *OffsetSHR =
+ B.CreateLShr(PtrOffset, ConstantExpr::getZExt(TIL.AlignLog2, IntPtrTy));
+ Value *OffsetSHL = B.CreateShl(
+ PtrOffset, ConstantExpr::getZExt(
+ ConstantExpr::getSub(
+ ConstantInt::get(Int8Ty, DL.getPointerSizeInBits(0)),
+ TIL.AlignLog2),
+ IntPtrTy));
+ Value *BitOffset = B.CreateOr(OffsetSHR, OffsetSHL);
+
+ Constant *BitSizeConst = ConstantExpr::getZExt(TIL.SizeM1, IntPtrTy);
+ Value *OffsetInRange = B.CreateICmpULE(BitOffset, BitSizeConst);
// If the bit set is all ones, testing against it is unnecessary.
- if (BSI.isAllOnes())
+ if (TIL.TheKind == TypeTestResolution::AllOnes)
return OffsetInRange;
TerminatorInst *Term = SplitBlockAndInsertIfThen(OffsetInRange, CI, false);
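A worked restatement of the rotate trick described in the comment above, as standalone C++ with assumed example numbers (a type identifier whose members span five aligned slots, so AlignLog2 = 3 and SizeM1 = 4):

#include <cstdint>

// Rotate PtrOffset right by AlignLog2 within a PtrBits-wide word, then do an
// unsigned compare against SizeM1. Nonzero low bits (a misaligned offset) end
// up in the high bits and make the comparison fail; otherwise the result is
// the bit offset used for the load. AlignLog2 == 0 is special-cased only to
// keep the C++ shift well defined.
bool offsetInRangeAndAligned(uint64_t PtrOffset, unsigned AlignLog2,
                             uint64_t SizeM1, unsigned PtrBits = 64) {
  uint64_t Rot = AlignLog2 == 0 ? PtrOffset
                                : (PtrOffset >> AlignLog2) |
                                      (PtrOffset << (PtrBits - AlignLog2));
  return Rot <= SizeM1;
}

// offsetInRangeAndAligned(24, 3, 4) == true   (aligned, bit offset 3)
// offsetInRangeAndAligned(25, 3, 4) == false  (misaligned low bit rotates high)
// offsetInRangeAndAligned(48, 3, 4) == false  (bit offset 6 is out of range)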
@@ -476,7 +605,7 @@ Value *LowerTypeTests::lowerBitSetCall(
// Now that we know that the offset is in range and aligned, load the
// appropriate bit from the bitset.
- Value *Bit = createBitSetTest(ThenB, BSI, BAI, BitOffset);
+ Value *Bit = createBitSetTest(ThenB, TIL, BitOffset);
// The value we want is 0 if we came directly from the initial block
// (having failed the range or alignment checks), or the loaded bit if
@@ -490,17 +619,18 @@ Value *LowerTypeTests::lowerBitSetCall(
/// Given a disjoint set of type identifiers and globals, lay out the globals,
/// build the bit sets and lower the llvm.type.test calls.
-void LowerTypeTests::buildBitSetsFromGlobalVariables(
- ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalVariable *> Globals) {
+void LowerTypeTestsModule::buildBitSetsFromGlobalVariables(
+ ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Globals) {
// Build a new global with the combined contents of the referenced globals.
// This global is a struct whose even-indexed elements contain the original
// contents of the referenced globals and whose odd-indexed elements contain
// any padding required to align the next element to the next power of 2.
std::vector<Constant *> GlobalInits;
- const DataLayout &DL = M->getDataLayout();
- for (GlobalVariable *G : Globals) {
- GlobalInits.push_back(G->getInitializer());
- uint64_t InitSize = DL.getTypeAllocSize(G->getValueType());
+ const DataLayout &DL = M.getDataLayout();
+ for (GlobalTypeMember *G : Globals) {
+ GlobalVariable *GV = cast<GlobalVariable>(G->getGlobal());
+ GlobalInits.push_back(GV->getInitializer());
+ uint64_t InitSize = DL.getTypeAllocSize(GV->getValueType());
// Compute the amount of padding required.
uint64_t Padding = NextPowerOf2(InitSize - 1) - InitSize;
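The padding expression above rounds each member's size up to the next power of two (assuming NextPowerOf2 returns the next power of two strictly greater than its argument, as in MathExtras.h); a tiny standalone check with illustrative helper names:

#include <cstdint>

// Smallest power of two that is >= Size; equivalent to NextPowerOf2(Size - 1)
// for Size > 0 under the assumption stated above.
uint64_t roundUpToPowerOf2(uint64_t Size) {
  uint64_t P = 1;
  while (P < Size)
    P <<= 1;
  return P;
}

// Padding inserted after a member of the combined global.
uint64_t paddingAfter(uint64_t InitSize) {
  return roundUpToPowerOf2(InitSize) - InitSize;
}

// paddingAfter(5) == 3, paddingAfter(6) == 2, paddingAfter(8) == 0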
@@ -515,16 +645,16 @@ void LowerTypeTests::buildBitSetsFromGlobalVariables(
}
if (!GlobalInits.empty())
GlobalInits.pop_back();
- Constant *NewInit = ConstantStruct::getAnon(M->getContext(), GlobalInits);
+ Constant *NewInit = ConstantStruct::getAnon(M.getContext(), GlobalInits);
auto *CombinedGlobal =
- new GlobalVariable(*M, NewInit->getType(), /*isConstant=*/true,
+ new GlobalVariable(M, NewInit->getType(), /*isConstant=*/true,
GlobalValue::PrivateLinkage, NewInit);
StructType *NewTy = cast<StructType>(NewInit->getType());
const StructLayout *CombinedGlobalLayout = DL.getStructLayout(NewTy);
// Compute the offsets of the original globals within the new global.
- DenseMap<GlobalObject *, uint64_t> GlobalLayout;
+ DenseMap<GlobalTypeMember *, uint64_t> GlobalLayout;
for (unsigned I = 0; I != Globals.size(); ++I)
// Multiply by 2 to account for padding elements.
GlobalLayout[Globals[I]] = CombinedGlobalLayout->getElementOffset(I * 2);
@@ -535,31 +665,32 @@ void LowerTypeTests::buildBitSetsFromGlobalVariables(
// global from which we built the combined global, and replace references
// to the original globals with references to the aliases.
for (unsigned I = 0; I != Globals.size(); ++I) {
+ GlobalVariable *GV = cast<GlobalVariable>(Globals[I]->getGlobal());
+
// Multiply by 2 to account for padding elements.
Constant *CombinedGlobalIdxs[] = {ConstantInt::get(Int32Ty, 0),
ConstantInt::get(Int32Ty, I * 2)};
Constant *CombinedGlobalElemPtr = ConstantExpr::getGetElementPtr(
NewInit->getType(), CombinedGlobal, CombinedGlobalIdxs);
if (LinkerSubsectionsViaSymbols) {
- Globals[I]->replaceAllUsesWith(CombinedGlobalElemPtr);
+ GV->replaceAllUsesWith(CombinedGlobalElemPtr);
} else {
- assert(Globals[I]->getType()->getAddressSpace() == 0);
+ assert(GV->getType()->getAddressSpace() == 0);
GlobalAlias *GAlias = GlobalAlias::create(NewTy->getElementType(I * 2), 0,
- Globals[I]->getLinkage(), "",
- CombinedGlobalElemPtr, M);
- GAlias->setVisibility(Globals[I]->getVisibility());
- GAlias->takeName(Globals[I]);
- Globals[I]->replaceAllUsesWith(GAlias);
+ GV->getLinkage(), "",
+ CombinedGlobalElemPtr, &M);
+ GAlias->setVisibility(GV->getVisibility());
+ GAlias->takeName(GV);
+ GV->replaceAllUsesWith(GAlias);
}
- Globals[I]->eraseFromParent();
+ GV->eraseFromParent();
}
}
-void LowerTypeTests::lowerTypeTestCalls(
+void LowerTypeTestsModule::lowerTypeTestCalls(
ArrayRef<Metadata *> TypeIds, Constant *CombinedGlobalAddr,
- const DenseMap<GlobalObject *, uint64_t> &GlobalLayout) {
- Constant *CombinedGlobalIntAddr =
- ConstantExpr::getPtrToInt(CombinedGlobalAddr, IntPtrTy);
+ const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout) {
+ CombinedGlobalAddr = ConstantExpr::getBitCast(CombinedGlobalAddr, Int8PtrTy);
// For each type identifier in this disjoint set...
for (Metadata *TypeId : TypeIds) {
@@ -573,23 +704,52 @@ void LowerTypeTests::lowerTypeTestCalls(
BSI.print(dbgs());
});
- ByteArrayInfo *BAI = nullptr;
+ TypeIdLowering TIL;
+ TIL.OffsetedGlobal = ConstantExpr::getGetElementPtr(
+ Int8Ty, CombinedGlobalAddr, ConstantInt::get(IntPtrTy, BSI.ByteOffset));
+ TIL.AlignLog2 = ConstantInt::get(Int8Ty, BSI.AlignLog2);
+ if (BSI.isAllOnes()) {
+ TIL.TheKind = (BSI.BitSize == 1) ? TypeTestResolution::Single
+ : TypeTestResolution::AllOnes;
+ TIL.SizeM1BitWidth = (BSI.BitSize <= 128) ? 7 : 32;
+ TIL.SizeM1 = ConstantInt::get((BSI.BitSize <= 128) ? Int8Ty : Int32Ty,
+ BSI.BitSize - 1);
+ } else if (BSI.BitSize <= 64) {
+ TIL.TheKind = TypeTestResolution::Inline;
+ TIL.SizeM1BitWidth = (BSI.BitSize <= 32) ? 5 : 6;
+ TIL.SizeM1 = ConstantInt::get(Int8Ty, BSI.BitSize - 1);
+ uint64_t InlineBits = 0;
+ for (auto Bit : BSI.Bits)
+ InlineBits |= uint64_t(1) << Bit;
+ if (InlineBits == 0)
+ TIL.TheKind = TypeTestResolution::Unsat;
+ else
+ TIL.InlineBits = ConstantInt::get(
+ (BSI.BitSize <= 32) ? Int32Ty : Int64Ty, InlineBits);
+ } else {
+ TIL.TheKind = TypeTestResolution::ByteArray;
+ TIL.SizeM1BitWidth = (BSI.BitSize <= 128) ? 7 : 32;
+ TIL.SizeM1 = ConstantInt::get((BSI.BitSize <= 128) ? Int8Ty : Int32Ty,
+ BSI.BitSize - 1);
+ ++NumByteArraysCreated;
+ ByteArrayInfo *BAI = createByteArray(BSI);
+ TIL.TheByteArray = BAI->ByteArray;
+ TIL.BitMask = BAI->MaskGlobal;
+ }
// Lower each call to llvm.type.test for this type identifier.
for (CallInst *CI : TypeTestCallSites[TypeId]) {
++NumTypeTestCallsLowered;
- Value *Lowered =
- lowerBitSetCall(CI, BSI, BAI, CombinedGlobalIntAddr, GlobalLayout);
+ Value *Lowered = lowerTypeTestCall(TypeId, CI, TIL);
CI->replaceAllUsesWith(Lowered);
CI->eraseFromParent();
}
}
}
-void LowerTypeTests::verifyTypeMDNode(GlobalObject *GO, MDNode *Type) {
+void LowerTypeTestsModule::verifyTypeMDNode(GlobalObject *GO, MDNode *Type) {
if (Type->getNumOperands() != 2)
- report_fatal_error(
- "All operands of type metadata must have 2 elements");
+ report_fatal_error("All operands of type metadata must have 2 elements");
if (GO->isThreadLocal())
report_fatal_error("Bit set element may not be thread-local");
@@ -610,60 +770,172 @@ void LowerTypeTests::verifyTypeMDNode(GlobalObject *GO, MDNode *Type) {
}
static const unsigned kX86JumpTableEntrySize = 8;
+static const unsigned kARMJumpTableEntrySize = 4;
+
+unsigned LowerTypeTestsModule::getJumpTableEntrySize() {
+ switch (Arch) {
+ case Triple::x86:
+ case Triple::x86_64:
+ return kX86JumpTableEntrySize;
+ case Triple::arm:
+ case Triple::thumb:
+ case Triple::aarch64:
+ return kARMJumpTableEntrySize;
+ default:
+ report_fatal_error("Unsupported architecture for jump tables");
+ }
+}
-unsigned LowerTypeTests::getJumpTableEntrySize() {
- if (Arch != Triple::x86 && Arch != Triple::x86_64)
+// Create a jump table entry for the target. This consists of an instruction
+// sequence containing a relative branch to Dest. Appends inline asm text,
+// constraints and arguments to AsmOS, ConstraintOS and AsmArgs.
+void LowerTypeTestsModule::createJumpTableEntry(
+ raw_ostream &AsmOS, raw_ostream &ConstraintOS,
+ SmallVectorImpl<Value *> &AsmArgs, Function *Dest) {
+ unsigned ArgIndex = AsmArgs.size();
+
+ if (Arch == Triple::x86 || Arch == Triple::x86_64) {
+ AsmOS << "jmp ${" << ArgIndex << ":c}@plt\n";
+ AsmOS << "int3\nint3\nint3\n";
+ } else if (Arch == Triple::arm || Arch == Triple::aarch64) {
+ AsmOS << "b $" << ArgIndex << "\n";
+ } else if (Arch == Triple::thumb) {
+ AsmOS << "b.w $" << ArgIndex << "\n";
+ } else {
report_fatal_error("Unsupported architecture for jump tables");
+ }
- return kX86JumpTableEntrySize;
+ ConstraintOS << (ArgIndex > 0 ? ",s" : "s");
+ AsmArgs.push_back(Dest);
}
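For concreteness, a self-contained sketch of the text this accumulates on x86/x86_64, with two made-up targets f.cfi and g.cfi (the real routine appends Function* arguments rather than strings):

#include <string>
#include <vector>

struct JumpTableText {
  std::string Asm, Constraints;
  std::vector<std::string> Args; // stands in for the Function* asm arguments
};

// One x86 entry: a PLT-relative jmp padded to the 8-byte entry size with int3.
void appendX86Entry(JumpTableText &JT, const std::string &Dest) {
  unsigned ArgIndex = JT.Args.size();
  JT.Asm += "jmp ${" + std::to_string(ArgIndex) + ":c}@plt\n";
  JT.Asm += "int3\nint3\nint3\n";
  JT.Constraints += (ArgIndex > 0 ? ",s" : "s");
  JT.Args.push_back(Dest);
}

// After appendX86Entry(JT, "f.cfi") and appendX86Entry(JT, "g.cfi"):
//   JT.Asm         == "jmp ${0:c}@plt\nint3\nint3\nint3\n"
//                     "jmp ${1:c}@plt\nint3\nint3\nint3\n"
//   JT.Constraints == "s,s"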
-// Create a constant representing a jump table entry for the target. This
-// consists of an instruction sequence containing a relative branch to Dest. The
-// constant will be laid out at address Src+(Len*Distance) where Len is the
-// target-specific jump table entry size.
-Constant *LowerTypeTests::createJumpTableEntry(GlobalObject *Src,
- Function *Dest,
- unsigned Distance) {
- if (Arch != Triple::x86 && Arch != Triple::x86_64)
- report_fatal_error("Unsupported architecture for jump tables");
+Type *LowerTypeTestsModule::getJumpTableEntryType() {
+ return ArrayType::get(Int8Ty, getJumpTableEntrySize());
+}
- const unsigned kJmpPCRel32Code = 0xe9;
- const unsigned kInt3Code = 0xcc;
+/// Given a disjoint set of type identifiers and functions, build the bit sets
+/// and lower the llvm.type.test calls, architecture dependently.
+void LowerTypeTestsModule::buildBitSetsFromFunctions(
+ ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Functions) {
+ if (Arch == Triple::x86 || Arch == Triple::x86_64 || Arch == Triple::arm ||
+ Arch == Triple::thumb || Arch == Triple::aarch64)
+ buildBitSetsFromFunctionsNative(TypeIds, Functions);
+ else if (Arch == Triple::wasm32 || Arch == Triple::wasm64)
+ buildBitSetsFromFunctionsWASM(TypeIds, Functions);
+ else
+ report_fatal_error("Unsupported architecture for jump tables");
+}
- ConstantInt *Jmp = ConstantInt::get(Int8Ty, kJmpPCRel32Code);
+void LowerTypeTestsModule::moveInitializerToModuleConstructor(
+ GlobalVariable *GV) {
+ if (WeakInitializerFn == nullptr) {
+ WeakInitializerFn = Function::Create(
+ FunctionType::get(Type::getVoidTy(M.getContext()),
+ /* IsVarArg */ false),
+ GlobalValue::InternalLinkage, "__cfi_global_var_init", &M);
+ BasicBlock *BB =
+ BasicBlock::Create(M.getContext(), "entry", WeakInitializerFn);
+ ReturnInst::Create(M.getContext(), BB);
+ WeakInitializerFn->setSection(
+ ObjectFormat == Triple::MachO
+ ? "__TEXT,__StaticInit,regular,pure_instructions"
+ : ".text.startup");
+ // This code is equivalent to relocation application, and should run at the
+ // earliest possible time (i.e. with the highest priority).
+ appendToGlobalCtors(M, WeakInitializerFn, /* Priority */ 0);
+ }
- // Build a constant representing the displacement between the constant's
- // address and Dest. This will resolve to a PC32 relocation referring to Dest.
- Constant *DestInt = ConstantExpr::getPtrToInt(Dest, IntPtrTy);
- Constant *SrcInt = ConstantExpr::getPtrToInt(Src, IntPtrTy);
- Constant *Disp = ConstantExpr::getSub(DestInt, SrcInt);
- ConstantInt *DispOffset =
- ConstantInt::get(IntPtrTy, Distance * kX86JumpTableEntrySize + 5);
- Constant *OffsetedDisp = ConstantExpr::getSub(Disp, DispOffset);
- OffsetedDisp = ConstantExpr::getTruncOrBitCast(OffsetedDisp, Int32Ty);
+ IRBuilder<> IRB(WeakInitializerFn->getEntryBlock().getTerminator());
+ GV->setConstant(false);
+ IRB.CreateAlignedStore(GV->getInitializer(), GV, GV->getAlignment());
+ GV->setInitializer(Constant::getNullValue(GV->getValueType()));
+}
- ConstantInt *Int3 = ConstantInt::get(Int8Ty, kInt3Code);
+void LowerTypeTestsModule::findGlobalVariableUsersOf(
+ Constant *C, SmallSetVector<GlobalVariable *, 8> &Out) {
+ for (auto *U : C->users()) {
+ if (auto *GV = dyn_cast<GlobalVariable>(U))
+ Out.insert(GV);
+ else if (auto *C2 = dyn_cast<Constant>(U))
+ findGlobalVariableUsersOf(C2, Out);
+ }
+}
- Constant *Fields[] = {
- Jmp, OffsetedDisp, Int3, Int3, Int3,
- };
- return ConstantStruct::getAnon(Fields, /*Packed=*/true);
+// Replace all uses of F with (F ? JT : 0).
+void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr(
+ Function *F, Constant *JT) {
+ // The target expression cannot appear in a constant initializer on most
+ // (all?) targets. Switch to a runtime initializer.
+ SmallSetVector<GlobalVariable *, 8> GlobalVarUsers;
+ findGlobalVariableUsersOf(F, GlobalVarUsers);
+ for (auto GV : GlobalVarUsers)
+ moveInitializerToModuleConstructor(GV);
+
+ // Cannot RAUW F with an expression that uses F. Replace it with a temporary
+ // placeholder first.
+ Function *PlaceholderFn =
+ Function::Create(cast<FunctionType>(F->getValueType()),
+ GlobalValue::ExternalWeakLinkage, "", &M);
+ F->replaceAllUsesWith(PlaceholderFn);
+
+ Constant *Target = ConstantExpr::getSelect(
+ ConstantExpr::getICmp(CmpInst::ICMP_NE, F,
+ Constant::getNullValue(F->getType())),
+ JT, Constant::getNullValue(F->getType()));
+ PlaceholderFn->replaceAllUsesWith(Target);
+ PlaceholderFn->eraseFromParent();
}
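A plain-C++ analogy for why the select is needed (weak_fn and cfi_stub are invented names, and this is source-level only; the pass performs the equivalent rewrite on IR constants): a weak external may resolve to null, so its uses cannot be redirected to the jump-table address unconditionally without breaking null checks.

// May be null at run time if no definition was linked in (GCC/Clang extension).
extern "C" void weak_fn() __attribute__((weak));

// Stands in for the function's slot in the .cfi.jumptable.
extern "C" void cfi_stub();

// Mirrors (F ? JT : 0): preserve null-ness, otherwise redirect to the stub.
void (*resolveWeakTarget())() {
  return weak_fn ? cfi_stub : nullptr;
}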
-Type *LowerTypeTests::getJumpTableEntryType() {
- if (Arch != Triple::x86 && Arch != Triple::x86_64)
- report_fatal_error("Unsupported architecture for jump tables");
+void LowerTypeTestsModule::createJumpTable(
+ Function *F, ArrayRef<GlobalTypeMember *> Functions) {
+ std::string AsmStr, ConstraintStr;
+ raw_string_ostream AsmOS(AsmStr), ConstraintOS(ConstraintStr);
+ SmallVector<Value *, 16> AsmArgs;
+ AsmArgs.reserve(Functions.size() * 2);
- return StructType::get(M->getContext(),
- {Int8Ty, Int32Ty, Int8Ty, Int8Ty, Int8Ty},
- /*Packed=*/true);
+ for (unsigned I = 0; I != Functions.size(); ++I)
+ createJumpTableEntry(AsmOS, ConstraintOS, AsmArgs,
+ cast<Function>(Functions[I]->getGlobal()));
+
+ // Try to emit the jump table at the end of the text segment.
+ // The jump table must come after __cfi_check in cross-DSO mode.
+ // FIXME: this magic section name seems to do the trick.
+ F->setSection(ObjectFormat == Triple::MachO
+ ? "__TEXT,__text,regular,pure_instructions"
+ : ".text.cfi");
+ // Align the whole table by entry size.
+ F->setAlignment(getJumpTableEntrySize());
+ // Skip prologue.
+ // Disabled on win32 due to https://llvm.org/bugs/show_bug.cgi?id=28641#c3.
+ // Luckily, this function does not get any prologue even without the
+ // attribute.
+ if (OS != Triple::Win32)
+ F->addFnAttr(llvm::Attribute::Naked);
+ // Thumb jump table assembly needs Thumb2. The following attribute is added by
+ // Clang for -march=armv7.
+ if (Arch == Triple::thumb)
+ F->addFnAttr("target-cpu", "cortex-a8");
+
+ BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F);
+ IRBuilder<> IRB(BB);
+
+ SmallVector<Type *, 16> ArgTypes;
+ ArgTypes.reserve(AsmArgs.size());
+ for (const auto &Arg : AsmArgs)
+ ArgTypes.push_back(Arg->getType());
+ InlineAsm *JumpTableAsm =
+ InlineAsm::get(FunctionType::get(IRB.getVoidTy(), ArgTypes, false),
+ AsmOS.str(), ConstraintOS.str(),
+ /*hasSideEffects=*/true);
+
+ IRB.CreateCall(JumpTableAsm, AsmArgs);
+ IRB.CreateUnreachable();
}
/// Given a disjoint set of type identifiers and functions, build a jump table
/// for the functions, build the bit sets and lower the llvm.type.test calls.
-void LowerTypeTests::buildBitSetsFromFunctions(ArrayRef<Metadata *> TypeIds,
- ArrayRef<Function *> Functions) {
+void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
+ ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Functions) {
// Unlike the global bitset builder, the function bitset builder cannot
// re-arrange functions in a particular order and base its calculations on the
// layout of the functions' entry points, as we have no idea how large a
@@ -697,39 +969,35 @@ void LowerTypeTests::buildBitSetsFromFunctions(ArrayRef<Metadata *> TypeIds,
// mov h, %ecx
// ret
//
- // To create a jump table for these functions, we instruct the LLVM code
- // generator to output a jump table in the .text section. This is done by
- // representing the instructions in the jump table as an LLVM constant and
- // placing them in a global variable in the .text section. The end result will
- // (conceptually) look like this:
+ // We output the jump table as module-level inline asm string. The end result
+ // will (conceptually) look like this:
//
- // f:
- // jmp .Ltmp0 ; 5 bytes
+ // f = .cfi.jumptable
+ // g = .cfi.jumptable + 4
+ // h = .cfi.jumptable + 8
+ // .cfi.jumptable:
+ // jmp f.cfi ; 5 bytes
// int3 ; 1 byte
// int3 ; 1 byte
// int3 ; 1 byte
- //
- // g:
- // jmp .Ltmp1 ; 5 bytes
+ // jmp g.cfi ; 5 bytes
// int3 ; 1 byte
// int3 ; 1 byte
// int3 ; 1 byte
- //
- // h:
- // jmp .Ltmp2 ; 5 bytes
+ // jmp h.cfi ; 5 bytes
// int3 ; 1 byte
// int3 ; 1 byte
// int3 ; 1 byte
//
- // .Ltmp0:
+ // f.cfi:
// mov 0, %eax
// ret
//
- // .Ltmp1:
+ // g.cfi:
// mov 1, %eax
// ret
//
- // .Ltmp2:
+ // h.cfi:
// mov 2, %eax
// ret
//
@@ -743,60 +1011,101 @@ void LowerTypeTests::buildBitSetsFromFunctions(ArrayRef<Metadata *> TypeIds,
// normal case the check can be carried out using the same kind of simple
// arithmetic that we normally use for globals.
+ // FIXME: find a better way to represent the jumptable in the IR.
+
assert(!Functions.empty());
// Build a simple layout based on the regular layout of jump tables.
- DenseMap<GlobalObject *, uint64_t> GlobalLayout;
+ DenseMap<GlobalTypeMember *, uint64_t> GlobalLayout;
unsigned EntrySize = getJumpTableEntrySize();
for (unsigned I = 0; I != Functions.size(); ++I)
GlobalLayout[Functions[I]] = I * EntrySize;
- // Create a constant to hold the jump table.
+ Function *JumpTableFn =
+ Function::Create(FunctionType::get(Type::getVoidTy(M.getContext()),
+ /* IsVarArg */ false),
+ GlobalValue::PrivateLinkage, ".cfi.jumptable", &M);
ArrayType *JumpTableType =
ArrayType::get(getJumpTableEntryType(), Functions.size());
- auto JumpTable = new GlobalVariable(*M, JumpTableType,
- /*isConstant=*/true,
- GlobalValue::PrivateLinkage, nullptr);
- JumpTable->setSection(ObjectFormat == Triple::MachO
- ? "__TEXT,__text,regular,pure_instructions"
- : ".text");
+ auto JumpTable =
+ ConstantExpr::getPointerCast(JumpTableFn, JumpTableType->getPointerTo(0));
+
lowerTypeTestCalls(TypeIds, JumpTable, GlobalLayout);
// Build aliases pointing to offsets into the jump table, and replace
// references to the original functions with references to the aliases.
for (unsigned I = 0; I != Functions.size(); ++I) {
+ Function *F = cast<Function>(Functions[I]->getGlobal());
+
Constant *CombinedGlobalElemPtr = ConstantExpr::getBitCast(
- ConstantExpr::getGetElementPtr(
+ ConstantExpr::getInBoundsGetElementPtr(
JumpTableType, JumpTable,
ArrayRef<Constant *>{ConstantInt::get(IntPtrTy, 0),
ConstantInt::get(IntPtrTy, I)}),
- Functions[I]->getType());
- if (LinkerSubsectionsViaSymbols || Functions[I]->isDeclarationForLinker()) {
- Functions[I]->replaceAllUsesWith(CombinedGlobalElemPtr);
+ F->getType());
+ if (LinkerSubsectionsViaSymbols || F->isDeclarationForLinker()) {
+
+ if (F->isWeakForLinker())
+ replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr);
+ else
+ F->replaceAllUsesWith(CombinedGlobalElemPtr);
} else {
- assert(Functions[I]->getType()->getAddressSpace() == 0);
- GlobalAlias *GAlias = GlobalAlias::create(Functions[I]->getValueType(), 0,
- Functions[I]->getLinkage(), "",
- CombinedGlobalElemPtr, M);
- GAlias->setVisibility(Functions[I]->getVisibility());
- GAlias->takeName(Functions[I]);
- Functions[I]->replaceAllUsesWith(GAlias);
+ assert(F->getType()->getAddressSpace() == 0);
+
+ GlobalAlias *FAlias = GlobalAlias::create(F->getValueType(), 0,
+ F->getLinkage(), "",
+ CombinedGlobalElemPtr, &M);
+ FAlias->setVisibility(F->getVisibility());
+ FAlias->takeName(F);
+ if (FAlias->hasName())
+ F->setName(FAlias->getName() + ".cfi");
+ F->replaceAllUsesWith(FAlias);
}
- if (!Functions[I]->isDeclarationForLinker())
- Functions[I]->setLinkage(GlobalValue::PrivateLinkage);
+ if (!F->isDeclarationForLinker())
+ F->setLinkage(GlobalValue::InternalLinkage);
}
- // Build and set the jump table's initializer.
- std::vector<Constant *> JumpTableEntries;
- for (unsigned I = 0; I != Functions.size(); ++I)
- JumpTableEntries.push_back(
- createJumpTableEntry(JumpTable, Functions[I], I));
- JumpTable->setInitializer(
- ConstantArray::get(JumpTableType, JumpTableEntries));
+ createJumpTable(JumpTableFn, Functions);
+}
+
+/// Assign a dummy layout using an incrementing counter, tag each function
+/// with its index represented as metadata, and lower each type test to an
+/// integer range comparison. During generation of the indirect function call
+/// table, the backend will assign the given indices.
+/// Note: Dynamic linking is not supported, as the WebAssembly ABI has not yet
+/// been finalized.
+void LowerTypeTestsModule::buildBitSetsFromFunctionsWASM(
+ ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Functions) {
+ assert(!Functions.empty());
+
+ // Build consecutive monotonic integer ranges for each call target set
+ DenseMap<GlobalTypeMember *, uint64_t> GlobalLayout;
+
+ for (GlobalTypeMember *GTM : Functions) {
+ Function *F = cast<Function>(GTM->getGlobal());
+
+ // Skip functions that are not address taken, to avoid bloating the table
+ if (!F->hasAddressTaken())
+ continue;
+
+ // Store metadata with the index for each function
+ MDNode *MD = MDNode::get(F->getContext(),
+ ArrayRef<Metadata *>(ConstantAsMetadata::get(
+ ConstantInt::get(Int64Ty, IndirectIndex))));
+ F->setMetadata("wasm.index", MD);
+
+ // Assign the counter value
+ GlobalLayout[GTM] = IndirectIndex++;
+ }
+
+ // The indirect function table index space starts at zero, so pass a NULL
+ // pointer as the subtracted "jump table" offset.
+ lowerTypeTestCalls(TypeIds, ConstantPointerNull::get(Int32PtrTy),
+ GlobalLayout);
}
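The doc comment above says each type test becomes an integer range comparison. In terms of the indices assigned here, and assuming the members of one type identifier receive consecutive indices (they are laid out contiguously and skipped functions consume no index), the emitted check should boil down to:

#include <cstdint>

// With the NULL "combined global" base, the subtracted start address is just
// the first index assigned to this type identifier; unsigned wrap-around makes
// indices below FirstIndex fail the comparison as well.
bool wasmIndexInRange(uint64_t Index, uint64_t FirstIndex, uint64_t SizeM1) {
  return Index - FirstIndex <= SizeM1;
}

// If a type id's members were assigned indices 5..8 (FirstIndex = 5, SizeM1 = 3):
//   wasmIndexInRange(6, 5, 3) == true
//   wasmIndexInRange(9, 5, 3) == false
//   wasmIndexInRange(4, 5, 3) == false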
-void LowerTypeTests::buildBitSetsFromDisjointSet(
- ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalObject *> Globals) {
+void LowerTypeTestsModule::buildBitSetsFromDisjointSet(
+ ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Globals) {
llvm::DenseMap<Metadata *, uint64_t> TypeIdIndices;
for (unsigned I = 0; I != TypeIds.size(); ++I)
TypeIdIndices[TypeIds[I]] = I;
@@ -804,12 +1113,9 @@ void LowerTypeTests::buildBitSetsFromDisjointSet(
// For each type identifier, build a set of indices that refer to members of
// the type identifier.
std::vector<std::set<uint64_t>> TypeMembers(TypeIds.size());
- SmallVector<MDNode *, 2> Types;
unsigned GlobalIndex = 0;
- for (GlobalObject *GO : Globals) {
- Types.clear();
- GO->getMetadata(LLVMContext::MD_type, Types);
- for (MDNode *Type : Types) {
+ for (GlobalTypeMember *GTM : Globals) {
+ for (MDNode *Type : GTM->types()) {
// Type = { offset, type identifier }
unsigned TypeIdIndex = TypeIdIndices[Type->getOperand(1)];
TypeMembers[TypeIdIndex].insert(GlobalIndex);
@@ -833,32 +1139,32 @@ void LowerTypeTests::buildBitSetsFromDisjointSet(
GLB.addFragment(MemSet);
// Build the bitsets from this disjoint set.
- if (Globals.empty() || isa<GlobalVariable>(Globals[0])) {
+ if (Globals.empty() || isa<GlobalVariable>(Globals[0]->getGlobal())) {
// Build a vector of global variables with the computed layout.
- std::vector<GlobalVariable *> OrderedGVs(Globals.size());
+ std::vector<GlobalTypeMember *> OrderedGVs(Globals.size());
auto OGI = OrderedGVs.begin();
for (auto &&F : GLB.Fragments) {
for (auto &&Offset : F) {
- auto GV = dyn_cast<GlobalVariable>(Globals[Offset]);
+ auto GV = dyn_cast<GlobalVariable>(Globals[Offset]->getGlobal());
if (!GV)
report_fatal_error("Type identifier may not contain both global "
"variables and functions");
- *OGI++ = GV;
+ *OGI++ = Globals[Offset];
}
}
buildBitSetsFromGlobalVariables(TypeIds, OrderedGVs);
} else {
// Build a vector of functions with the computed layout.
- std::vector<Function *> OrderedFns(Globals.size());
+ std::vector<GlobalTypeMember *> OrderedFns(Globals.size());
auto OFI = OrderedFns.begin();
for (auto &&F : GLB.Fragments) {
for (auto &&Offset : F) {
- auto Fn = dyn_cast<Function>(Globals[Offset]);
+ auto Fn = dyn_cast<Function>(Globals[Offset]->getGlobal());
if (!Fn)
report_fatal_error("Type identifier may not contain both global "
"variables and functions");
- *OFI++ = Fn;
+ *OFI++ = Globals[Offset];
}
}
@@ -867,31 +1173,92 @@ void LowerTypeTests::buildBitSetsFromDisjointSet(
}
/// Lower all type tests in this module.
-bool LowerTypeTests::lower() {
+LowerTypeTestsModule::LowerTypeTestsModule(Module &M, SummaryAction Action,
+ ModuleSummaryIndex *Summary)
+ : M(M), Action(Action), Summary(Summary) {
+ // FIXME: Use these fields.
+ (void)this->Action;
+ (void)this->Summary;
+
+ Triple TargetTriple(M.getTargetTriple());
+ LinkerSubsectionsViaSymbols = TargetTriple.isMacOSX();
+ Arch = TargetTriple.getArch();
+ OS = TargetTriple.getOS();
+ ObjectFormat = TargetTriple.getObjectFormat();
+}
+
+bool LowerTypeTestsModule::runForTesting(Module &M) {
+ ModuleSummaryIndex Summary;
+
+ // Handle the command-line summary arguments. This code is for testing
+ // purposes only, so we handle errors directly.
+ if (!ClReadSummary.empty()) {
+ ExitOnError ExitOnErr("-lowertypetests-read-summary: " + ClReadSummary +
+ ": ");
+ auto ReadSummaryFile =
+ ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(ClReadSummary)));
+
+ yaml::Input In(ReadSummaryFile->getBuffer());
+ In >> Summary;
+ ExitOnErr(errorCodeToError(In.error()));
+ }
+
+ bool Changed = LowerTypeTestsModule(M, ClSummaryAction, &Summary).lower();
+
+ if (!ClWriteSummary.empty()) {
+ ExitOnError ExitOnErr("-lowertypetests-write-summary: " + ClWriteSummary +
+ ": ");
+ std::error_code EC;
+ raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::F_Text);
+ ExitOnErr(errorCodeToError(EC));
+
+ yaml::Output Out(OS);
+ Out << Summary;
+ }
+
+ return Changed;
+}
+
+bool LowerTypeTestsModule::lower() {
Function *TypeTestFunc =
- M->getFunction(Intrinsic::getName(Intrinsic::type_test));
+ M.getFunction(Intrinsic::getName(Intrinsic::type_test));
if (!TypeTestFunc || TypeTestFunc->use_empty())
return false;
// Equivalence class set containing type identifiers and the globals that
// reference them. This is used to partition the set of type identifiers in
// the module into disjoint sets.
- typedef EquivalenceClasses<PointerUnion<GlobalObject *, Metadata *>>
+ typedef EquivalenceClasses<PointerUnion<GlobalTypeMember *, Metadata *>>
GlobalClassesTy;
GlobalClassesTy GlobalClasses;
- // Verify the type metadata and build a mapping from type identifiers to their
- // last observed index in the list of globals. This will be used later to
- // deterministically order the list of type identifiers.
- llvm::DenseMap<Metadata *, unsigned> TypeIdIndices;
+ // Verify the type metadata and build a few data structures to let us
+ // efficiently enumerate the type identifiers associated with a global:
+ // a list of GlobalTypeMembers (a GlobalObject stored alongside a vector
+ // of associated type metadata) and a mapping from type identifiers to their
+ // list of GlobalTypeMembers and last observed index in the list of globals.
+ // The indices will be used later to deterministically order the list of type
+ // identifiers.
+ BumpPtrAllocator Alloc;
+ struct TIInfo {
+ unsigned Index;
+ std::vector<GlobalTypeMember *> RefGlobals;
+ };
+ llvm::DenseMap<Metadata *, TIInfo> TypeIdInfo;
unsigned I = 0;
SmallVector<MDNode *, 2> Types;
- for (GlobalObject &GO : M->global_objects()) {
+ for (GlobalObject &GO : M.global_objects()) {
Types.clear();
GO.getMetadata(LLVMContext::MD_type, Types);
+ if (Types.empty())
+ continue;
+
+ auto *GTM = GlobalTypeMember::create(Alloc, &GO, Types);
for (MDNode *Type : Types) {
verifyTypeMDNode(&GO, Type);
- TypeIdIndices[cast<MDNode>(Type)->getOperand(1)] = ++I;
+ auto &Info = TypeIdInfo[cast<MDNode>(Type)->getOperand(1)];
+ Info.Index = ++I;
+ Info.RefGlobals.push_back(GTM);
}
}
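The equivalence classes above implement a standard union-find partition: a type identifier and every global carrying it land in the same class, and two type identifiers that share a global are merged. A toy sketch over strings (all names invented) of how sharing a member merges classes:

#include <map>
#include <string>

struct DisjointSets {
  std::map<std::string, std::string> Parent;

  std::string find(const std::string &X) {
    auto It = Parent.emplace(X, X).first;
    if (It->second != X)
      It->second = find(It->second); // path compression
    return It->second;
  }

  void unite(const std::string &A, const std::string &B) {
    Parent[find(A)] = find(B);
  }
};

// DisjointSets DS;
// DS.unite("typeid1", "vtable_A");            // vtable_A carries !type typeid1
// DS.unite("typeid2", "vtable_A");            // ...and typeid2
// DS.find("typeid1") == DS.find("typeid2");   // lowered together as one disjoint set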
@@ -900,8 +1267,7 @@ bool LowerTypeTests::lower() {
auto BitSetMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1));
if (!BitSetMDVal)
- report_fatal_error(
- "Second argument of llvm.type.test must be metadata");
+ report_fatal_error("Second argument of llvm.type.test must be metadata");
auto BitSet = BitSetMDVal->getMetadata();
// Add the call site to the list of call sites for this type identifier. We
@@ -920,14 +1286,9 @@ bool LowerTypeTests::lower() {
GlobalClassesTy::member_iterator CurSet = GlobalClasses.findLeader(GCI);
// Add the referenced globals to the type identifier's equivalence class.
- for (GlobalObject &GO : M->global_objects()) {
- Types.clear();
- GO.getMetadata(LLVMContext::MD_type, Types);
- for (MDNode *Type : Types)
- if (Type->getOperand(1) == BitSet)
- CurSet = GlobalClasses.unionSets(
- CurSet, GlobalClasses.findLeader(GlobalClasses.insert(&GO)));
- }
+ for (GlobalTypeMember *GTM : TypeIdInfo[BitSet].RefGlobals)
+ CurSet = GlobalClasses.unionSets(
+ CurSet, GlobalClasses.findLeader(GlobalClasses.insert(GTM)));
}
if (GlobalClasses.empty())
@@ -939,14 +1300,15 @@ bool LowerTypeTests::lower() {
for (GlobalClassesTy::iterator I = GlobalClasses.begin(),
E = GlobalClasses.end();
I != E; ++I) {
- if (!I->isLeader()) continue;
+ if (!I->isLeader())
+ continue;
++NumTypeIdDisjointSets;
unsigned MaxIndex = 0;
for (GlobalClassesTy::member_iterator MI = GlobalClasses.member_begin(I);
MI != GlobalClasses.member_end(); ++MI) {
if ((*MI).is<Metadata *>())
- MaxIndex = std::max(MaxIndex, TypeIdIndices[MI->get<Metadata *>()]);
+ MaxIndex = std::max(MaxIndex, TypeIdInfo[MI->get<Metadata *>()].Index);
}
Sets.emplace_back(I, MaxIndex);
}
@@ -960,20 +1322,20 @@ bool LowerTypeTests::lower() {
for (const auto &S : Sets) {
// Build the list of type identifiers in this disjoint set.
std::vector<Metadata *> TypeIds;
- std::vector<GlobalObject *> Globals;
+ std::vector<GlobalTypeMember *> Globals;
for (GlobalClassesTy::member_iterator MI =
GlobalClasses.member_begin(S.first);
MI != GlobalClasses.member_end(); ++MI) {
if ((*MI).is<Metadata *>())
TypeIds.push_back(MI->get<Metadata *>());
else
- Globals.push_back(MI->get<GlobalObject *>());
+ Globals.push_back(MI->get<GlobalTypeMember *>());
}
// Order type identifiers by global index for determinism. This ordering is
// stable as there is a one-to-one mapping between metadata and indices.
std::sort(TypeIds.begin(), TypeIds.end(), [&](Metadata *M1, Metadata *M2) {
- return TypeIdIndices[M1] < TypeIdIndices[M2];
+ return TypeIdInfo[M1].Index < TypeIdInfo[M2].Index;
});
// Build bitsets for this disjoint set.
@@ -985,35 +1347,10 @@ bool LowerTypeTests::lower() {
return true;
}
-// Initialization helper shared by the old and the new PM.
-static void init(LowerTypeTests *LTT, Module &M) {
- LTT->M = &M;
- const DataLayout &DL = M.getDataLayout();
- Triple TargetTriple(M.getTargetTriple());
- LTT->LinkerSubsectionsViaSymbols = TargetTriple.isMacOSX();
- LTT->Arch = TargetTriple.getArch();
- LTT->ObjectFormat = TargetTriple.getObjectFormat();
- LTT->Int1Ty = Type::getInt1Ty(M.getContext());
- LTT->Int8Ty = Type::getInt8Ty(M.getContext());
- LTT->Int32Ty = Type::getInt32Ty(M.getContext());
- LTT->Int32PtrTy = PointerType::getUnqual(LTT->Int32Ty);
- LTT->Int64Ty = Type::getInt64Ty(M.getContext());
- LTT->IntPtrTy = DL.getIntPtrType(M.getContext(), 0);
- LTT->TypeTestCallSites.clear();
-}
-
-bool LowerTypeTests::runOnModule(Module &M) {
- if (skipModule(M))
- return false;
- init(this, M);
- return lower();
-}
-
PreservedAnalyses LowerTypeTestsPass::run(Module &M,
- AnalysisManager<Module> &AM) {
- LowerTypeTests Impl;
- init(&Impl, M);
- bool Changed = Impl.lower();
+ ModuleAnalysisManager &AM) {
+ bool Changed =
+ LowerTypeTestsModule(M, SummaryAction::None, /*Summary=*/nullptr).lower();
if (!Changed)
return PreservedAnalyses::all();
return PreservedAnalyses::none();
diff --git a/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp
index fe653a7..e0bb0eb 100644
--- a/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp
@@ -97,11 +97,9 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/ValueMap.h"
#include "llvm/Pass.h"
@@ -110,6 +108,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/FunctionComparator.h"
#include <vector>
using namespace llvm;
@@ -130,328 +129,6 @@ static cl::opt<unsigned> NumFunctionsForSanityCheck(
namespace {
-/// GlobalNumberState assigns an integer to each global value in the program,
-/// which is used by the comparison routine to order references to globals. This
-/// state must be preserved throughout the pass, because Functions and other
-/// globals need to maintain their relative order. Globals are assigned a number
-/// when they are first visited. This order is deterministic, and so the
-/// assigned numbers are as well. When two functions are merged, neither number
-/// is updated. If the symbols are weak, this would be incorrect. If they are
-/// strong, then one will be replaced at all references to the other, and so
-/// direct callsites will now see one or the other symbol, and no update is
-/// necessary. Note that if we were guaranteed unique names, we could just
-/// compare those, but this would not work for stripped bitcodes or for those
-/// few symbols without a name.
-class GlobalNumberState {
- struct Config : ValueMapConfig<GlobalValue*> {
- enum { FollowRAUW = false };
- };
- // Each GlobalValue is mapped to an identifier. The Config ensures when RAUW
- // occurs, the mapping does not change. Tracking changes is unnecessary, and
- // also problematic for weak symbols (which may be overwritten).
- typedef ValueMap<GlobalValue *, uint64_t, Config> ValueNumberMap;
- ValueNumberMap GlobalNumbers;
- // The next unused serial number to assign to a global.
- uint64_t NextNumber;
- public:
- GlobalNumberState() : GlobalNumbers(), NextNumber(0) {}
- uint64_t getNumber(GlobalValue* Global) {
- ValueNumberMap::iterator MapIter;
- bool Inserted;
- std::tie(MapIter, Inserted) = GlobalNumbers.insert({Global, NextNumber});
- if (Inserted)
- NextNumber++;
- return MapIter->second;
- }
- void clear() {
- GlobalNumbers.clear();
- }
-};
-
-/// FunctionComparator - Compares two functions to determine whether or not
-/// they will generate machine code with the same behaviour. DataLayout is
-/// used if available. The comparator always fails conservatively (erring on the
-/// side of claiming that two functions are different).
-class FunctionComparator {
-public:
- FunctionComparator(const Function *F1, const Function *F2,
- GlobalNumberState* GN)
- : FnL(F1), FnR(F2), GlobalNumbers(GN) {}
-
- /// Test whether the two functions have equivalent behaviour.
- int compare();
- /// Hash a function. Equivalent functions will have the same hash, and unequal
- /// functions will have different hashes with high probability.
- typedef uint64_t FunctionHash;
- static FunctionHash functionHash(Function &);
-
-private:
- /// Test whether two basic blocks have equivalent behaviour.
- int cmpBasicBlocks(const BasicBlock *BBL, const BasicBlock *BBR) const;
-
- /// Constants comparison.
- /// It's analogous to a lexicographical comparison between hypothetical numbers
- /// of the following format:
- /// <bitcastability-trait><raw-bit-contents>
- ///
- /// 1. Bitcastability.
- /// Check whether L's type could be losslessly bitcasted to R's type.
- /// At this stage, if a lossless bitcast is not possible, the method returns
- /// -1 or 1, thus also defining which type is greater in the context of
- /// bitcastability.
- /// Stage 0: If types are equal in terms of cmpTypes, then we can go straight
- /// to the contents comparison.
- /// If types differ, remember the types comparison result and check
- /// whether we can still bitcast the types.
- /// Stage 1: Types that satisfy the isFirstClassType condition are always
- /// greater than others.
- /// Stage 2: Vectors are greater than non-vectors.
- /// If both types are vectors, then the vector with the greater
- /// bitwidth is greater.
- /// If both types are vectors with the same bitwidth, then types
- /// are bitcastable, and we can skip other stages, and go to contents
- /// comparison.
- /// Stage 3: Pointer types are greater than non-pointers. If both types are
- /// pointers of the same address space - go to contents comparison.
- /// Different address spaces: pointer with greater address space is
- /// greater.
- /// Stage 4: Types are neither vectors, nor pointers. And they differ.
- /// We don't know how to bitcast them. So, we better don't do it,
- /// and return types comparison result (so it determines the
- /// relationship among constants we don't know how to bitcast).
- ///
- /// Just for clarity, let's see how the set of constants could look
- /// on a single-dimension axis:
- ///
- /// [NFCT], [FCT, "others"], [FCT, pointers], [FCT, vectors]
- /// Where: NFCT - Not a FirstClassType
- /// FCT - FirstClassType
- ///
- /// 2. Compare raw contents.
- /// It ignores types on this stage and only compares bits from L and R.
- /// Returns 0 if L and R have equivalent contents,
- /// -1 or 1 if the values are different.
- /// Pretty trivial:
- /// 2.1. If contents are numbers, compare numbers.
- /// Ints with greater bitwidth are greater. Ints with the same bitwidth are
- /// compared by their contents.
- /// 2.2. "And so on". Just to avoid discrepancies with comments
- /// perhaps it would be better to read the implementation itself.
- /// 3. And again about overall picture. Let's look back at how the ordered set
- /// of constants will look like:
- /// [NFCT], [FCT, "others"], [FCT, pointers], [FCT, vectors]
- ///
- /// Now look, what could be inside [FCT, "others"], for example:
- /// [FCT, "others"] =
- /// [
- /// [double 0.1], [double 1.23],
- /// [i32 1], [i32 2],
- /// { double 1.0 }, ; StructTyID, NumElements = 1
- /// { i32 1 }, ; StructTyID, NumElements = 1
- /// { double 1, i32 1 }, ; StructTyID, NumElements = 2
- /// { i32 1, double 1 } ; StructTyID, NumElements = 2
- /// ]
- ///
- /// Let's explain the order. Float numbers will be less than integers, just
- /// because of cmpType terms: FloatTyID < IntegerTyID.
- /// Floats (with same fltSemantics) are sorted according to their value.
- /// Then come the integers, which, like the floats,
- /// can easily be sorted among each other.
- /// The structures. Structures are grouped at the tail, again because of their
- /// TypeID: StructTyID > IntegerTyID > FloatTyID.
- /// Structures with a greater number of elements are greater. Structures whose
- /// greater elements come first are greater.
- /// The same logic with vectors, arrays and other possible complex types.
- ///
- /// Bitcastable constants.
- /// Let's assume, that some constant, belongs to some group of
- /// "so-called-equal" values with different types, and at the same time
- /// belongs to another group of constants with equal types
- /// and "really" equal values.
- ///
- /// Now, prove that this is impossible:
- ///
- /// If constant A with type TyA is bitcastable to B with type TyB, then:
- /// 1. All constants with equal types to TyA, are bitcastable to B. Since
- /// those should be vectors (if TyA is vector), pointers
- /// (if TyA is pointer), or else (if TyA equal to TyB), those types should
- /// be equal to TyB.
- /// 2. All constants with non-equal, but bitcastable types to TyA, are
- /// bitcastable to B.
- /// Once again, just because we allow it to vectors and pointers only.
- /// This statement could be expanded as below:
- /// 2.1. All vectors with bitwidth equal to that of vector A have bitwidth equal to
- /// that of vector B, and thus are bitcastable to B as well.
- /// 2.2. All pointers of the same address space, no matter what they point to,
- /// are bitcastable. So if C is a pointer, it can be bitcasted to A and to B.
- /// So any constant equal or bitcastable to A is equal or bitcastable to B.
- /// QED.
- ///
- /// In other words, for pointers and vectors, we ignore the top-level type and
- /// look at their particular properties (bit-width for vectors, and
- /// address space for pointers).
- /// If these properties are equal - compare their contents.
- int cmpConstants(const Constant *L, const Constant *R) const;
-
- /// Compares two global values by number. Uses the GlobalNumbersState to
- /// identify the same globals across function calls.
- int cmpGlobalValues(GlobalValue *L, GlobalValue *R) const;
-
- /// Assign or look up previously assigned numbers for the two values, and
- /// return whether the numbers are equal. Numbers are assigned in the order
- /// visited.
- /// Comparison order:
- /// Stage 0: A value that is the function itself is always greater than others.
- /// If the left and right values are references to their own functions, then
- /// they are equal.
- /// Stage 1: Constants are greater than non-constants.
- /// If both left and right are constants, then the result of
- /// cmpConstants is used as cmpValues result.
- /// Stage 2: InlineAsm instances are greater than others. If both left and
- /// right are InlineAsm instances, the InlineAsm* pointers are cast to
- /// integers and compared as numbers.
- /// Stage 3: In all other cases we compare the order in which we encountered
- /// these values in their functions. If the right value was encountered
- /// first during scanning, then the left value is greater.
- /// In other words, we compare serial numbers; for more details
- /// see the comments for sn_mapL and sn_mapR.
- int cmpValues(const Value *L, const Value *R) const;
-
- /// Compare two Instructions for equivalence, similar to
- /// Instruction::isSameOperationAs.
- ///
- /// Stages are listed in "most significant stage first" order:
- /// On each stage below, we do comparison between some left and right
- /// operation parts. If parts are non-equal, we assign parts comparison
- /// result to the operation comparison result and exit from method.
- /// Otherwise we proceed to the next stage.
- /// Stages:
- /// 1. Operations opcodes. Compared as numbers.
- /// 2. Number of operands.
- /// 3. Operation types. Compared with cmpType method.
- /// 4. Compare operation subclass optional data as stream of bytes:
- /// just convert it to integers and call cmpNumbers.
- /// 5. Compare in operation operand types with cmpType in
- /// most significant operand first order.
- /// 6. Last stage. Check operations for some specific attributes.
- /// For example, for Load it would be:
- /// 6.1.Load: volatile (as boolean flag)
- /// 6.2.Load: alignment (as integer numbers)
- /// 6.3.Load: ordering (as underlying enum class value)
- /// 6.4.Load: synch-scope (as integer numbers)
- /// 6.5.Load: range metadata (as integer ranges)
- /// At this stage it's better to see the code, since it's not more than 10-15
- /// lines per particular instruction, and could change from time to time.
- int cmpOperations(const Instruction *L, const Instruction *R) const;
-
- /// Compare two GEPs for equivalent pointer arithmetic.
- /// Parts to be compared for each comparison stage,
- /// most significant stage first:
- /// 1. Address space. As numbers.
- /// 2. Constant offset, (using GEPOperator::accumulateConstantOffset method).
- /// 3. Pointer operand type (using cmpType method).
- /// 4. Number of operands.
- /// 5. Compare operands, using cmpValues method.
- int cmpGEPs(const GEPOperator *GEPL, const GEPOperator *GEPR) const;
- int cmpGEPs(const GetElementPtrInst *GEPL,
- const GetElementPtrInst *GEPR) const {
- return cmpGEPs(cast<GEPOperator>(GEPL), cast<GEPOperator>(GEPR));
- }
-
- /// cmpType - compares two types and
- /// defines a total ordering over the set of types.
- ///
- /// Return values:
- /// 0 if types are equal,
- /// -1 if Left is less than Right,
- /// +1 if Left is greater than Right.
- ///
- /// Description:
- /// The comparison is broken into stages. As in a lexicographical comparison,
- /// a stage that comes earlier has higher priority.
- /// For each stage described below, keep the total-ordering properties in
- /// mind.
- ///
- /// 0. Before comparison we coerce pointer types of address space 0 to
- /// integer.
- /// We also don't bother comparing a type with itself, so
- /// we just return 0 in this case.
- ///
- /// 1. If the types are of different kinds (different type IDs), return the
- /// result of comparing the type IDs, treating them as numbers.
- /// 2. If types are integers, check that they have the same width. If they
- /// are vectors, check that they have the same count and subtype.
- /// 3. Types have the same ID, so check whether they are one of:
- /// * Void
- /// * Float
- /// * Double
- /// * X86_FP80
- /// * FP128
- /// * PPC_FP128
- /// * Label
- /// * Metadata
- /// We can treat these types as equal whenever their IDs are same.
- /// 4. If Left and Right are pointers, return result of address space
- /// comparison (numbers comparison). We can treat pointer types of same
- /// address space as equal.
- /// 5. If types are complex,
- /// then both Left and Right are expanded and their element types are
- /// checked in the same way. If we get Res != 0 at some stage, return it.
- /// Otherwise return 0.
- /// 6. All other cases are unreachable (llvm_unreachable).
- int cmpTypes(Type *TyL, Type *TyR) const;
-
- int cmpNumbers(uint64_t L, uint64_t R) const;
- int cmpOrderings(AtomicOrdering L, AtomicOrdering R) const;
- int cmpAPInts(const APInt &L, const APInt &R) const;
- int cmpAPFloats(const APFloat &L, const APFloat &R) const;
- int cmpInlineAsm(const InlineAsm *L, const InlineAsm *R) const;
- int cmpMem(StringRef L, StringRef R) const;
- int cmpAttrs(const AttributeSet L, const AttributeSet R) const;
- int cmpRangeMetadata(const MDNode *L, const MDNode *R) const;
- int cmpOperandBundlesSchema(const Instruction *L, const Instruction *R) const;
-
- // The two functions undergoing comparison.
- const Function *FnL, *FnR;
-
- /// Assign serial numbers to values from the left function, and to values
- /// from the right function.
- /// Explanation:
- /// When comparing functions we need to compare the values we meet on the
- /// left and right sides.
- /// It's easy to sort things out for external values: it just has to be
- /// the same value on the left and on the right.
- /// But for local values (those introduced inside the function body)
- /// we have to ensure they were introduced at exactly the same place
- /// and play the same role.
- /// So we assign a serial number to each value the first time we meet it.
- /// Values that are met at the same place get the same serial numbers.
- /// A few points about the numbers assigned to BBs and about alternative
- /// implementations are worth explaining (see below).
- ///
- /// 1. Safety of BB reordering.
- /// It's safe to change the order of BasicBlocks in a function.
- /// The relationship with other functions and the serial numbering will not
- /// change in this case.
- /// As follows from FunctionComparator::compare(), we do a CFG walk: we start
- /// from the entry block and then take each terminator. So it doesn't matter
- /// how the BBs are actually ordered in the function. And since cmpValues is
- /// called during this walk, the numbering depends only on how the BBs are
- /// located inside the CFG.
- /// So the answer is yes: we will get the same numbering.
- ///
- /// 2. Impossibility of using dominance properties of values.
- /// If we compare two instruction operands, where the first is a use of local
- /// variable AL from function FL and the second is a use of local variable AR
- /// from FR, we could compare their origins and check whether they are
- /// defined at the same place.
- /// But we are still not able to compare operands of PHI nodes, since those
- /// could be operands from further BBs that we haven't scanned yet.
- /// So it's impossible to use dominance properties in general.
- mutable DenseMap<const Value*, int> sn_mapL, sn_mapR;
-
- // The global state we will use
- GlobalNumberState* GlobalNumbers;
-};
-
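The comparison helpers declared above all follow the same lexicographic scheme: compare the most significant property first and let the first non-zero result decide the ordering. A minimal standalone sketch of that pattern follows; the Item struct and its fields are made up for illustration and are not part of the patch.

#include <cstdint>

// Hypothetical three-field item compared the way the cmp* helpers above
// compare functions: stage by stage, first difference wins.
struct Item {
  uint64_t Kind;
  uint64_t Width;
  uint64_t AddrSpace;
};

static int cmpNumbers(uint64_t L, uint64_t R) {
  if (L < R) return -1;
  if (L > R) return 1;
  return 0;
}

static int cmpItems(const Item &L, const Item &R) {
  // Most significant property first; any difference ends the comparison.
  if (int Res = cmpNumbers(L.Kind, R.Kind))
    return Res;
  if (int Res = cmpNumbers(L.Width, R.Width))
    return Res;
  // Least significant property last; 0 means the items are equivalent.
  return cmpNumbers(L.AddrSpace, R.AddrSpace);
}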
class FunctionNode {
mutable AssertingVH<Function> F;
FunctionComparator::FunctionHash Hash;
@@ -470,898 +147,6 @@ public:
void release() { F = nullptr; }
};
-} // end anonymous namespace
-
-int FunctionComparator::cmpNumbers(uint64_t L, uint64_t R) const {
- if (L < R) return -1;
- if (L > R) return 1;
- return 0;
-}
-
-int FunctionComparator::cmpOrderings(AtomicOrdering L, AtomicOrdering R) const {
- if ((int)L < (int)R) return -1;
- if ((int)L > (int)R) return 1;
- return 0;
-}
-
-int FunctionComparator::cmpAPInts(const APInt &L, const APInt &R) const {
- if (int Res = cmpNumbers(L.getBitWidth(), R.getBitWidth()))
- return Res;
- if (L.ugt(R)) return 1;
- if (R.ugt(L)) return -1;
- return 0;
-}
-
-int FunctionComparator::cmpAPFloats(const APFloat &L, const APFloat &R) const {
- // Floats are ordered first by semantics (i.e. float, double, half, etc.),
- // then by value interpreted as a bitstring (aka APInt).
- const fltSemantics &SL = L.getSemantics(), &SR = R.getSemantics();
- if (int Res = cmpNumbers(APFloat::semanticsPrecision(SL),
- APFloat::semanticsPrecision(SR)))
- return Res;
- if (int Res = cmpNumbers(APFloat::semanticsMaxExponent(SL),
- APFloat::semanticsMaxExponent(SR)))
- return Res;
- if (int Res = cmpNumbers(APFloat::semanticsMinExponent(SL),
- APFloat::semanticsMinExponent(SR)))
- return Res;
- if (int Res = cmpNumbers(APFloat::semanticsSizeInBits(SL),
- APFloat::semanticsSizeInBits(SR)))
- return Res;
- return cmpAPInts(L.bitcastToAPInt(), R.bitcastToAPInt());
-}
-
-int FunctionComparator::cmpMem(StringRef L, StringRef R) const {
- // Prevent heavy comparison, compare sizes first.
- if (int Res = cmpNumbers(L.size(), R.size()))
- return Res;
-
- // Compare strings lexicographically only when it is necessary: only when
- // strings are equal in size.
- return L.compare(R);
-}
-
-int FunctionComparator::cmpAttrs(const AttributeSet L,
- const AttributeSet R) const {
- if (int Res = cmpNumbers(L.getNumSlots(), R.getNumSlots()))
- return Res;
-
- for (unsigned i = 0, e = L.getNumSlots(); i != e; ++i) {
- AttributeSet::iterator LI = L.begin(i), LE = L.end(i), RI = R.begin(i),
- RE = R.end(i);
- for (; LI != LE && RI != RE; ++LI, ++RI) {
- Attribute LA = *LI;
- Attribute RA = *RI;
- if (LA < RA)
- return -1;
- if (RA < LA)
- return 1;
- }
- if (LI != LE)
- return 1;
- if (RI != RE)
- return -1;
- }
- return 0;
-}
-
-int FunctionComparator::cmpRangeMetadata(const MDNode *L,
- const MDNode *R) const {
- if (L == R)
- return 0;
- if (!L)
- return -1;
- if (!R)
- return 1;
- // Range metadata is a sequence of numbers. Make sure they are the same
- // sequence.
- // TODO: Note that as this is metadata, it is possible to drop and/or merge
- // this data when considering functions to merge. Thus this comparison would
- // return 0 (i.e. equivalent), but merging would become more complicated
- // because the ranges would need to be unioned. It is not likely that
- // functions differ ONLY in this metadata if they are actually the same
- // function semantically.
- if (int Res = cmpNumbers(L->getNumOperands(), R->getNumOperands()))
- return Res;
- for (size_t I = 0; I < L->getNumOperands(); ++I) {
- ConstantInt *LLow = mdconst::extract<ConstantInt>(L->getOperand(I));
- ConstantInt *RLow = mdconst::extract<ConstantInt>(R->getOperand(I));
- if (int Res = cmpAPInts(LLow->getValue(), RLow->getValue()))
- return Res;
- }
- return 0;
-}
-
-int FunctionComparator::cmpOperandBundlesSchema(const Instruction *L,
- const Instruction *R) const {
- ImmutableCallSite LCS(L);
- ImmutableCallSite RCS(R);
-
- assert(LCS && RCS && "Must be calls or invokes!");
- assert(LCS.isCall() == RCS.isCall() && "Can't compare otherwise!");
-
- if (int Res =
- cmpNumbers(LCS.getNumOperandBundles(), RCS.getNumOperandBundles()))
- return Res;
-
- for (unsigned i = 0, e = LCS.getNumOperandBundles(); i != e; ++i) {
- auto OBL = LCS.getOperandBundleAt(i);
- auto OBR = RCS.getOperandBundleAt(i);
-
- if (int Res = OBL.getTagName().compare(OBR.getTagName()))
- return Res;
-
- if (int Res = cmpNumbers(OBL.Inputs.size(), OBR.Inputs.size()))
- return Res;
- }
-
- return 0;
-}
-
-/// Constants comparison:
-/// 1. Check whether the type of the L constant can be losslessly bitcast to
-/// R's type.
-/// 2. Compare constant contents.
-/// For more details see declaration comments.
-int FunctionComparator::cmpConstants(const Constant *L,
- const Constant *R) const {
-
- Type *TyL = L->getType();
- Type *TyR = R->getType();
-
- // Check whether the types are bitcastable. This part is essentially a
- // refactored Type::canLosslesslyBitCastTo, but instead of returning
- // true/false it also encodes in the result which type is "less" for us.
- int TypesRes = cmpTypes(TyL, TyR);
- if (TypesRes != 0) {
- // Types are different, but check whether we can bitcast them.
- if (!TyL->isFirstClassType()) {
- if (TyR->isFirstClassType())
- return -1;
- // Neither TyL nor TyR are values of first class type. Return the result
- // of comparing the types
- return TypesRes;
- }
- if (!TyR->isFirstClassType()) {
- if (TyL->isFirstClassType())
- return 1;
- return TypesRes;
- }
-
- // Vector -> Vector conversions are always lossless if the two vector types
- // have the same size, otherwise not.
- unsigned TyLWidth = 0;
- unsigned TyRWidth = 0;
-
- if (auto *VecTyL = dyn_cast<VectorType>(TyL))
- TyLWidth = VecTyL->getBitWidth();
- if (auto *VecTyR = dyn_cast<VectorType>(TyR))
- TyRWidth = VecTyR->getBitWidth();
-
- if (TyLWidth != TyRWidth)
- return cmpNumbers(TyLWidth, TyRWidth);
-
- // Zero bit-width means neither TyL nor TyR are vectors.
- if (!TyLWidth) {
- PointerType *PTyL = dyn_cast<PointerType>(TyL);
- PointerType *PTyR = dyn_cast<PointerType>(TyR);
- if (PTyL && PTyR) {
- unsigned AddrSpaceL = PTyL->getAddressSpace();
- unsigned AddrSpaceR = PTyR->getAddressSpace();
- if (int Res = cmpNumbers(AddrSpaceL, AddrSpaceR))
- return Res;
- }
- if (PTyL)
- return 1;
- if (PTyR)
- return -1;
-
- // TyL and TyR aren't vectors, nor pointers. We don't know how to
- // bitcast them.
- return TypesRes;
- }
- }
-
- // OK, types are bitcastable, now check constant contents.
-
- if (L->isNullValue() && R->isNullValue())
- return TypesRes;
- if (L->isNullValue() && !R->isNullValue())
- return 1;
- if (!L->isNullValue() && R->isNullValue())
- return -1;
-
- auto GlobalValueL = const_cast<GlobalValue*>(dyn_cast<GlobalValue>(L));
- auto GlobalValueR = const_cast<GlobalValue*>(dyn_cast<GlobalValue>(R));
- if (GlobalValueL && GlobalValueR) {
- return cmpGlobalValues(GlobalValueL, GlobalValueR);
- }
-
- if (int Res = cmpNumbers(L->getValueID(), R->getValueID()))
- return Res;
-
- if (const auto *SeqL = dyn_cast<ConstantDataSequential>(L)) {
- const auto *SeqR = cast<ConstantDataSequential>(R);
- // This handles ConstantDataArray and ConstantDataVector. Note that we
- // compare the two raw data arrays, which might differ depending on the host
- // endianness. This isn't a problem though, because the endianness of a module
- // will affect the order of the constants, but this order is the same
- // for a given input module and host platform.
- return cmpMem(SeqL->getRawDataValues(), SeqR->getRawDataValues());
- }
-
- switch (L->getValueID()) {
- case Value::UndefValueVal:
- case Value::ConstantTokenNoneVal:
- return TypesRes;
- case Value::ConstantIntVal: {
- const APInt &LInt = cast<ConstantInt>(L)->getValue();
- const APInt &RInt = cast<ConstantInt>(R)->getValue();
- return cmpAPInts(LInt, RInt);
- }
- case Value::ConstantFPVal: {
- const APFloat &LAPF = cast<ConstantFP>(L)->getValueAPF();
- const APFloat &RAPF = cast<ConstantFP>(R)->getValueAPF();
- return cmpAPFloats(LAPF, RAPF);
- }
- case Value::ConstantArrayVal: {
- const ConstantArray *LA = cast<ConstantArray>(L);
- const ConstantArray *RA = cast<ConstantArray>(R);
- uint64_t NumElementsL = cast<ArrayType>(TyL)->getNumElements();
- uint64_t NumElementsR = cast<ArrayType>(TyR)->getNumElements();
- if (int Res = cmpNumbers(NumElementsL, NumElementsR))
- return Res;
- for (uint64_t i = 0; i < NumElementsL; ++i) {
- if (int Res = cmpConstants(cast<Constant>(LA->getOperand(i)),
- cast<Constant>(RA->getOperand(i))))
- return Res;
- }
- return 0;
- }
- case Value::ConstantStructVal: {
- const ConstantStruct *LS = cast<ConstantStruct>(L);
- const ConstantStruct *RS = cast<ConstantStruct>(R);
- unsigned NumElementsL = cast<StructType>(TyL)->getNumElements();
- unsigned NumElementsR = cast<StructType>(TyR)->getNumElements();
- if (int Res = cmpNumbers(NumElementsL, NumElementsR))
- return Res;
- for (unsigned i = 0; i != NumElementsL; ++i) {
- if (int Res = cmpConstants(cast<Constant>(LS->getOperand(i)),
- cast<Constant>(RS->getOperand(i))))
- return Res;
- }
- return 0;
- }
- case Value::ConstantVectorVal: {
- const ConstantVector *LV = cast<ConstantVector>(L);
- const ConstantVector *RV = cast<ConstantVector>(R);
- unsigned NumElementsL = cast<VectorType>(TyL)->getNumElements();
- unsigned NumElementsR = cast<VectorType>(TyR)->getNumElements();
- if (int Res = cmpNumbers(NumElementsL, NumElementsR))
- return Res;
- for (uint64_t i = 0; i < NumElementsL; ++i) {
- if (int Res = cmpConstants(cast<Constant>(LV->getOperand(i)),
- cast<Constant>(RV->getOperand(i))))
- return Res;
- }
- return 0;
- }
- case Value::ConstantExprVal: {
- const ConstantExpr *LE = cast<ConstantExpr>(L);
- const ConstantExpr *RE = cast<ConstantExpr>(R);
- unsigned NumOperandsL = LE->getNumOperands();
- unsigned NumOperandsR = RE->getNumOperands();
- if (int Res = cmpNumbers(NumOperandsL, NumOperandsR))
- return Res;
- for (unsigned i = 0; i < NumOperandsL; ++i) {
- if (int Res = cmpConstants(cast<Constant>(LE->getOperand(i)),
- cast<Constant>(RE->getOperand(i))))
- return Res;
- }
- return 0;
- }
- case Value::BlockAddressVal: {
- const BlockAddress *LBA = cast<BlockAddress>(L);
- const BlockAddress *RBA = cast<BlockAddress>(R);
- if (int Res = cmpValues(LBA->getFunction(), RBA->getFunction()))
- return Res;
- if (LBA->getFunction() == RBA->getFunction()) {
- // They are BBs in the same function. Order by which comes first in the
- // BB order of the function. This order is deterministic.
- Function* F = LBA->getFunction();
- BasicBlock *LBB = LBA->getBasicBlock();
- BasicBlock *RBB = RBA->getBasicBlock();
- if (LBB == RBB)
- return 0;
- for(BasicBlock &BB : F->getBasicBlockList()) {
- if (&BB == LBB) {
- assert(&BB != RBB);
- return -1;
- }
- if (&BB == RBB)
- return 1;
- }
- llvm_unreachable("Basic Block Address does not point to a basic block in "
- "its function.");
- return -1;
- } else {
- // cmpValues said the functions are the same. So because they aren't
- // literally the same pointer, they must respectively be the left and
- // right functions.
- assert(LBA->getFunction() == FnL && RBA->getFunction() == FnR);
- // cmpValues will tell us if these are equivalent BasicBlocks, in the
- // context of their respective functions.
- return cmpValues(LBA->getBasicBlock(), RBA->getBasicBlock());
- }
- }
- default: // Unknown constant, abort.
- DEBUG(dbgs() << "Looking at valueID " << L->getValueID() << "\n");
- llvm_unreachable("Constant ValueID not recognized.");
- return -1;
- }
-}
-
-int FunctionComparator::cmpGlobalValues(GlobalValue *L, GlobalValue *R) const {
- return cmpNumbers(GlobalNumbers->getNumber(L), GlobalNumbers->getNumber(R));
-}
-
-/// cmpType - compares two types and
-/// defines a total ordering over the set of types.
-/// See method declaration comments for more details.
-int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const {
- PointerType *PTyL = dyn_cast<PointerType>(TyL);
- PointerType *PTyR = dyn_cast<PointerType>(TyR);
-
- const DataLayout &DL = FnL->getParent()->getDataLayout();
- if (PTyL && PTyL->getAddressSpace() == 0)
- TyL = DL.getIntPtrType(TyL);
- if (PTyR && PTyR->getAddressSpace() == 0)
- TyR = DL.getIntPtrType(TyR);
-
- if (TyL == TyR)
- return 0;
-
- if (int Res = cmpNumbers(TyL->getTypeID(), TyR->getTypeID()))
- return Res;
-
- switch (TyL->getTypeID()) {
- default:
- llvm_unreachable("Unknown type!");
- // Fall through in Release mode.
- case Type::IntegerTyID:
- return cmpNumbers(cast<IntegerType>(TyL)->getBitWidth(),
- cast<IntegerType>(TyR)->getBitWidth());
- case Type::VectorTyID: {
- VectorType *VTyL = cast<VectorType>(TyL), *VTyR = cast<VectorType>(TyR);
- if (int Res = cmpNumbers(VTyL->getNumElements(), VTyR->getNumElements()))
- return Res;
- return cmpTypes(VTyL->getElementType(), VTyR->getElementType());
- }
- // The TyL == TyR check earlier would have matched, because types are uniqued.
- case Type::VoidTyID:
- case Type::FloatTyID:
- case Type::DoubleTyID:
- case Type::X86_FP80TyID:
- case Type::FP128TyID:
- case Type::PPC_FP128TyID:
- case Type::LabelTyID:
- case Type::MetadataTyID:
- case Type::TokenTyID:
- return 0;
-
- case Type::PointerTyID: {
- assert(PTyL && PTyR && "Both types must be pointers here.");
- return cmpNumbers(PTyL->getAddressSpace(), PTyR->getAddressSpace());
- }
-
- case Type::StructTyID: {
- StructType *STyL = cast<StructType>(TyL);
- StructType *STyR = cast<StructType>(TyR);
- if (STyL->getNumElements() != STyR->getNumElements())
- return cmpNumbers(STyL->getNumElements(), STyR->getNumElements());
-
- if (STyL->isPacked() != STyR->isPacked())
- return cmpNumbers(STyL->isPacked(), STyR->isPacked());
-
- for (unsigned i = 0, e = STyL->getNumElements(); i != e; ++i) {
- if (int Res = cmpTypes(STyL->getElementType(i), STyR->getElementType(i)))
- return Res;
- }
- return 0;
- }
-
- case Type::FunctionTyID: {
- FunctionType *FTyL = cast<FunctionType>(TyL);
- FunctionType *FTyR = cast<FunctionType>(TyR);
- if (FTyL->getNumParams() != FTyR->getNumParams())
- return cmpNumbers(FTyL->getNumParams(), FTyR->getNumParams());
-
- if (FTyL->isVarArg() != FTyR->isVarArg())
- return cmpNumbers(FTyL->isVarArg(), FTyR->isVarArg());
-
- if (int Res = cmpTypes(FTyL->getReturnType(), FTyR->getReturnType()))
- return Res;
-
- for (unsigned i = 0, e = FTyL->getNumParams(); i != e; ++i) {
- if (int Res = cmpTypes(FTyL->getParamType(i), FTyR->getParamType(i)))
- return Res;
- }
- return 0;
- }
-
- case Type::ArrayTyID: {
- ArrayType *ATyL = cast<ArrayType>(TyL);
- ArrayType *ATyR = cast<ArrayType>(TyR);
- if (ATyL->getNumElements() != ATyR->getNumElements())
- return cmpNumbers(ATyL->getNumElements(), ATyR->getNumElements());
- return cmpTypes(ATyL->getElementType(), ATyR->getElementType());
- }
- }
-}
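The address-space-0 coercion at the top of cmpTypes is what lets the pass treat such pointers as interchangeable: both sides are rewritten to the target's pointer-sized integer type before any structural comparison, so differently-typed pointers compare equal. A rough standalone sketch of that effect using the LLVM API; the data layout string is an assumption chosen for the example.

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"

// Illustrative only: under a layout with 64-bit pointers, i8* and i32* in
// address space 0 both coerce to i64, so a cmpTypes-style comparison that
// performs this coercion first would report them as equal.
static bool pointersCoerceToSameType() {
  llvm::LLVMContext Ctx;
  llvm::DataLayout DL("p:64:64"); // hypothetical 64-bit pointer layout
  llvm::Type *I8Ptr = llvm::Type::getInt8PtrTy(Ctx);
  llvm::Type *I32Ptr = llvm::Type::getInt32PtrTy(Ctx);
  return DL.getIntPtrType(I8Ptr) == DL.getIntPtrType(I32Ptr);
}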
-
-// Determine whether the two operations are the same except that pointer-to-A
-// and pointer-to-B are equivalent. This should be kept in sync with
-// Instruction::isSameOperationAs.
-// Read method declaration comments for more details.
-int FunctionComparator::cmpOperations(const Instruction *L,
- const Instruction *R) const {
- // Differences from Instruction::isSameOperationAs:
- // * replace type comparison with calls to cmpTypes.
- // * we test for I->getRawSubclassOptionalData (nuw/nsw/tail) at the top.
- // * because of the above, we don't test for the tail bit on calls later on.
- if (int Res = cmpNumbers(L->getOpcode(), R->getOpcode()))
- return Res;
-
- if (int Res = cmpNumbers(L->getNumOperands(), R->getNumOperands()))
- return Res;
-
- if (int Res = cmpTypes(L->getType(), R->getType()))
- return Res;
-
- if (int Res = cmpNumbers(L->getRawSubclassOptionalData(),
- R->getRawSubclassOptionalData()))
- return Res;
-
- // We have two instructions of identical opcode and #operands. Check to see
- // if all operands are the same type
- for (unsigned i = 0, e = L->getNumOperands(); i != e; ++i) {
- if (int Res =
- cmpTypes(L->getOperand(i)->getType(), R->getOperand(i)->getType()))
- return Res;
- }
-
- // Check special state that is a part of some instructions.
- if (const AllocaInst *AI = dyn_cast<AllocaInst>(L)) {
- if (int Res = cmpTypes(AI->getAllocatedType(),
- cast<AllocaInst>(R)->getAllocatedType()))
- return Res;
- return cmpNumbers(AI->getAlignment(), cast<AllocaInst>(R)->getAlignment());
- }
- if (const LoadInst *LI = dyn_cast<LoadInst>(L)) {
- if (int Res = cmpNumbers(LI->isVolatile(), cast<LoadInst>(R)->isVolatile()))
- return Res;
- if (int Res =
- cmpNumbers(LI->getAlignment(), cast<LoadInst>(R)->getAlignment()))
- return Res;
- if (int Res =
- cmpOrderings(LI->getOrdering(), cast<LoadInst>(R)->getOrdering()))
- return Res;
- if (int Res =
- cmpNumbers(LI->getSynchScope(), cast<LoadInst>(R)->getSynchScope()))
- return Res;
- return cmpRangeMetadata(LI->getMetadata(LLVMContext::MD_range),
- cast<LoadInst>(R)->getMetadata(LLVMContext::MD_range));
- }
- if (const StoreInst *SI = dyn_cast<StoreInst>(L)) {
- if (int Res =
- cmpNumbers(SI->isVolatile(), cast<StoreInst>(R)->isVolatile()))
- return Res;
- if (int Res =
- cmpNumbers(SI->getAlignment(), cast<StoreInst>(R)->getAlignment()))
- return Res;
- if (int Res =
- cmpOrderings(SI->getOrdering(), cast<StoreInst>(R)->getOrdering()))
- return Res;
- return cmpNumbers(SI->getSynchScope(), cast<StoreInst>(R)->getSynchScope());
- }
- if (const CmpInst *CI = dyn_cast<CmpInst>(L))
- return cmpNumbers(CI->getPredicate(), cast<CmpInst>(R)->getPredicate());
- if (const CallInst *CI = dyn_cast<CallInst>(L)) {
- if (int Res = cmpNumbers(CI->getCallingConv(),
- cast<CallInst>(R)->getCallingConv()))
- return Res;
- if (int Res =
- cmpAttrs(CI->getAttributes(), cast<CallInst>(R)->getAttributes()))
- return Res;
- if (int Res = cmpOperandBundlesSchema(CI, R))
- return Res;
- return cmpRangeMetadata(
- CI->getMetadata(LLVMContext::MD_range),
- cast<CallInst>(R)->getMetadata(LLVMContext::MD_range));
- }
- if (const InvokeInst *II = dyn_cast<InvokeInst>(L)) {
- if (int Res = cmpNumbers(II->getCallingConv(),
- cast<InvokeInst>(R)->getCallingConv()))
- return Res;
- if (int Res =
- cmpAttrs(II->getAttributes(), cast<InvokeInst>(R)->getAttributes()))
- return Res;
- if (int Res = cmpOperandBundlesSchema(II, R))
- return Res;
- return cmpRangeMetadata(
- II->getMetadata(LLVMContext::MD_range),
- cast<InvokeInst>(R)->getMetadata(LLVMContext::MD_range));
- }
- if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(L)) {
- ArrayRef<unsigned> LIndices = IVI->getIndices();
- ArrayRef<unsigned> RIndices = cast<InsertValueInst>(R)->getIndices();
- if (int Res = cmpNumbers(LIndices.size(), RIndices.size()))
- return Res;
- for (size_t i = 0, e = LIndices.size(); i != e; ++i) {
- if (int Res = cmpNumbers(LIndices[i], RIndices[i]))
- return Res;
- }
- return 0;
- }
- if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(L)) {
- ArrayRef<unsigned> LIndices = EVI->getIndices();
- ArrayRef<unsigned> RIndices = cast<ExtractValueInst>(R)->getIndices();
- if (int Res = cmpNumbers(LIndices.size(), RIndices.size()))
- return Res;
- for (size_t i = 0, e = LIndices.size(); i != e; ++i) {
- if (int Res = cmpNumbers(LIndices[i], RIndices[i]))
- return Res;
- }
- }
- if (const FenceInst *FI = dyn_cast<FenceInst>(L)) {
- if (int Res =
- cmpOrderings(FI->getOrdering(), cast<FenceInst>(R)->getOrdering()))
- return Res;
- return cmpNumbers(FI->getSynchScope(), cast<FenceInst>(R)->getSynchScope());
- }
- if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(L)) {
- if (int Res = cmpNumbers(CXI->isVolatile(),
- cast<AtomicCmpXchgInst>(R)->isVolatile()))
- return Res;
- if (int Res = cmpNumbers(CXI->isWeak(),
- cast<AtomicCmpXchgInst>(R)->isWeak()))
- return Res;
- if (int Res =
- cmpOrderings(CXI->getSuccessOrdering(),
- cast<AtomicCmpXchgInst>(R)->getSuccessOrdering()))
- return Res;
- if (int Res =
- cmpOrderings(CXI->getFailureOrdering(),
- cast<AtomicCmpXchgInst>(R)->getFailureOrdering()))
- return Res;
- return cmpNumbers(CXI->getSynchScope(),
- cast<AtomicCmpXchgInst>(R)->getSynchScope());
- }
- if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(L)) {
- if (int Res = cmpNumbers(RMWI->getOperation(),
- cast<AtomicRMWInst>(R)->getOperation()))
- return Res;
- if (int Res = cmpNumbers(RMWI->isVolatile(),
- cast<AtomicRMWInst>(R)->isVolatile()))
- return Res;
- if (int Res = cmpOrderings(RMWI->getOrdering(),
- cast<AtomicRMWInst>(R)->getOrdering()))
- return Res;
- return cmpNumbers(RMWI->getSynchScope(),
- cast<AtomicRMWInst>(R)->getSynchScope());
- }
- if (const PHINode *PNL = dyn_cast<PHINode>(L)) {
- const PHINode *PNR = cast<PHINode>(R);
- // Ensure that in addition to the incoming values being identical
- // (checked by the caller of this function), the incoming blocks
- // are also identical.
- for (unsigned i = 0, e = PNL->getNumIncomingValues(); i != e; ++i) {
- if (int Res =
- cmpValues(PNL->getIncomingBlock(i), PNR->getIncomingBlock(i)))
- return Res;
- }
- }
- return 0;
-}
-
-// Determine whether two GEP operations perform the same underlying arithmetic.
-// Read method declaration comments for more details.
-int FunctionComparator::cmpGEPs(const GEPOperator *GEPL,
- const GEPOperator *GEPR) const {
-
- unsigned int ASL = GEPL->getPointerAddressSpace();
- unsigned int ASR = GEPR->getPointerAddressSpace();
-
- if (int Res = cmpNumbers(ASL, ASR))
- return Res;
-
- // When we have target data, we can reduce the GEP down to the value in bytes
- // added to the address.
- const DataLayout &DL = FnL->getParent()->getDataLayout();
- unsigned BitWidth = DL.getPointerSizeInBits(ASL);
- APInt OffsetL(BitWidth, 0), OffsetR(BitWidth, 0);
- if (GEPL->accumulateConstantOffset(DL, OffsetL) &&
- GEPR->accumulateConstantOffset(DL, OffsetR))
- return cmpAPInts(OffsetL, OffsetR);
- if (int Res = cmpTypes(GEPL->getSourceElementType(),
- GEPR->getSourceElementType()))
- return Res;
-
- if (int Res = cmpNumbers(GEPL->getNumOperands(), GEPR->getNumOperands()))
- return Res;
-
- for (unsigned i = 0, e = GEPL->getNumOperands(); i != e; ++i) {
- if (int Res = cmpValues(GEPL->getOperand(i), GEPR->getOperand(i)))
- return Res;
- }
-
- return 0;
-}
-
-int FunctionComparator::cmpInlineAsm(const InlineAsm *L,
- const InlineAsm *R) const {
- // InlineAsm's are uniqued. If they are the same pointer, obviously they are
- // the same, otherwise compare the fields.
- if (L == R)
- return 0;
- if (int Res = cmpTypes(L->getFunctionType(), R->getFunctionType()))
- return Res;
- if (int Res = cmpMem(L->getAsmString(), R->getAsmString()))
- return Res;
- if (int Res = cmpMem(L->getConstraintString(), R->getConstraintString()))
- return Res;
- if (int Res = cmpNumbers(L->hasSideEffects(), R->hasSideEffects()))
- return Res;
- if (int Res = cmpNumbers(L->isAlignStack(), R->isAlignStack()))
- return Res;
- if (int Res = cmpNumbers(L->getDialect(), R->getDialect()))
- return Res;
- llvm_unreachable("InlineAsm blocks were not uniqued.");
- return 0;
-}
-
-/// Compare two values used by the two functions under pair-wise comparison. If
-/// this is the first time the values are seen, they're added to the mapping so
-/// that we will detect mismatches on next use.
-/// See comments in declaration for more details.
-int FunctionComparator::cmpValues(const Value *L, const Value *R) const {
- // Catch self-reference case.
- if (L == FnL) {
- if (R == FnR)
- return 0;
- return -1;
- }
- if (R == FnR) {
- if (L == FnL)
- return 0;
- return 1;
- }
-
- const Constant *ConstL = dyn_cast<Constant>(L);
- const Constant *ConstR = dyn_cast<Constant>(R);
- if (ConstL && ConstR) {
- if (L == R)
- return 0;
- return cmpConstants(ConstL, ConstR);
- }
-
- if (ConstL)
- return 1;
- if (ConstR)
- return -1;
-
- const InlineAsm *InlineAsmL = dyn_cast<InlineAsm>(L);
- const InlineAsm *InlineAsmR = dyn_cast<InlineAsm>(R);
-
- if (InlineAsmL && InlineAsmR)
- return cmpInlineAsm(InlineAsmL, InlineAsmR);
- if (InlineAsmL)
- return 1;
- if (InlineAsmR)
- return -1;
-
- auto LeftSN = sn_mapL.insert(std::make_pair(L, sn_mapL.size())),
- RightSN = sn_mapR.insert(std::make_pair(R, sn_mapR.size()));
-
- return cmpNumbers(LeftSN.first->second, RightSN.first->second);
-}
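The tail of cmpValues carries the serial-numbering idea from the sn_mapL/sn_mapR comment: inserting a value with the map's current size as the candidate number assigns numbers in visit order, and the insert is a no-op for values that were already seen. A small sketch of just that trick under those assumptions, standalone and with std::map standing in for DenseMap:

#include <map>

// Assign a serial number to Key the first time it is seen; later calls return
// the original number, so values first met "at the same place" on the left
// and right sides end up with equal numbers.
static int serialNumber(std::map<const void *, int> &SNMap, const void *Key) {
  auto Inserted = SNMap.insert({Key, static_cast<int>(SNMap.size())});
  return Inserted.first->second;
}

// Two locals are considered equivalent iff they were first met at the same
// position during the paired walk of the two functions.
static int cmpBySerial(std::map<const void *, int> &MapL,
                       std::map<const void *, int> &MapR,
                       const void *L, const void *R) {
  int NL = serialNumber(MapL, L);
  int NR = serialNumber(MapR, R);
  if (NL < NR) return -1;
  if (NL > NR) return 1;
  return 0;
}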
-// Test whether two basic blocks have equivalent behaviour.
-int FunctionComparator::cmpBasicBlocks(const BasicBlock *BBL,
- const BasicBlock *BBR) const {
- BasicBlock::const_iterator InstL = BBL->begin(), InstLE = BBL->end();
- BasicBlock::const_iterator InstR = BBR->begin(), InstRE = BBR->end();
-
- do {
- if (int Res = cmpValues(&*InstL, &*InstR))
- return Res;
-
- const GetElementPtrInst *GEPL = dyn_cast<GetElementPtrInst>(InstL);
- const GetElementPtrInst *GEPR = dyn_cast<GetElementPtrInst>(InstR);
-
- if (GEPL && !GEPR)
- return 1;
- if (GEPR && !GEPL)
- return -1;
-
- if (GEPL && GEPR) {
- if (int Res =
- cmpValues(GEPL->getPointerOperand(), GEPR->getPointerOperand()))
- return Res;
- if (int Res = cmpGEPs(GEPL, GEPR))
- return Res;
- } else {
- if (int Res = cmpOperations(&*InstL, &*InstR))
- return Res;
- assert(InstL->getNumOperands() == InstR->getNumOperands());
-
- for (unsigned i = 0, e = InstL->getNumOperands(); i != e; ++i) {
- Value *OpL = InstL->getOperand(i);
- Value *OpR = InstR->getOperand(i);
- if (int Res = cmpValues(OpL, OpR))
- return Res;
- // cmpValues should ensure this is true.
- assert(cmpTypes(OpL->getType(), OpR->getType()) == 0);
- }
- }
-
- ++InstL;
- ++InstR;
- } while (InstL != InstLE && InstR != InstRE);
-
- if (InstL != InstLE && InstR == InstRE)
- return 1;
- if (InstL == InstLE && InstR != InstRE)
- return -1;
- return 0;
-}
-
-// Test whether the two functions have equivalent behaviour.
-int FunctionComparator::compare() {
- sn_mapL.clear();
- sn_mapR.clear();
-
- if (int Res = cmpAttrs(FnL->getAttributes(), FnR->getAttributes()))
- return Res;
-
- if (int Res = cmpNumbers(FnL->hasGC(), FnR->hasGC()))
- return Res;
-
- if (FnL->hasGC()) {
- if (int Res = cmpMem(FnL->getGC(), FnR->getGC()))
- return Res;
- }
-
- if (int Res = cmpNumbers(FnL->hasSection(), FnR->hasSection()))
- return Res;
-
- if (FnL->hasSection()) {
- if (int Res = cmpMem(FnL->getSection(), FnR->getSection()))
- return Res;
- }
-
- if (int Res = cmpNumbers(FnL->isVarArg(), FnR->isVarArg()))
- return Res;
-
- // TODO: if it's internal and only used in direct calls, we could handle this
- // case too.
- if (int Res = cmpNumbers(FnL->getCallingConv(), FnR->getCallingConv()))
- return Res;
-
- if (int Res = cmpTypes(FnL->getFunctionType(), FnR->getFunctionType()))
- return Res;
-
- assert(FnL->arg_size() == FnR->arg_size() &&
- "Identically typed functions have different numbers of args!");
-
- // Visit the arguments so that they get enumerated in the order they're
- // passed in.
- for (Function::const_arg_iterator ArgLI = FnL->arg_begin(),
- ArgRI = FnR->arg_begin(),
- ArgLE = FnL->arg_end();
- ArgLI != ArgLE; ++ArgLI, ++ArgRI) {
- if (cmpValues(&*ArgLI, &*ArgRI) != 0)
- llvm_unreachable("Arguments repeat!");
- }
-
- // We do a CFG-ordered walk since the actual ordering of the blocks in the
- // linked list is immaterial. Our walk starts at the entry block for both
- // functions, then takes each block from each terminator in order. As an
- // artifact, this also means that unreachable blocks are ignored.
- SmallVector<const BasicBlock *, 8> FnLBBs, FnRBBs;
- SmallPtrSet<const BasicBlock *, 32> VisitedBBs; // in terms of F1.
-
- FnLBBs.push_back(&FnL->getEntryBlock());
- FnRBBs.push_back(&FnR->getEntryBlock());
-
- VisitedBBs.insert(FnLBBs[0]);
- while (!FnLBBs.empty()) {
- const BasicBlock *BBL = FnLBBs.pop_back_val();
- const BasicBlock *BBR = FnRBBs.pop_back_val();
-
- if (int Res = cmpValues(BBL, BBR))
- return Res;
-
- if (int Res = cmpBasicBlocks(BBL, BBR))
- return Res;
-
- const TerminatorInst *TermL = BBL->getTerminator();
- const TerminatorInst *TermR = BBR->getTerminator();
-
- assert(TermL->getNumSuccessors() == TermR->getNumSuccessors());
- for (unsigned i = 0, e = TermL->getNumSuccessors(); i != e; ++i) {
- if (!VisitedBBs.insert(TermL->getSuccessor(i)).second)
- continue;
-
- FnLBBs.push_back(TermL->getSuccessor(i));
- FnRBBs.push_back(TermR->getSuccessor(i));
- }
- }
- return 0;
-}
-
-namespace {
-// Accumulate the hash of a sequence of 64-bit integers. Unlike a plain hash
-// of a buffer, the entire input does not need to be available at once. This
-// interface is necessary for functionHash because it needs to accumulate the
-// hash as the structure of the function is traversed, without saving these
-// values to an intermediate buffer. This form of hashing is not often needed,
-// as usually the object to hash is just read from a buffer.
-class HashAccumulator64 {
- uint64_t Hash;
-public:
- // Initialize to random constant, so the state isn't zero.
- HashAccumulator64() { Hash = 0x6acaa36bef8325c5ULL; }
- void add(uint64_t V) {
- Hash = llvm::hashing::detail::hash_16_bytes(Hash, V);
- }
- // No finishing is required, because the entire hash value is used.
- uint64_t getHash() { return Hash; }
-};
-} // end anonymous namespace
-
-// A function hash is calculated by considering only the number of arguments and
-// whether a function is varargs, the order of basic blocks (given by the
-// successors of each basic block in depth first order), and the order of
-// opcodes of each instruction within each of these basic blocks. This mirrors
-// the strategy compare() uses to compare functions by walking the BBs in depth
-// first order and comparing each instruction in sequence. Because this hash
-// does not look at the operands, it is insensitive to things such as the
-// target of calls and the constants used in the function, which makes it useful
-// when possibly merging functions which are the same modulo constants and call
-// targets.
-FunctionComparator::FunctionHash FunctionComparator::functionHash(Function &F) {
- HashAccumulator64 H;
- H.add(F.isVarArg());
- H.add(F.arg_size());
-
- SmallVector<const BasicBlock *, 8> BBs;
- SmallSet<const BasicBlock *, 16> VisitedBBs;
-
- // Walk the blocks in the same order as FunctionComparator::cmpBasicBlocks(),
- // accumulating the hash of the function "structure." (BB and opcode sequence)
- BBs.push_back(&F.getEntryBlock());
- VisitedBBs.insert(BBs[0]);
- while (!BBs.empty()) {
- const BasicBlock *BB = BBs.pop_back_val();
- // This random value acts as a block header, as otherwise the partition of
- // opcodes into BBs wouldn't affect the hash, only the order of the opcodes
- H.add(45798);
- for (auto &Inst : *BB) {
- H.add(Inst.getOpcode());
- }
- const TerminatorInst *Term = BB->getTerminator();
- for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
- if (!VisitedBBs.insert(Term->getSuccessor(i)).second)
- continue;
- BBs.push_back(Term->getSuccessor(i));
- }
- }
- return H.getHash();
-}
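Because the hash ignores operands and constants, it works as a cheap pre-filter: only functions whose hashes collide ever need the full, expensive comparison. A hedged sketch of that bucketing idea, independent of the pass's actual containers; the template parameters are placeholders.

#include <cstdint>
#include <map>
#include <vector>

// Group candidates by structural hash; only functions that share a bucket are
// handed to the full comparator. Identical code always hashes identically, so
// no merge opportunity is lost, while most pairs are never compared at all.
template <typename FunctionT, typename HashFn>
std::map<uint64_t, std::vector<FunctionT *>>
bucketByHash(const std::vector<FunctionT *> &Candidates, HashFn Hash) {
  std::map<uint64_t, std::vector<FunctionT *>> Buckets;
  for (FunctionT *F : Candidates)
    Buckets[Hash(*F)].push_back(F);
  return Buckets;
}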
-
-
-namespace {
/// MergeFunctions finds functions which will generate identical machine code,
/// by considering all pointer types to be equivalent. Once identified,
diff --git a/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp b/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp
index 49c4417..7ef3fc1 100644
--- a/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp
@@ -14,6 +14,9 @@
#include "llvm/Transforms/IPO/PartialInlining.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
@@ -29,161 +32,193 @@ using namespace llvm;
STATISTIC(NumPartialInlined, "Number of functions partially inlined");
namespace {
+struct PartialInlinerImpl {
+ PartialInlinerImpl(InlineFunctionInfo IFI) : IFI(IFI) {}
+ bool run(Module &M);
+ Function *unswitchFunction(Function *F);
+
+private:
+ InlineFunctionInfo IFI;
+};
struct PartialInlinerLegacyPass : public ModulePass {
static char ID; // Pass identification, replacement for typeid
PartialInlinerLegacyPass() : ModulePass(ID) {
initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
}
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ }
bool runOnModule(Module &M) override {
if (skipModule(M))
return false;
- ModuleAnalysisManager DummyMAM;
- auto PA = Impl.run(M, DummyMAM);
- return !PA.areAllPreserved();
- }
-
-private:
- PartialInlinerPass Impl;
- };
-}
-
-char PartialInlinerLegacyPass::ID = 0;
-INITIALIZE_PASS(PartialInlinerLegacyPass, "partial-inliner", "Partial Inliner",
- false, false)
-ModulePass *llvm::createPartialInliningPass() {
- return new PartialInlinerLegacyPass();
+ AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
+ std::function<AssumptionCache &(Function &)> GetAssumptionCache =
+ [&ACT](Function &F) -> AssumptionCache & {
+ return ACT->getAssumptionCache(F);
+ };
+ InlineFunctionInfo IFI(nullptr, &GetAssumptionCache);
+ return PartialInlinerImpl(IFI).run(M);
+ }
+};
}
-Function *PartialInlinerPass::unswitchFunction(Function *F) {
+Function *PartialInlinerImpl::unswitchFunction(Function *F) {
// First, verify that this function is an unswitching candidate...
- BasicBlock *entryBlock = &F->front();
- BranchInst *BR = dyn_cast<BranchInst>(entryBlock->getTerminator());
+ BasicBlock *EntryBlock = &F->front();
+ BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
if (!BR || BR->isUnconditional())
return nullptr;
-
- BasicBlock* returnBlock = nullptr;
- BasicBlock* nonReturnBlock = nullptr;
- unsigned returnCount = 0;
- for (BasicBlock *BB : successors(entryBlock)) {
+
+ BasicBlock *ReturnBlock = nullptr;
+ BasicBlock *NonReturnBlock = nullptr;
+ unsigned ReturnCount = 0;
+ for (BasicBlock *BB : successors(EntryBlock)) {
if (isa<ReturnInst>(BB->getTerminator())) {
- returnBlock = BB;
- returnCount++;
+ ReturnBlock = BB;
+ ReturnCount++;
} else
- nonReturnBlock = BB;
+ NonReturnBlock = BB;
}
-
- if (returnCount != 1)
+
+ if (ReturnCount != 1)
return nullptr;
-
+
// Clone the function, so that we can hack away on it.
ValueToValueMapTy VMap;
- Function* duplicateFunction = CloneFunction(F, VMap);
- duplicateFunction->setLinkage(GlobalValue::InternalLinkage);
- BasicBlock* newEntryBlock = cast<BasicBlock>(VMap[entryBlock]);
- BasicBlock* newReturnBlock = cast<BasicBlock>(VMap[returnBlock]);
- BasicBlock* newNonReturnBlock = cast<BasicBlock>(VMap[nonReturnBlock]);
-
+ Function *DuplicateFunction = CloneFunction(F, VMap);
+ DuplicateFunction->setLinkage(GlobalValue::InternalLinkage);
+ BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[EntryBlock]);
+ BasicBlock *NewReturnBlock = cast<BasicBlock>(VMap[ReturnBlock]);
+ BasicBlock *NewNonReturnBlock = cast<BasicBlock>(VMap[NonReturnBlock]);
+
// Go ahead and update all uses to the duplicate, so that we can just
// use the inliner functionality when we're done hacking.
- F->replaceAllUsesWith(duplicateFunction);
-
+ F->replaceAllUsesWith(DuplicateFunction);
+
// Special hackery is needed with PHI nodes that have inputs from more than
// one extracted block. For simplicity, just split the PHIs into a two-level
// sequence of PHIs, some of which will go in the extracted region, and some
// of which will go outside.
- BasicBlock* preReturn = newReturnBlock;
- newReturnBlock = newReturnBlock->splitBasicBlock(
- newReturnBlock->getFirstNonPHI()->getIterator());
- BasicBlock::iterator I = preReturn->begin();
- Instruction *Ins = &newReturnBlock->front();
- while (I != preReturn->end()) {
- PHINode* OldPhi = dyn_cast<PHINode>(I);
- if (!OldPhi) break;
-
- PHINode *retPhi = PHINode::Create(OldPhi->getType(), 2, "", Ins);
- OldPhi->replaceAllUsesWith(retPhi);
- Ins = newReturnBlock->getFirstNonPHI();
-
- retPhi->addIncoming(&*I, preReturn);
- retPhi->addIncoming(OldPhi->getIncomingValueForBlock(newEntryBlock),
- newEntryBlock);
- OldPhi->removeIncomingValue(newEntryBlock);
-
+ BasicBlock *PreReturn = NewReturnBlock;
+ NewReturnBlock = NewReturnBlock->splitBasicBlock(
+ NewReturnBlock->getFirstNonPHI()->getIterator());
+ BasicBlock::iterator I = PreReturn->begin();
+ Instruction *Ins = &NewReturnBlock->front();
+ while (I != PreReturn->end()) {
+ PHINode *OldPhi = dyn_cast<PHINode>(I);
+ if (!OldPhi)
+ break;
+
+ PHINode *RetPhi = PHINode::Create(OldPhi->getType(), 2, "", Ins);
+ OldPhi->replaceAllUsesWith(RetPhi);
+ Ins = NewReturnBlock->getFirstNonPHI();
+
+ RetPhi->addIncoming(&*I, PreReturn);
+ RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(NewEntryBlock),
+ NewEntryBlock);
+ OldPhi->removeIncomingValue(NewEntryBlock);
+
++I;
}
- newEntryBlock->getTerminator()->replaceUsesOfWith(preReturn, newReturnBlock);
-
+ NewEntryBlock->getTerminator()->replaceUsesOfWith(PreReturn, NewReturnBlock);
+
// Gather up the blocks that we're going to extract.
- std::vector<BasicBlock*> toExtract;
- toExtract.push_back(newNonReturnBlock);
- for (BasicBlock &BB : *duplicateFunction)
- if (&BB != newEntryBlock && &BB != newReturnBlock &&
- &BB != newNonReturnBlock)
- toExtract.push_back(&BB);
+ std::vector<BasicBlock *> ToExtract;
+ ToExtract.push_back(NewNonReturnBlock);
+ for (BasicBlock &BB : *DuplicateFunction)
+ if (&BB != NewEntryBlock && &BB != NewReturnBlock &&
+ &BB != NewNonReturnBlock)
+ ToExtract.push_back(&BB);
// The CodeExtractor needs a dominator tree.
DominatorTree DT;
- DT.recalculate(*duplicateFunction);
+ DT.recalculate(*DuplicateFunction);
+
+ // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
+ LoopInfo LI(DT);
+ BranchProbabilityInfo BPI(*DuplicateFunction, LI);
+ BlockFrequencyInfo BFI(*DuplicateFunction, BPI, LI);
// Extract the body of the if.
- Function* extractedFunction
- = CodeExtractor(toExtract, &DT).extractCodeRegion();
-
- InlineFunctionInfo IFI;
-
+ Function *ExtractedFunction =
+ CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, &BFI, &BPI)
+ .extractCodeRegion();
+
// Inline the top-level if test into all callers.
- std::vector<User *> Users(duplicateFunction->user_begin(),
- duplicateFunction->user_end());
+ std::vector<User *> Users(DuplicateFunction->user_begin(),
+ DuplicateFunction->user_end());
for (User *User : Users)
if (CallInst *CI = dyn_cast<CallInst>(User))
InlineFunction(CI, IFI);
else if (InvokeInst *II = dyn_cast<InvokeInst>(User))
InlineFunction(II, IFI);
-
+
// Ditch the duplicate, since we're done with it, and rewrite all remaining
// users (function pointers, etc.) back to the original function.
- duplicateFunction->replaceAllUsesWith(F);
- duplicateFunction->eraseFromParent();
-
+ DuplicateFunction->replaceAllUsesWith(F);
+ DuplicateFunction->eraseFromParent();
+
++NumPartialInlined;
-
- return extractedFunction;
+
+ return ExtractedFunction;
}
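The shape unswitchFunction looks for corresponds to source code where the entry block ends in a conditional branch, one successor returns immediately, and the other does the real work; after the transform only the cheap test is inlined at call sites while the heavy path lives in the extracted function. A hypothetical C++ example of such a candidate, with names and body chosen purely for illustration:

// Early-return guard: cheap to inline at every call site. The loop below
// would end up in the extracted (outlined) function.
int processBuffer(const char *Buf, int Len) {
  if (Len <= 0)  // entry block: conditional branch, one arm returns at once
    return 0;
  int Sum = 0;
  for (int I = 0; I < Len; ++I)
    Sum += Buf[I];
  return Sum;
}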
-PreservedAnalyses PartialInlinerPass::run(Module &M, ModuleAnalysisManager &) {
- std::vector<Function*> worklist;
- worklist.reserve(M.size());
+bool PartialInlinerImpl::run(Module &M) {
+ std::vector<Function *> Worklist;
+ Worklist.reserve(M.size());
for (Function &F : M)
if (!F.use_empty() && !F.isDeclaration())
- worklist.push_back(&F);
-
- bool changed = false;
- while (!worklist.empty()) {
- Function* currFunc = worklist.back();
- worklist.pop_back();
-
- if (currFunc->use_empty()) continue;
-
- bool recursive = false;
- for (User *U : currFunc->users())
- if (Instruction* I = dyn_cast<Instruction>(U))
- if (I->getParent()->getParent() == currFunc) {
- recursive = true;
+ Worklist.push_back(&F);
+
+ bool Changed = false;
+ while (!Worklist.empty()) {
+ Function *CurrFunc = Worklist.back();
+ Worklist.pop_back();
+
+ if (CurrFunc->use_empty())
+ continue;
+
+ bool Recursive = false;
+ for (User *U : CurrFunc->users())
+ if (Instruction *I = dyn_cast<Instruction>(U))
+ if (I->getParent()->getParent() == CurrFunc) {
+ Recursive = true;
break;
}
- if (recursive) continue;
-
-
- if (Function* newFunc = unswitchFunction(currFunc)) {
- worklist.push_back(newFunc);
- changed = true;
+ if (Recursive)
+ continue;
+
+ if (Function *NewFunc = unswitchFunction(CurrFunc)) {
+ Worklist.push_back(NewFunc);
+ Changed = true;
}
-
}
- if (changed)
+ return Changed;
+}
+
+char PartialInlinerLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
+ "Partial Inliner", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
+ "Partial Inliner", false, false)
+
+ModulePass *llvm::createPartialInliningPass() {
+ return new PartialInlinerLegacyPass();
+}
+
+PreservedAnalyses PartialInlinerPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ std::function<AssumptionCache &(Function &)> GetAssumptionCache =
+ [&FAM](Function &F) -> AssumptionCache & {
+ return FAM.getResult<AssumptionAnalysis>(F);
+ };
+ InlineFunctionInfo IFI(nullptr, &GetAssumptionCache);
+ if (PartialInlinerImpl(IFI).run(M))
return PreservedAnalyses::none();
return PreservedAnalyses::all();
}
diff --git a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
index df6a48e..941efb2 100644
--- a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -19,6 +19,7 @@
#include "llvm/Analysis/CFLAndersAliasAnalysis.h"
#include "llvm/Analysis/CFLSteensAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/ScopedNoAliasAA.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
@@ -66,14 +67,13 @@ static cl::opt<bool>
RunLoopRerolling("reroll-loops", cl::Hidden,
cl::desc("Run the loop rerolling pass"));
-static cl::opt<bool>
-RunFloat2Int("float-to-int", cl::Hidden, cl::init(true),
- cl::desc("Run the float2int (float demotion) pass"));
-
static cl::opt<bool> RunLoadCombine("combine-loads", cl::init(false),
cl::Hidden,
cl::desc("Run the load combining pass"));
+static cl::opt<bool> RunNewGVN("enable-newgvn", cl::init(false), cl::Hidden,
+ cl::desc("Run the NewGVN pass"));
+
static cl::opt<bool>
RunSLPAfterLoopVectorization("run-slp-after-loop-vectorization",
cl::init(true), cl::Hidden,
@@ -91,8 +91,7 @@ static cl::opt<CFLAAType>
clEnumValN(CFLAAType::Andersen, "anders",
"Enable inclusion-based CFL-AA"),
clEnumValN(CFLAAType::Both, "both",
- "Enable both variants of CFL-aa"),
- clEnumValEnd));
+ "Enable both variants of CFL-AA")));
static cl::opt<bool>
EnableMLSM("mlsm", cl::init(true), cl::Hidden,
@@ -111,10 +110,17 @@ static cl::opt<bool> EnableLoopLoadElim(
"enable-loop-load-elim", cl::init(true), cl::Hidden,
cl::desc("Enable the LoopLoadElimination Pass"));
-static cl::opt<std::string> RunPGOInstrGen(
- "profile-generate", cl::init(""), cl::Hidden,
- cl::desc("Enable generation phase of PGO instrumentation and specify the "
- "path of profile data file"));
+static cl::opt<bool>
+ EnablePrepareForThinLTO("prepare-for-thinlto", cl::init(false), cl::Hidden,
+ cl::desc("Enable preparation for ThinLTO."));
+
+static cl::opt<bool> RunPGOInstrGen(
+ "profile-generate", cl::init(false), cl::Hidden,
+ cl::desc("Enable PGO instrumentation."));
+
+static cl::opt<std::string>
+ PGOOutputFile("profile-generate-file", cl::init(""), cl::Hidden,
+ cl::desc("Specify the path of profile data file."));
static cl::opt<std::string> RunPGOInstrUse(
"profile-use", cl::init(""), cl::Hidden, cl::value_desc("filename"),
@@ -136,14 +142,18 @@ static cl::opt<int> PreInlineThreshold(
static cl::opt<bool> EnableGVNHoist(
"enable-gvn-hoist", cl::init(false), cl::Hidden,
- cl::desc("Enable the experimental GVN Hoisting pass"));
+ cl::desc("Enable the GVN hoisting pass"));
+
+static cl::opt<bool>
+ DisableLibCallsShrinkWrap("disable-libcalls-shrinkwrap", cl::init(false),
+ cl::Hidden,
+ cl::desc("Disable shrink-wrap library calls"));
PassManagerBuilder::PassManagerBuilder() {
OptLevel = 2;
SizeLevel = 0;
LibraryInfo = nullptr;
Inliner = nullptr;
- ModuleSummary = nullptr;
DisableUnitAtATime = false;
DisableUnrollLoops = false;
BBVectorize = RunBBVectorization;
@@ -151,14 +161,16 @@ PassManagerBuilder::PassManagerBuilder() {
LoopVectorize = RunLoopVectorization;
RerollLoops = RunLoopRerolling;
LoadCombine = RunLoadCombine;
+ NewGVN = RunNewGVN;
DisableGVNLoadPRE = false;
VerifyInput = false;
VerifyOutput = false;
MergeFunctions = false;
PrepareForLTO = false;
- PGOInstrGen = RunPGOInstrGen;
+ EnablePGOInstrGen = RunPGOInstrGen;
+ PGOInstrGen = PGOOutputFile;
PGOInstrUse = RunPGOInstrUse;
- PrepareForThinLTO = false;
+ PrepareForThinLTO = EnablePrepareForThinLTO;
PerformThinLTO = false;
}
@@ -243,24 +255,34 @@ void PassManagerBuilder::populateFunctionPassManager(
// Do PGO instrumentation generation or use pass as the option specified.
void PassManagerBuilder::addPGOInstrPasses(legacy::PassManagerBase &MPM) {
- if (PGOInstrGen.empty() && PGOInstrUse.empty())
+ if (!EnablePGOInstrGen && PGOInstrUse.empty())
return;
// Perform the preinline and cleanup passes for O1 and above.
// And avoid doing them if optimizing for size.
if (OptLevel > 0 && SizeLevel == 0 && !DisablePreInliner) {
- // Create preinline pass.
- MPM.add(createFunctionInliningPass(PreInlineThreshold));
+ // Create preinline pass. We construct an InlineParams object and specify
+ // the threshold here so that the command line options of the regular
+ // inliner do not influence pre-inlining. The only fields of InlineParams we
+ // care about are DefaultThreshold and HintThreshold.
+ InlineParams IP;
+ IP.DefaultThreshold = PreInlineThreshold;
+ // FIXME: The hint threshold has the same value used by the regular inliner.
+ // This should probably be lowered after performance testing.
+ IP.HintThreshold = 325;
+
+ MPM.add(createFunctionInliningPass(IP));
MPM.add(createSROAPass());
MPM.add(createEarlyCSEPass()); // Catch trivial redundancies
MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
MPM.add(createInstructionCombiningPass()); // Combine silly seq's
addExtensionsToPM(EP_Peephole, MPM);
}
- if (!PGOInstrGen.empty()) {
+ if (EnablePGOInstrGen) {
MPM.add(createPGOInstrumentationGenLegacyPass());
// Add the profile lowering pass.
InstrProfOptions Options;
- Options.InstrProfileOutput = PGOInstrGen;
+ if (!PGOInstrGen.empty())
+ Options.InstrProfileOutput = PGOInstrGen;
MPM.add(createInstrProfilingLegacyPass(Options));
}
if (!PGOInstrUse.empty())
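With the flag split above, turning instrumentation generation on is now separate from choosing the output path. A hedged sketch of how an embedder of PassManagerBuilder might drive the two new knobs; the file name and optimization level are assumptions, not taken from this patch.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"

// Illustrative only: request PGO instrumentation generation and an explicit
// profile output file, then populate a legacy module pass manager.
static void buildPGOGenPipeline(llvm::legacy::PassManager &MPM) {
  llvm::PassManagerBuilder PMB;
  PMB.OptLevel = 2;
  PMB.EnablePGOInstrGen = true;     // previously implied by a non-empty path
  PMB.PGOInstrGen = "code.profraw"; // hypothetical output file name
  PMB.populateModulePassManager(MPM);
}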
@@ -279,6 +301,8 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
// Combine silly seq's
addInstructionCombiningPass(MPM);
+ if (SizeLevel == 0 && !DisableLibCallsShrinkWrap)
+ MPM.add(createLibCallsShrinkWrapPass());
addExtensionsToPM(EP_Peephole, MPM);
MPM.add(createTailCallEliminationPass()); // Eliminate tail calls
@@ -304,7 +328,8 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
if (OptLevel > 1) {
if (EnableMLSM)
MPM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds
- MPM.add(createGVNPass(DisableGVNLoadPRE)); // Remove redundancies
+ MPM.add(NewGVN ? createNewGVNPass()
+ : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies
}
MPM.add(createMemCpyOptPass()); // Remove memcpy / form memset
MPM.add(createSCCPPass()); // Constant prop with SCCP
@@ -336,7 +361,9 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
addInstructionCombiningPass(MPM);
addExtensionsToPM(EP_Peephole, MPM);
if (OptLevel > 1 && UseGVNAfterVectorization)
- MPM.add(createGVNPass(DisableGVNLoadPRE)); // Remove redundancies
+ MPM.add(NewGVN
+ ? createNewGVNPass()
+ : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies
else
MPM.add(createEarlyCSEPass()); // Catch trivial redundancies
@@ -358,6 +385,11 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
void PassManagerBuilder::populateModulePassManager(
legacy::PassManagerBase &MPM) {
+ if (!PGOSampleUse.empty()) {
+ MPM.add(createPruneEHPass());
+ MPM.add(createSampleProfileLoaderPass(PGOSampleUse));
+ }
+
// Allow forcing function attributes as a debugging and tuning aid.
MPM.add(createForceFunctionAttrsLegacyPass());
@@ -380,6 +412,10 @@ void PassManagerBuilder::populateModulePassManager(
else if (!GlobalExtensions->empty() || !Extensions.empty())
MPM.add(createBarrierNoopPass());
+ if (PrepareForThinLTO)
+ // Rename anon globals to be able to export them in the summary.
+ MPM.add(createNameAnonGlobalPass());
+
addExtensionsToPM(EP_EnabledOnOptLevel0, MPM);
return;
}
@@ -390,6 +426,16 @@ void PassManagerBuilder::populateModulePassManager(
addInitialAliasAnalysisPasses(MPM);
+ // For ThinLTO there are two passes of indirect call promotion. The
+ // first is during the compile phase when PerformThinLTO=false and
+ // intra-module indirect call targets are promoted. The second is during
+ // the ThinLTO backend when PerformThinLTO=true, when we promote imported
+ // inter-module indirect calls. For that we perform indirect call promotion
+ // earlier in the pass pipeline, here before globalopt. Otherwise imported
+ // available_externally functions look unreferenced and are removed.
+ if (PerformThinLTO)
+ MPM.add(createPGOIndirectCallPromotionLegacyPass(/*InLTO = */ true));
+
if (!DisableUnitAtATime) {
// Infer attributes about declarations if possible.
MPM.add(createInferFunctionAttrsLegacyPass());
@@ -412,11 +458,12 @@ void PassManagerBuilder::populateModulePassManager(
/// PGO instrumentation is added during the compile phase for ThinLTO, do
/// not run it a second time
addPGOInstrPasses(MPM);
+ // Indirect call promotion that promotes intra-module targets only.
+ // For ThinLTO this is done earlier due to interactions with globalopt
+ // for imported functions.
+ MPM.add(createPGOIndirectCallPromotionLegacyPass());
}
- // Indirect call promotion that promotes intra-module targets only.
- MPM.add(createPGOIndirectCallPromotionLegacyPass());
-
if (EnableNonLTOGlobalsModRef)
// We add a module alias analysis pass here. In part due to bugs in the
// analysis infrastructure this "works" in that the analysis stays alive
@@ -435,6 +482,7 @@ void PassManagerBuilder::populateModulePassManager(
if (OptLevel > 2)
MPM.add(createArgumentPromotionPass()); // Scalarize uninlined fn args
+ addExtensionsToPM(EP_CGSCCOptimizerLate, MPM);
addFunctionSimplificationPasses(MPM);
// FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC
@@ -464,8 +512,8 @@ void PassManagerBuilder::populateModulePassManager(
if (PrepareForThinLTO) {
// Reduce the size of the IR as much as possible.
MPM.add(createGlobalOptimizerPass());
- // Rename anon function to be able to export them in the summary.
- MPM.add(createNameAnonFunctionPass());
+ // Rename anon globals to be able to export them in the summary.
+ MPM.add(createNameAnonGlobalPass());
return;
}
@@ -502,8 +550,7 @@ void PassManagerBuilder::populateModulePassManager(
// correct in the face of IR changes).
MPM.add(createGlobalsAAWrapperPass());
- if (RunFloat2Int)
- MPM.add(createFloat2IntPass());
+ MPM.add(createFloat2IntPass());
addExtensionsToPM(EP_VectorizerStart, MPM);
@@ -516,7 +563,7 @@ void PassManagerBuilder::populateModulePassManager(
// into separate loop that would otherwise inhibit vectorization. This is
// currently only performed for loops marked with the metadata
// llvm.loop.distribute=true or when -enable-loop-distribute is specified.
- MPM.add(createLoopDistributePass(/*ProcessAllLoopsByDefault=*/false));
+ MPM.add(createLoopDistributePass());
MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize));
@@ -560,7 +607,9 @@ void PassManagerBuilder::populateModulePassManager(
addInstructionCombiningPass(MPM);
addExtensionsToPM(EP_Peephole, MPM);
if (OptLevel > 1 && UseGVNAfterVectorization)
- MPM.add(createGVNPass(DisableGVNLoadPRE)); // Remove redundancies
+ MPM.add(NewGVN
+ ? createNewGVNPass()
+ : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies
else
MPM.add(createEarlyCSEPass()); // Catch trivial redundancies
@@ -585,10 +634,7 @@ void PassManagerBuilder::populateModulePassManager(
// outer loop. LICM pass can help to promote the runtime check out if the
// checked value is loop invariant.
MPM.add(createLICMPass());
-
- // Get rid of LCSSA nodes.
- MPM.add(createInstructionSimplifierPass());
- }
+ }
// After vectorization and unrolling, assume intrinsics may tell us more
// about pointer alignments.
@@ -609,6 +655,13 @@ void PassManagerBuilder::populateModulePassManager(
if (MergeFunctions)
MPM.add(createMergeFunctionsPass());
+ // The LoopSink pass sinks instructions hoisted by LICM, which serves as a
+ // canonicalization pass that enables other optimizations. As a result,
+ // LoopSink needs to be a very late IR pass to avoid undoing the LICM
+ // results too early.
+ MPM.add(createLoopSinkPass());
+ // Get rid of LCSSA nodes.
+ MPM.add(createInstructionSimplifierPass());
addExtensionsToPM(EP_OptimizerLast, MPM);
}
@@ -620,9 +673,6 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
// Provide AliasAnalysis services for optimizations.
addInitialAliasAnalysisPasses(PM);
- if (ModuleSummary)
- PM.add(createFunctionImportPass(ModuleSummary));
-
// Allow forcing function attributes as a debugging and tuning aid.
PM.add(createForceFunctionAttrsLegacyPass());
@@ -647,6 +697,11 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
PM.add(createPostOrderFunctionAttrsLegacyPass());
PM.add(createReversePostOrderFunctionAttrsPass());
+ // Split globals using inrange annotations on GEP indices. This can help
+ // improve the quality of generated code when virtual constant propagation or
+ // control flow integrity are enabled.
+ PM.add(createGlobalSplitPass());
+
// Apply whole-program devirtualization and virtual constant propagation.
PM.add(createWholeProgramDevirtPass());
@@ -706,7 +761,8 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
PM.add(createLICMPass()); // Hoist loop invariants.
if (EnableMLSM)
PM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds.
- PM.add(createGVNPass(DisableGVNLoadPRE)); // Remove redundancies.
+ PM.add(NewGVN ? createNewGVNPass()
+ : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies.
PM.add(createMemCpyOptPass()); // Remove dead memcpys.
// Nuke dead stores.
@@ -777,9 +833,6 @@ void PassManagerBuilder::populateThinLTOPassManager(
if (VerifyInput)
PM.add(createVerifierPass());
- if (ModuleSummary)
- PM.add(createFunctionImportPass(ModuleSummary));
-
populateModulePassManager(PM);
if (VerifyOutput)
@@ -804,7 +857,8 @@ void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) {
// Lower type metadata and the type.test intrinsic. This pass supports Clang's
// control flow integrity mechanisms (-fsanitize=cfi*) and needs to run at
// link time if CFI is enabled. The pass does nothing if CFI is disabled.
- PM.add(createLowerTypeTestsPass());
+ PM.add(createLowerTypeTestsPass(LowerTypeTestsSummaryAction::None,
+ /*Summary=*/nullptr));
if (OptLevel != 0)
addLateLTOOptimizationPasses(PM);
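
For orientation, a minimal sketch of how a frontend typically drives the PassManagerBuilder hooks touched above through the legacy pass manager; the option values and the inliner choice are illustrative assumptions, not part of this change:

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"

// Build and run an -O2-style module pipeline that stops after the ThinLTO
// pre-link passes (the PrepareForThinLTO early return shown in the hunk above).
static void runPreLinkPipeline(llvm::Module &M) {
  llvm::PassManagerBuilder PMB;
  PMB.OptLevel = 2;
  PMB.SizeLevel = 0;
  PMB.LoopVectorize = true;
  PMB.PrepareForThinLTO = true;
  PMB.Inliner = llvm::createFunctionInliningPass();

  llvm::legacy::PassManager MPM;
  PMB.populateModulePassManager(MPM); // schedules the passes listed in the diff
  MPM.run(M);
}
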
diff --git a/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp b/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp
index 2aa3fa5..d9acb9b 100644
--- a/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp
@@ -90,10 +90,7 @@ static bool runImpl(CallGraphSCC &SCC, CallGraph &CG) {
if (!F) {
SCCMightUnwind = true;
SCCMightReturn = true;
- } else if (F->isDeclaration() || F->isInterposable()) {
- // Note: isInterposable (as opposed to hasExactDefinition) is fine above,
- // since we're not inferring new attributes here, but only using existing,
- // assumed to be correct, function attributes.
+ } else if (!F->hasExactDefinition()) {
SCCMightUnwind |= !F->doesNotThrow();
SCCMightReturn |= !F->doesNotReturn();
} else {
diff --git a/contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp b/contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp
index 39de108..6a43f8d 100644
--- a/contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -88,6 +88,52 @@ typedef DenseMap<Edge, uint64_t> EdgeWeightMap;
typedef DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>>
BlockEdgeMap;
+class SampleCoverageTracker {
+public:
+ SampleCoverageTracker() : SampleCoverage(), TotalUsedSamples(0) {}
+
+ bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset,
+ uint32_t Discriminator, uint64_t Samples);
+ unsigned computeCoverage(unsigned Used, unsigned Total) const;
+ unsigned countUsedRecords(const FunctionSamples *FS) const;
+ unsigned countBodyRecords(const FunctionSamples *FS) const;
+ uint64_t getTotalUsedSamples() const { return TotalUsedSamples; }
+ uint64_t countBodySamples(const FunctionSamples *FS) const;
+ void clear() {
+ SampleCoverage.clear();
+ TotalUsedSamples = 0;
+ }
+
+private:
+ typedef std::map<LineLocation, unsigned> BodySampleCoverageMap;
+ typedef DenseMap<const FunctionSamples *, BodySampleCoverageMap>
+ FunctionSamplesCoverageMap;
+
+ /// Coverage map for sampling records.
+ ///
+ /// This map keeps a record of sampling records that have been matched to
+ /// an IR instruction. This is used to detect some form of staleness in
+ /// profiles (see flag -sample-profile-check-coverage).
+ ///
+ /// Each entry in the map corresponds to a FunctionSamples instance. This is
+ /// another map that counts how many times the sample record at the
+ /// given location has been used.
+ FunctionSamplesCoverageMap SampleCoverage;
+
+ /// Number of samples used from the profile.
+ ///
+ /// When a sampling record is used for the first time, the samples from
+ /// that record are added to this accumulator. Coverage is later computed
+ /// based on the total number of samples available in this function and
+ /// its callsites.
+ ///
+ /// Note that this accumulator tracks samples used from a single function
+ /// and all the inlined callsites. Strictly, we should have a map of counters
+ /// keyed by FunctionSamples pointers, but these stats are cleared after
+ /// every function, so we just need to keep a single counter.
+ uint64_t TotalUsedSamples;
+};
+
/// \brief Sample profile pass.
///
/// This pass reads profile data from the file specified by
@@ -110,9 +156,9 @@ protected:
bool runOnFunction(Function &F);
unsigned getFunctionLoc(Function &F);
bool emitAnnotations(Function &F);
- ErrorOr<uint64_t> getInstWeight(const Instruction &I) const;
- ErrorOr<uint64_t> getBlockWeight(const BasicBlock *BB) const;
- const FunctionSamples *findCalleeFunctionSamples(const CallInst &I) const;
+ ErrorOr<uint64_t> getInstWeight(const Instruction &I);
+ ErrorOr<uint64_t> getBlockWeight(const BasicBlock *BB);
+ const FunctionSamples *findCalleeFunctionSamples(const Instruction &I) const;
const FunctionSamples *findFunctionSamples(const Instruction &I) const;
bool inlineHotFunctions(Function &F);
void printEdgeWeight(raw_ostream &OS, Edge E);
@@ -125,7 +171,7 @@ protected:
void propagateWeights(Function &F);
uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge);
void buildEdges(Function &F);
- bool propagateThroughEdges(Function &F);
+ bool propagateThroughEdges(Function &F, bool UpdateBlockCount);
void computeDominanceAndLoopInfo(Function &F);
unsigned getOffset(unsigned L, unsigned H) const;
void clearFunctionData();
@@ -169,6 +215,8 @@ protected:
/// \brief Successors for each basic block in the CFG.
BlockEdgeMap Successors;
+ SampleCoverageTracker CoverageTracker;
+
/// \brief Profile reader object.
std::unique_ptr<SampleProfileReader> Reader;
@@ -176,7 +224,7 @@ protected:
FunctionSamples *Samples;
/// \brief Name of the profile file to load.
- StringRef Filename;
+ std::string Filename;
/// \brief Flag indicating whether the profile input loaded successfully.
bool ProfileIsValid;
@@ -204,64 +252,17 @@ public:
bool doInitialization(Module &M) override {
return SampleLoader.doInitialization(M);
}
- const char *getPassName() const override { return "Sample profile pass"; }
+ StringRef getPassName() const override { return "Sample profile pass"; }
bool runOnModule(Module &M) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
}
-private:
- SampleProfileLoader SampleLoader;
-};
-
-class SampleCoverageTracker {
-public:
- SampleCoverageTracker() : SampleCoverage(), TotalUsedSamples(0) {}
-
- bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset,
- uint32_t Discriminator, uint64_t Samples);
- unsigned computeCoverage(unsigned Used, unsigned Total) const;
- unsigned countUsedRecords(const FunctionSamples *FS) const;
- unsigned countBodyRecords(const FunctionSamples *FS) const;
- uint64_t getTotalUsedSamples() const { return TotalUsedSamples; }
- uint64_t countBodySamples(const FunctionSamples *FS) const;
- void clear() {
- SampleCoverage.clear();
- TotalUsedSamples = 0;
- }
private:
- typedef std::map<LineLocation, unsigned> BodySampleCoverageMap;
- typedef DenseMap<const FunctionSamples *, BodySampleCoverageMap>
- FunctionSamplesCoverageMap;
-
- /// Coverage map for sampling records.
- ///
- /// This map keeps a record of sampling records that have been matched to
- /// an IR instruction. This is used to detect some form of staleness in
- /// profiles (see flag -sample-profile-check-coverage).
- ///
- /// Each entry in the map corresponds to a FunctionSamples instance. This is
- /// another map that counts how many times the sample record at the
- /// given location has been used.
- FunctionSamplesCoverageMap SampleCoverage;
-
- /// Number of samples used from the profile.
- ///
- /// When a sampling record is used for the first time, the samples from
- /// that record are added to this accumulator. Coverage is later computed
- /// based on the total number of samples available in this function and
- /// its callsites.
- ///
- /// Note that this accumulator tracks samples used from a single function
- /// and all the inlined callsites. Strictly, we should have a map of counters
- /// keyed by FunctionSamples pointers, but these stats are cleared after
- /// every function, so we just need to keep a single counter.
- uint64_t TotalUsedSamples;
+ SampleProfileLoader SampleLoader;
};
-SampleCoverageTracker CoverageTracker;
-
 /// Return true if the given callsite is hot with respect to its caller.
///
/// Functions that were inlined in the original binary will be represented
@@ -451,7 +452,7 @@ void SampleProfileLoader::printBlockWeight(raw_ostream &OS,
///
/// \returns the weight of \p Inst.
ErrorOr<uint64_t>
-SampleProfileLoader::getInstWeight(const Instruction &Inst) const {
+SampleProfileLoader::getInstWeight(const Instruction &Inst) {
const DebugLoc &DLoc = Inst.getDebugLoc();
if (!DLoc)
return std::error_code();
@@ -460,18 +461,28 @@ SampleProfileLoader::getInstWeight(const Instruction &Inst) const {
if (!FS)
return std::error_code();
- // Ignore all dbg_value intrinsics.
- const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Inst);
- if (II && II->getIntrinsicID() == Intrinsic::dbg_value)
+ // Ignore all intrinsics and branch instructions.
+  // Branch instructions usually contain debug info from sources outside of
+  // the residing basic block, so we ignore them during annotation.
+ if (isa<BranchInst>(Inst) || isa<IntrinsicInst>(Inst))
return std::error_code();
+  // If a call/invoke instruction is inlined in the profile, but not inlined
+  // here, it means that the inlined callsite has no samples, so the call
+  // instruction should have a count of 0.
+ bool IsCall = isa<CallInst>(Inst) || isa<InvokeInst>(Inst);
+ if (IsCall && findCalleeFunctionSamples(Inst))
+ return 0;
+
const DILocation *DIL = DLoc;
unsigned Lineno = DLoc.getLine();
unsigned HeaderLineno = DIL->getScope()->getSubprogram()->getLine();
uint32_t LineOffset = getOffset(Lineno, HeaderLineno);
uint32_t Discriminator = DIL->getDiscriminator();
- ErrorOr<uint64_t> R = FS->findSamplesAt(LineOffset, Discriminator);
+ ErrorOr<uint64_t> R = IsCall
+ ? FS->findCallSamplesAt(LineOffset, Discriminator)
+ : FS->findSamplesAt(LineOffset, Discriminator);
if (R) {
bool FirstMark =
CoverageTracker.markSamplesUsed(FS, LineOffset, Discriminator, R.get());
@@ -488,13 +499,6 @@ SampleProfileLoader::getInstWeight(const Instruction &Inst) const {
<< Inst << " (line offset: " << Lineno - HeaderLineno << "."
<< DIL->getDiscriminator() << " - weight: " << R.get()
<< ")\n");
- } else {
- // If a call instruction is inlined in profile, but not inlined here,
- // it means that the inlined callsite has no sample, thus the call
- // instruction should have 0 count.
- const CallInst *CI = dyn_cast<CallInst>(&Inst);
- if (CI && findCalleeFunctionSamples(*CI))
- R = 0;
}
return R;
}
@@ -508,23 +512,17 @@ SampleProfileLoader::getInstWeight(const Instruction &Inst) const {
///
/// \returns the weight for \p BB.
ErrorOr<uint64_t>
-SampleProfileLoader::getBlockWeight(const BasicBlock *BB) const {
- DenseMap<uint64_t, uint64_t> CM;
+SampleProfileLoader::getBlockWeight(const BasicBlock *BB) {
+ uint64_t Max = 0;
+ bool HasWeight = false;
for (auto &I : BB->getInstList()) {
const ErrorOr<uint64_t> &R = getInstWeight(I);
- if (R) CM[R.get()]++;
- }
- if (CM.size() == 0) return std::error_code();
- uint64_t W = 0, C = 0;
- for (const auto &C_W : CM) {
- if (C_W.second == W) {
- C = std::max(C, C_W.first);
- } else if (C_W.second > W) {
- C = C_W.first;
- W = C_W.second;
+ if (R) {
+ Max = std::max(Max, R.get());
+ HasWeight = true;
}
}
- return C;
+ return HasWeight ? ErrorOr<uint64_t>(Max) : std::error_code();
}
/// \brief Compute and store the weights of every basic block.
@@ -551,18 +549,18 @@ bool SampleProfileLoader::computeBlockWeights(Function &F) {
/// \brief Get the FunctionSamples for a call instruction.
///
-/// The FunctionSamples of a call instruction \p Inst is the inlined
+/// The FunctionSamples of a call/invoke instruction \p Inst is the inlined
 /// instance that the call instruction calls into. It contains
 /// all samples that reside in the inlined instance. We first find the
/// inlined instance in which the call instruction is from, then we
/// traverse its children to find the callsite with the matching
-/// location and callee function name.
+/// location.
///
-/// \param Inst Call instruction to query.
+/// \param Inst Call/Invoke instruction to query.
///
/// \returns The FunctionSamples pointer to the inlined instance.
const FunctionSamples *
-SampleProfileLoader::findCalleeFunctionSamples(const CallInst &Inst) const {
+SampleProfileLoader::findCalleeFunctionSamples(const Instruction &Inst) const {
const DILocation *DIL = Inst.getDebugLoc();
if (!DIL) {
return nullptr;
@@ -611,7 +609,6 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
return FS;
}
-
/// \brief Iteratively inline hot callsites of a function.
///
/// Iteratively traverse all callsites of the function \p F, and find if
@@ -627,22 +624,36 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
bool SampleProfileLoader::inlineHotFunctions(Function &F) {
bool Changed = false;
LLVMContext &Ctx = F.getContext();
+ std::function<AssumptionCache &(Function &)> GetAssumptionCache = [&](
+ Function &F) -> AssumptionCache & { return ACT->getAssumptionCache(F); };
while (true) {
bool LocalChanged = false;
- SmallVector<CallInst *, 10> CIS;
+ SmallVector<Instruction *, 10> CIS;
for (auto &BB : F) {
+ bool Hot = false;
+ SmallVector<Instruction *, 10> Candidates;
for (auto &I : BB.getInstList()) {
- CallInst *CI = dyn_cast<CallInst>(&I);
- if (CI && callsiteIsHot(Samples, findCalleeFunctionSamples(*CI)))
- CIS.push_back(CI);
+ const FunctionSamples *FS = nullptr;
+ if ((isa<CallInst>(I) || isa<InvokeInst>(I)) &&
+ (FS = findCalleeFunctionSamples(I))) {
+ Candidates.push_back(&I);
+ if (callsiteIsHot(Samples, FS))
+ Hot = true;
+ }
+ }
+ if (Hot) {
+ CIS.insert(CIS.begin(), Candidates.begin(), Candidates.end());
}
}
- for (auto CI : CIS) {
- InlineFunctionInfo IFI(nullptr, ACT);
- Function *CalledFunction = CI->getCalledFunction();
- DebugLoc DLoc = CI->getDebugLoc();
- uint64_t NumSamples = findCalleeFunctionSamples(*CI)->getTotalSamples();
- if (InlineFunction(CI, IFI)) {
+ for (auto I : CIS) {
+ InlineFunctionInfo IFI(nullptr, ACT ? &GetAssumptionCache : nullptr);
+ CallSite CS(I);
+ Function *CalledFunction = CS.getCalledFunction();
+ if (!CalledFunction || !CalledFunction->getSubprogram())
+ continue;
+ DebugLoc DLoc = I->getDebugLoc();
+ uint64_t NumSamples = findCalleeFunctionSamples(*I)->getTotalSamples();
+ if (InlineFunction(CS, IFI)) {
LocalChanged = true;
emitOptimizationRemark(Ctx, DEBUG_TYPE, F, DLoc,
Twine("inlined hot callee '") +
@@ -693,6 +704,10 @@ void SampleProfileLoader::findEquivalencesFor(
bool IsInSameLoop = LI->getLoopFor(BB1) == LI->getLoopFor(BB2);
if (BB1 != BB2 && IsDomParent && IsInSameLoop) {
EquivalenceClass[BB2] = EC;
+ // If BB2 is visited, then the entire EC should be marked as visited.
+ if (VisitedBlocks.count(BB2)) {
+ VisitedBlocks.insert(EC);
+ }
// If BB2 is heavier than BB1, make BB2 have the same weight
// as BB1.
@@ -705,7 +720,11 @@ void SampleProfileLoader::findEquivalencesFor(
Weight = std::max(Weight, BlockWeights[BB2]);
}
}
- BlockWeights[EC] = Weight;
+ if (EC == &EC->getParent()->getEntryBlock()) {
+ BlockWeights[EC] = Samples->getHeadSamples() + 1;
+ } else {
+ BlockWeights[EC] = Weight;
+ }
}
/// \brief Find equivalence classes.
@@ -796,9 +815,12 @@ uint64_t SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges,
/// count of the basic block, if needed.
///
/// \param F Function to process.
+/// \param UpdateBlockCount Whether we should update basic block counts that
+///    have already been annotated.
///
/// \returns True if new weights were assigned to edges or blocks.
-bool SampleProfileLoader::propagateThroughEdges(Function &F) {
+bool SampleProfileLoader::propagateThroughEdges(Function &F,
+ bool UpdateBlockCount) {
bool Changed = false;
DEBUG(dbgs() << "\nPropagation through edges\n");
for (const auto &BI : F) {
@@ -890,11 +912,35 @@ bool SampleProfileLoader::propagateThroughEdges(Function &F) {
EdgeWeights[UnknownEdge] = BBWeight - TotalWeight;
else
EdgeWeights[UnknownEdge] = 0;
+ const BasicBlock *OtherEC;
+ if (i == 0)
+ OtherEC = EquivalenceClass[UnknownEdge.first];
+ else
+ OtherEC = EquivalenceClass[UnknownEdge.second];
+      // Edge weights should never exceed the weights of the BBs they connect.
+ if (VisitedBlocks.count(OtherEC) &&
+ EdgeWeights[UnknownEdge] > BlockWeights[OtherEC])
+ EdgeWeights[UnknownEdge] = BlockWeights[OtherEC];
VisitedEdges.insert(UnknownEdge);
Changed = true;
DEBUG(dbgs() << "Set weight for edge: ";
printEdgeWeight(dbgs(), UnknownEdge));
}
+ } else if (VisitedBlocks.count(EC) && BlockWeights[EC] == 0) {
+      // If a block's weight is 0, all its in/out edges should have weight 0.
+ if (i == 0) {
+ for (auto *Pred : Predecessors[BB]) {
+ Edge E = std::make_pair(Pred, BB);
+ EdgeWeights[E] = 0;
+ VisitedEdges.insert(E);
+ }
+ } else {
+ for (auto *Succ : Successors[BB]) {
+ Edge E = std::make_pair(BB, Succ);
+ EdgeWeights[E] = 0;
+ VisitedEdges.insert(E);
+ }
+ }
} else if (SelfReferentialEdge.first && VisitedBlocks.count(EC)) {
uint64_t &BBWeight = BlockWeights[BB];
// We have a self-referential edge and the weight of BB is known.
@@ -907,6 +953,11 @@ bool SampleProfileLoader::propagateThroughEdges(Function &F) {
DEBUG(dbgs() << "Set self-referential edge weight to: ";
printEdgeWeight(dbgs(), SelfReferentialEdge));
}
+ if (UpdateBlockCount && !VisitedBlocks.count(EC) && TotalWeight > 0) {
+ BlockWeights[EC] = TotalWeight;
+ VisitedBlocks.insert(EC);
+ Changed = true;
+ }
}
}
@@ -966,7 +1017,21 @@ void SampleProfileLoader::propagateWeights(Function &F) {
// Add an entry count to the function using the samples gathered
// at the function entry.
- F.setEntryCount(Samples->getHeadSamples());
+ F.setEntryCount(Samples->getHeadSamples() + 1);
+
+ // If BB weight is larger than its corresponding loop's header BB weight,
+ // use the BB weight to replace the loop header BB weight.
+ for (auto &BI : F) {
+ BasicBlock *BB = &BI;
+ Loop *L = LI->getLoopFor(BB);
+ if (!L) {
+ continue;
+ }
+ BasicBlock *Header = L->getHeader();
+ if (Header && BlockWeights[BB] > BlockWeights[Header]) {
+ BlockWeights[Header] = BlockWeights[BB];
+ }
+ }
// Before propagation starts, build, for each block, a list of
// unique predecessors and successors. This is necessary to handle
@@ -977,7 +1042,23 @@ void SampleProfileLoader::propagateWeights(Function &F) {
// Propagate until we converge or we go past the iteration limit.
while (Changed && I++ < SampleProfileMaxPropagateIterations) {
- Changed = propagateThroughEdges(F);
+ Changed = propagateThroughEdges(F, false);
+ }
+
+  // The first propagation pass propagates BB counts from annotated BBs to
+  // unknown BBs. The second propagation pass resets edge weights and uses all
+  // BB weights to propagate edge weights.
+ VisitedEdges.clear();
+ Changed = true;
+ while (Changed && I++ < SampleProfileMaxPropagateIterations) {
+ Changed = propagateThroughEdges(F, false);
+ }
+
+  // The third propagation pass adjusts annotated BB weights that are
+  // obviously wrong.
+ Changed = true;
+ while (Changed && I++ < SampleProfileMaxPropagateIterations) {
+ Changed = propagateThroughEdges(F, true);
}
// Generate MD_prof metadata for every branch instruction using the
@@ -994,7 +1075,7 @@ void SampleProfileLoader::propagateWeights(Function &F) {
if (!dyn_cast<IntrinsicInst>(&I)) {
SmallVector<uint32_t, 1> Weights;
Weights.push_back(BlockWeights[BB]);
- CI->setMetadata(LLVMContext::MD_prof,
+ CI->setMetadata(LLVMContext::MD_prof,
MDB.createBranchWeights(Weights));
}
}
@@ -1023,7 +1104,9 @@ void SampleProfileLoader::propagateWeights(Function &F) {
DEBUG(dbgs() << " (saturated due to uint32_t overflow)");
Weight = std::numeric_limits<uint32_t>::max();
}
- Weights.push_back(static_cast<uint32_t>(Weight));
+        // Weight is incremented by one to avoid propagation errors introduced
+        // by 0 weights.
+ Weights.push_back(static_cast<uint32_t>(Weight + 1));
if (Weight != 0) {
if (Weight > MaxWeight) {
MaxWeight = Weight;
@@ -1192,10 +1275,10 @@ bool SampleProfileLoader::emitAnnotations(Function &F) {
char SampleProfileLoaderLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(SampleProfileLoaderLegacyPass, "sample-profile",
- "Sample Profile loader", false, false)
+ "Sample Profile loader", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile",
- "Sample Profile loader", false, false)
+ "Sample Profile loader", false, false)
bool SampleProfileLoader::doInitialization(Module &M) {
auto &Ctx = M.getContext();
@@ -1232,12 +1315,13 @@ bool SampleProfileLoader::runOnModule(Module &M) {
clearFunctionData();
retval |= runOnFunction(F);
}
- M.setProfileSummary(Reader->getSummary().getMD(M.getContext()));
+ if (M.getProfileSummary() == nullptr)
+ M.setProfileSummary(Reader->getSummary().getMD(M.getContext()));
return retval;
}
bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) {
- // FIXME: pass in AssumptionCache correctly for the new pass manager.
+ // FIXME: pass in AssumptionCache correctly for the new pass manager.
SampleLoader.setACT(&getAnalysis<AssumptionCacheTracker>());
return SampleLoader.runOnModule(M);
}
@@ -1251,7 +1335,7 @@ bool SampleProfileLoader::runOnFunction(Function &F) {
}
PreservedAnalyses SampleProfileLoaderPass::run(Module &M,
- AnalysisManager<Module> &AM) {
+ ModuleAnalysisManager &AM) {
SampleProfileLoader SampleLoader(SampleProfileFile);
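
As a usage note, the loader changed above is normally scheduled as a module pass; a minimal sketch with the legacy pass manager follows, where "profile.prof" is a placeholder profile path rather than anything referenced by this commit:

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/IPO.h"

// Roughly what `opt -sample-profile -sample-profile-file=profile.prof` does.
static void runSampleProfileLoader(llvm::Module &M) {
  llvm::legacy::PassManager PM;
  PM.add(llvm::createSampleProfileLoaderPass("profile.prof"));
  PM.run(M);
}
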
diff --git a/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp b/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp
index fd25036..8f6f161 100644
--- a/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp
@@ -219,7 +219,8 @@ static bool StripSymbolNames(Module &M, bool PreserveDbgInfo) {
if (I.hasLocalLinkage() && llvmUsedValues.count(&I) == 0)
if (!PreserveDbgInfo || !I.getName().startswith("llvm.dbg"))
I.setName(""); // Internal symbols can't participate in linkage
- StripSymtab(I.getValueSymbolTable(), PreserveDbgInfo);
+ if (auto *Symtab = I.getValueSymbolTable())
+ StripSymtab(*Symtab, PreserveDbgInfo);
}
// Remove all names from types.
@@ -312,26 +313,29 @@ bool StripDeadDebugInfo::runOnModule(Module &M) {
// replace the current list of potentially dead global variables/functions
// with the live list.
SmallVector<Metadata *, 64> LiveGlobalVariables;
- SmallVector<Metadata *, 64> LiveSubprograms;
- DenseSet<const MDNode *> VisitedSet;
-
- std::set<DISubprogram *> LiveSPs;
- for (Function &F : M) {
- if (DISubprogram *SP = F.getSubprogram())
- LiveSPs.insert(SP);
+ DenseSet<DIGlobalVariableExpression *> VisitedSet;
+
+ std::set<DIGlobalVariableExpression *> LiveGVs;
+ for (GlobalVariable &GV : M.globals()) {
+ SmallVector<DIGlobalVariableExpression *, 1> GVEs;
+ GV.getDebugInfo(GVEs);
+ for (auto *GVE : GVEs)
+ LiveGVs.insert(GVE);
}
for (DICompileUnit *DIC : F.compile_units()) {
// Create our live global variable list.
bool GlobalVariableChange = false;
- for (DIGlobalVariable *DIG : DIC->getGlobalVariables()) {
+ for (auto *DIG : DIC->getGlobalVariables()) {
+ if (DIG->getExpression() && DIG->getExpression()->isConstant())
+ LiveGVs.insert(DIG);
+
// Make sure we only visit each global variable only once.
if (!VisitedSet.insert(DIG).second)
continue;
- // If the global variable referenced by DIG is not null, the global
- // variable is live.
- if (DIG->getVariable())
+ // If a global variable references DIG, the global variable is live.
+ if (LiveGVs.count(DIG))
LiveGlobalVariables.push_back(DIG);
else
GlobalVariableChange = true;
@@ -345,7 +349,6 @@ bool StripDeadDebugInfo::runOnModule(Module &M) {
}
// Reset lists for the next iteration.
- LiveSubprograms.clear();
LiveGlobalVariables.clear();
}
diff --git a/contrib/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/contrib/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
new file mode 100644
index 0000000..3680cfc
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -0,0 +1,344 @@
+//===- ThinLTOBitcodeWriter.cpp - Bitcode writing pass for ThinLTO --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass prepares a module containing type metadata for ThinLTO by splitting
+// it into regular and thin LTO parts if possible, and writing both parts to
+// a multi-module bitcode file. Modules that do not contain type metadata are
+// written unmodified as a single module.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Analysis/ModuleSummaryAnalysis.h"
+#include "llvm/Analysis/TypeMetadataUtils.h"
+#include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/ScopedPrinter.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+using namespace llvm;
+
+namespace {
+
+// Produce a unique identifier for this module by taking the MD5 sum of the
+// names of the module's strong external symbols. This identifier is
+// normally guaranteed to be unique, or the program would fail to link due to
+// multiply defined symbols.
+//
+// If the module has no strong external symbols (such a module may still have a
+// semantic effect if it performs global initialization), we cannot produce a
+// unique identifier for this module, so we return the empty string, which
+// causes the entire module to be written as a regular LTO module.
+std::string getModuleId(Module *M) {
+ MD5 Md5;
+ bool ExportsSymbols = false;
+ auto AddGlobal = [&](GlobalValue &GV) {
+ if (GV.isDeclaration() || GV.getName().startswith("llvm.") ||
+ !GV.hasExternalLinkage())
+ return;
+ ExportsSymbols = true;
+ Md5.update(GV.getName());
+ Md5.update(ArrayRef<uint8_t>{0});
+ };
+
+ for (auto &F : *M)
+ AddGlobal(F);
+ for (auto &GV : M->globals())
+ AddGlobal(GV);
+ for (auto &GA : M->aliases())
+ AddGlobal(GA);
+ for (auto &IF : M->ifuncs())
+ AddGlobal(IF);
+
+ if (!ExportsSymbols)
+ return "";
+
+ MD5::MD5Result R;
+ Md5.final(R);
+
+ SmallString<32> Str;
+ MD5::stringifyResult(R, Str);
+ return ("$" + Str).str();
+}
+
+// Promote each local-linkage entity defined by ExportM and used by ImportM by
+// changing visibility and appending the given ModuleId.
+void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId) {
+ auto PromoteInternal = [&](GlobalValue &ExportGV) {
+ if (!ExportGV.hasLocalLinkage())
+ return;
+
+ GlobalValue *ImportGV = ImportM.getNamedValue(ExportGV.getName());
+ if (!ImportGV || ImportGV->use_empty())
+ return;
+
+ std::string NewName = (ExportGV.getName() + ModuleId).str();
+
+ ExportGV.setName(NewName);
+ ExportGV.setLinkage(GlobalValue::ExternalLinkage);
+ ExportGV.setVisibility(GlobalValue::HiddenVisibility);
+
+ ImportGV->setName(NewName);
+ ImportGV->setVisibility(GlobalValue::HiddenVisibility);
+ };
+
+ for (auto &F : ExportM)
+ PromoteInternal(F);
+ for (auto &GV : ExportM.globals())
+ PromoteInternal(GV);
+ for (auto &GA : ExportM.aliases())
+ PromoteInternal(GA);
+ for (auto &IF : ExportM.ifuncs())
+ PromoteInternal(IF);
+}
+
+// Promote all internal (i.e. distinct) type ids used by the module by replacing
+// them with external type ids formed using the module id.
+//
+// Note that this needs to be done before we clone the module because each clone
+// will receive its own set of distinct metadata nodes.
+void promoteTypeIds(Module &M, StringRef ModuleId) {
+ DenseMap<Metadata *, Metadata *> LocalToGlobal;
+ auto ExternalizeTypeId = [&](CallInst *CI, unsigned ArgNo) {
+ Metadata *MD =
+ cast<MetadataAsValue>(CI->getArgOperand(ArgNo))->getMetadata();
+
+ if (isa<MDNode>(MD) && cast<MDNode>(MD)->isDistinct()) {
+ Metadata *&GlobalMD = LocalToGlobal[MD];
+ if (!GlobalMD) {
+ std::string NewName =
+ (to_string(LocalToGlobal.size()) + ModuleId).str();
+ GlobalMD = MDString::get(M.getContext(), NewName);
+ }
+
+ CI->setArgOperand(ArgNo,
+ MetadataAsValue::get(M.getContext(), GlobalMD));
+ }
+ };
+
+ if (Function *TypeTestFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::type_test))) {
+ for (const Use &U : TypeTestFunc->uses()) {
+ auto CI = cast<CallInst>(U.getUser());
+ ExternalizeTypeId(CI, 1);
+ }
+ }
+
+ if (Function *TypeCheckedLoadFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load))) {
+ for (const Use &U : TypeCheckedLoadFunc->uses()) {
+ auto CI = cast<CallInst>(U.getUser());
+ ExternalizeTypeId(CI, 2);
+ }
+ }
+
+ for (GlobalObject &GO : M.global_objects()) {
+ SmallVector<MDNode *, 1> MDs;
+ GO.getMetadata(LLVMContext::MD_type, MDs);
+
+ GO.eraseMetadata(LLVMContext::MD_type);
+ for (auto MD : MDs) {
+ auto I = LocalToGlobal.find(MD->getOperand(1));
+ if (I == LocalToGlobal.end()) {
+ GO.addMetadata(LLVMContext::MD_type, *MD);
+ continue;
+ }
+ GO.addMetadata(
+ LLVMContext::MD_type,
+ *MDNode::get(M.getContext(),
+ ArrayRef<Metadata *>{MD->getOperand(0), I->second}));
+ }
+ }
+}
+
+// Drop unused globals, and drop type information from function declarations.
+// FIXME: If we made functions typeless then there would be no need to do this.
+void simplifyExternals(Module &M) {
+ FunctionType *EmptyFT =
+ FunctionType::get(Type::getVoidTy(M.getContext()), false);
+
+ for (auto I = M.begin(), E = M.end(); I != E;) {
+ Function &F = *I++;
+ if (F.isDeclaration() && F.use_empty()) {
+ F.eraseFromParent();
+ continue;
+ }
+
+ if (!F.isDeclaration() || F.getFunctionType() == EmptyFT)
+ continue;
+
+ Function *NewF =
+ Function::Create(EmptyFT, GlobalValue::ExternalLinkage, "", &M);
+ NewF->setVisibility(F.getVisibility());
+ NewF->takeName(&F);
+ F.replaceAllUsesWith(ConstantExpr::getBitCast(NewF, F.getType()));
+ F.eraseFromParent();
+ }
+
+ for (auto I = M.global_begin(), E = M.global_end(); I != E;) {
+ GlobalVariable &GV = *I++;
+ if (GV.isDeclaration() && GV.use_empty()) {
+ GV.eraseFromParent();
+ continue;
+ }
+ }
+}
+
+void filterModule(
+ Module *M, std::function<bool(const GlobalValue *)> ShouldKeepDefinition) {
+ for (Function &F : *M) {
+ if (ShouldKeepDefinition(&F))
+ continue;
+
+ F.deleteBody();
+ F.clearMetadata();
+ }
+
+ for (GlobalVariable &GV : M->globals()) {
+ if (ShouldKeepDefinition(&GV))
+ continue;
+
+ GV.setInitializer(nullptr);
+ GV.setLinkage(GlobalValue::ExternalLinkage);
+ GV.clearMetadata();
+ }
+
+ for (Module::alias_iterator I = M->alias_begin(), E = M->alias_end();
+ I != E;) {
+ GlobalAlias *GA = &*I++;
+ if (ShouldKeepDefinition(GA))
+ continue;
+
+ GlobalObject *GO;
+ if (I->getValueType()->isFunctionTy())
+ GO = Function::Create(cast<FunctionType>(GA->getValueType()),
+ GlobalValue::ExternalLinkage, "", M);
+ else
+ GO = new GlobalVariable(
+ *M, GA->getValueType(), false, GlobalValue::ExternalLinkage,
+ (Constant *)nullptr, "", (GlobalVariable *)nullptr,
+ GA->getThreadLocalMode(), GA->getType()->getAddressSpace());
+ GO->takeName(GA);
+ GA->replaceAllUsesWith(GO);
+ GA->eraseFromParent();
+ }
+}
+
+// If it's possible to split M into regular and thin LTO parts, do so and write
+// a multi-module bitcode file with the two parts to OS. Otherwise, write only a
+// regular LTO bitcode file to OS.
+void splitAndWriteThinLTOBitcode(raw_ostream &OS, Module &M) {
+ std::string ModuleId = getModuleId(&M);
+ if (ModuleId.empty()) {
+    // We couldn't generate a module ID for this module, so just write it out
+    // as a regular LTO module.
+ WriteBitcodeToFile(&M, OS);
+ return;
+ }
+
+ promoteTypeIds(M, ModuleId);
+
+ auto IsInMergedM = [&](const GlobalValue *GV) {
+ auto *GVar = dyn_cast<GlobalVariable>(GV->getBaseObject());
+ if (!GVar)
+ return false;
+
+ SmallVector<MDNode *, 1> MDs;
+ GVar->getMetadata(LLVMContext::MD_type, MDs);
+ return !MDs.empty();
+ };
+
+ ValueToValueMapTy VMap;
+ std::unique_ptr<Module> MergedM(CloneModule(&M, VMap, IsInMergedM));
+
+ filterModule(&M, [&](const GlobalValue *GV) { return !IsInMergedM(GV); });
+
+ promoteInternals(*MergedM, M, ModuleId);
+ promoteInternals(M, *MergedM, ModuleId);
+
+ simplifyExternals(*MergedM);
+
+ SmallVector<char, 0> Buffer;
+ BitcodeWriter W(Buffer);
+
+ // FIXME: Try to re-use BSI and PFI from the original module here.
+ ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, nullptr);
+ W.writeModule(&M, /*ShouldPreserveUseListOrder=*/false, &Index,
+ /*GenerateHash=*/true);
+
+ W.writeModule(MergedM.get());
+
+ OS << Buffer;
+}
+
+// Returns whether this module needs to be split because it uses type metadata.
+bool requiresSplit(Module &M) {
+ SmallVector<MDNode *, 1> MDs;
+ for (auto &GO : M.global_objects()) {
+ GO.getMetadata(LLVMContext::MD_type, MDs);
+ if (!MDs.empty())
+ return true;
+ }
+
+ return false;
+}
+
+void writeThinLTOBitcode(raw_ostream &OS, Module &M,
+ const ModuleSummaryIndex *Index) {
+ // See if this module has any type metadata. If so, we need to split it.
+ if (requiresSplit(M))
+ return splitAndWriteThinLTOBitcode(OS, M);
+
+ // Otherwise we can just write it out as a regular module.
+ WriteBitcodeToFile(&M, OS, /*ShouldPreserveUseListOrder=*/false, Index,
+ /*GenerateHash=*/true);
+}
+
+class WriteThinLTOBitcode : public ModulePass {
+ raw_ostream &OS; // raw_ostream to print on
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WriteThinLTOBitcode() : ModulePass(ID), OS(dbgs()) {
+ initializeWriteThinLTOBitcodePass(*PassRegistry::getPassRegistry());
+ }
+
+ explicit WriteThinLTOBitcode(raw_ostream &o)
+ : ModulePass(ID), OS(o) {
+ initializeWriteThinLTOBitcodePass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return "ThinLTO Bitcode Writer"; }
+
+ bool runOnModule(Module &M) override {
+ const ModuleSummaryIndex *Index =
+ &(getAnalysis<ModuleSummaryIndexWrapperPass>().getIndex());
+ writeThinLTOBitcode(OS, M, Index);
+ return true;
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ AU.addRequired<ModuleSummaryIndexWrapperPass>();
+ }
+};
+} // anonymous namespace
+
+char WriteThinLTOBitcode::ID = 0;
+INITIALIZE_PASS_BEGIN(WriteThinLTOBitcode, "write-thinlto-bitcode",
+ "Write ThinLTO Bitcode", false, true)
+INITIALIZE_PASS_DEPENDENCY(ModuleSummaryIndexWrapperPass)
+INITIALIZE_PASS_END(WriteThinLTOBitcode, "write-thinlto-bitcode",
+ "Write ThinLTO Bitcode", false, true)
+
+ModulePass *llvm::createWriteThinLTOBitcodePass(raw_ostream &Str) {
+ return new WriteThinLTOBitcode(Str);
+}
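
For context, the new pass is typically attached to an output stream and run over a module. The sketch below assumes a placeholder output path of "out.thinlto.bc" and otherwise uses only the entry point declared for this file (createWriteThinLTOBitcodePass):

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO.h"

// Writes a (possibly split) ThinLTO bitcode file for M. The legacy pass
// manager schedules the required ModuleSummaryIndexWrapperPass analysis.
static void emitThinLTOBitcode(llvm::Module &M) {
  std::error_code EC;
  llvm::raw_fd_ostream OS("out.thinlto.bc", EC, llvm::sys::fs::F_None);
  if (EC)
    return;
  llvm::legacy::PassManager PM;
  PM.add(llvm::createWriteThinLTOBitcodePass(OS));
  PM.run(M);
}
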
diff --git a/contrib/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/contrib/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
index 53eb4e2..844cc0f 100644
--- a/contrib/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -29,24 +29,43 @@
#include "llvm/Transforms/IPO/WholeProgramDevirt.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TypeMetadataUtils.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Evaluator.h"
-#include "llvm/Transforms/Utils/Local.h"
-
+#include <algorithm>
+#include <cstddef>
+#include <map>
#include <set>
+#include <string>
using namespace llvm;
using namespace wholeprogramdevirt;
@@ -166,7 +185,7 @@ void wholeprogramdevirt::setAfterReturnValues(
VirtualCallTarget::VirtualCallTarget(Function *Fn, const TypeMemberInfo *TM)
: Fn(Fn), TM(TM),
- IsBigEndian(Fn->getParent()->getDataLayout().isBigEndian()) {}
+ IsBigEndian(Fn->getParent()->getDataLayout().isBigEndian()), WasDevirt(false) {}
namespace {
@@ -178,7 +197,7 @@ struct VTableSlot {
uint64_t ByteOffset;
};
-}
+} // end anonymous namespace
namespace llvm {
@@ -201,7 +220,7 @@ template <> struct DenseMapInfo<VTableSlot> {
}
};
-}
+} // end namespace llvm
namespace {
@@ -216,15 +235,18 @@ struct VirtualCallSite {
// of that field for details.
unsigned *NumUnsafeUses;
- void emitRemark() {
+ void emitRemark(const Twine &OptName, const Twine &TargetName) {
Function *F = CS.getCaller();
- emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F,
- CS.getInstruction()->getDebugLoc(),
- "devirtualized call");
+ emitOptimizationRemark(
+ F->getContext(), DEBUG_TYPE, *F,
+ CS.getInstruction()->getDebugLoc(),
+ OptName + ": devirtualized a call to " + TargetName);
}
- void replaceAndErase(Value *New) {
- emitRemark();
+ void replaceAndErase(const Twine &OptName, const Twine &TargetName,
+ bool RemarksEnabled, Value *New) {
+ if (RemarksEnabled)
+ emitRemark(OptName, TargetName);
CS->replaceAllUsesWith(New);
if (auto II = dyn_cast<InvokeInst>(CS.getInstruction())) {
BranchInst::Create(II->getNormalDest(), CS.getInstruction());
@@ -243,6 +265,8 @@ struct DevirtModule {
PointerType *Int8PtrTy;
IntegerType *Int32Ty;
+ bool RemarksEnabled;
+
MapVector<VTableSlot, std::vector<VirtualCallSite>> CallSlots;
// This map keeps track of the number of "unsafe" uses of a loaded function
@@ -258,7 +282,10 @@ struct DevirtModule {
DevirtModule(Module &M)
: M(M), Int8Ty(Type::getInt8Ty(M.getContext())),
Int8PtrTy(Type::getInt8PtrTy(M.getContext())),
- Int32Ty(Type::getInt32Ty(M.getContext())) {}
+ Int32Ty(Type::getInt32Ty(M.getContext())),
+ RemarksEnabled(areRemarksEnabled()) {}
+
+ bool areRemarksEnabled();
void scanTypeTestUsers(Function *TypeTestFunc, Function *AssumeFunc);
void scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc);
@@ -266,20 +293,21 @@ struct DevirtModule {
void buildTypeIdentifierMap(
std::vector<VTableBits> &Bits,
DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap);
+ Constant *getPointerAtOffset(Constant *I, uint64_t Offset);
bool
tryFindVirtualCallTargets(std::vector<VirtualCallTarget> &TargetsForSlot,
const std::set<TypeMemberInfo> &TypeMemberInfos,
uint64_t ByteOffset);
- bool trySingleImplDevirt(ArrayRef<VirtualCallTarget> TargetsForSlot,
+ bool trySingleImplDevirt(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
MutableArrayRef<VirtualCallSite> CallSites);
bool tryEvaluateFunctionsWithArgs(
MutableArrayRef<VirtualCallTarget> TargetsForSlot,
ArrayRef<ConstantInt *> Args);
bool tryUniformRetValOpt(IntegerType *RetType,
- ArrayRef<VirtualCallTarget> TargetsForSlot,
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot,
MutableArrayRef<VirtualCallSite> CallSites);
bool tryUniqueRetValOpt(unsigned BitWidth,
- ArrayRef<VirtualCallTarget> TargetsForSlot,
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot,
MutableArrayRef<VirtualCallSite> CallSites);
bool tryVirtualConstProp(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
ArrayRef<VirtualCallSite> CallSites);
@@ -291,10 +319,12 @@ struct DevirtModule {
struct WholeProgramDevirt : public ModulePass {
static char ID;
+
WholeProgramDevirt() : ModulePass(ID) {
initializeWholeProgramDevirtPass(*PassRegistry::getPassRegistry());
}
- bool runOnModule(Module &M) {
+
+ bool runOnModule(Module &M) override {
if (skipModule(M))
return false;
@@ -302,7 +332,7 @@ struct WholeProgramDevirt : public ModulePass {
}
};
-} // anonymous namespace
+} // end anonymous namespace
INITIALIZE_PASS(WholeProgramDevirt, "wholeprogramdevirt",
"Whole program devirtualization", false, false)
@@ -353,6 +383,38 @@ void DevirtModule::buildTypeIdentifierMap(
}
}
+Constant *DevirtModule::getPointerAtOffset(Constant *I, uint64_t Offset) {
+ if (I->getType()->isPointerTy()) {
+ if (Offset == 0)
+ return I;
+ return nullptr;
+ }
+
+ const DataLayout &DL = M.getDataLayout();
+
+ if (auto *C = dyn_cast<ConstantStruct>(I)) {
+ const StructLayout *SL = DL.getStructLayout(C->getType());
+ if (Offset >= SL->getSizeInBytes())
+ return nullptr;
+
+ unsigned Op = SL->getElementContainingOffset(Offset);
+ return getPointerAtOffset(cast<Constant>(I->getOperand(Op)),
+ Offset - SL->getElementOffset(Op));
+ }
+ if (auto *C = dyn_cast<ConstantArray>(I)) {
+ ArrayType *VTableTy = C->getType();
+ uint64_t ElemSize = DL.getTypeAllocSize(VTableTy->getElementType());
+
+ unsigned Op = Offset / ElemSize;
+ if (Op >= C->getNumOperands())
+ return nullptr;
+
+ return getPointerAtOffset(cast<Constant>(I->getOperand(Op)),
+ Offset % ElemSize);
+ }
+ return nullptr;
+}
+
bool DevirtModule::tryFindVirtualCallTargets(
std::vector<VirtualCallTarget> &TargetsForSlot,
const std::set<TypeMemberInfo> &TypeMemberInfos, uint64_t ByteOffset) {
@@ -360,22 +422,12 @@ bool DevirtModule::tryFindVirtualCallTargets(
if (!TM.Bits->GV->isConstant())
return false;
- auto Init = dyn_cast<ConstantArray>(TM.Bits->GV->getInitializer());
- if (!Init)
- return false;
- ArrayType *VTableTy = Init->getType();
-
- uint64_t ElemSize =
- M.getDataLayout().getTypeAllocSize(VTableTy->getElementType());
- uint64_t GlobalSlotOffset = TM.Offset + ByteOffset;
- if (GlobalSlotOffset % ElemSize != 0)
- return false;
-
- unsigned Op = GlobalSlotOffset / ElemSize;
- if (Op >= Init->getNumOperands())
+ Constant *Ptr = getPointerAtOffset(TM.Bits->GV->getInitializer(),
+ TM.Offset + ByteOffset);
+ if (!Ptr)
return false;
- auto Fn = dyn_cast<Function>(Init->getOperand(Op)->stripPointerCasts());
+ auto Fn = dyn_cast<Function>(Ptr->stripPointerCasts());
if (!Fn)
return false;
@@ -392,7 +444,7 @@ bool DevirtModule::tryFindVirtualCallTargets(
}
bool DevirtModule::trySingleImplDevirt(
- ArrayRef<VirtualCallTarget> TargetsForSlot,
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot,
MutableArrayRef<VirtualCallSite> CallSites) {
// See if the program contains a single implementation of this virtual
// function.
@@ -401,9 +453,12 @@ bool DevirtModule::trySingleImplDevirt(
if (TheFn != Target.Fn)
return false;
+ if (RemarksEnabled)
+ TargetsForSlot[0].WasDevirt = true;
// If so, update each call site to call that implementation directly.
for (auto &&VCallSite : CallSites) {
- VCallSite.emitRemark();
+ if (RemarksEnabled)
+ VCallSite.emitRemark("single-impl", TheFn->getName());
VCallSite.CS.setCalledFunction(ConstantExpr::getBitCast(
TheFn, VCallSite.CS.getCalledValue()->getType()));
// This use is no longer unsafe.
@@ -441,7 +496,7 @@ bool DevirtModule::tryEvaluateFunctionsWithArgs(
}
bool DevirtModule::tryUniformRetValOpt(
- IntegerType *RetType, ArrayRef<VirtualCallTarget> TargetsForSlot,
+ IntegerType *RetType, MutableArrayRef<VirtualCallTarget> TargetsForSlot,
MutableArrayRef<VirtualCallSite> CallSites) {
// Uniform return value optimization. If all functions return the same
// constant, replace all calls with that constant.
@@ -452,16 +507,20 @@ bool DevirtModule::tryUniformRetValOpt(
auto TheRetValConst = ConstantInt::get(RetType, TheRetVal);
for (auto Call : CallSites)
- Call.replaceAndErase(TheRetValConst);
+ Call.replaceAndErase("uniform-ret-val", TargetsForSlot[0].Fn->getName(),
+ RemarksEnabled, TheRetValConst);
+ if (RemarksEnabled)
+ for (auto &&Target : TargetsForSlot)
+ Target.WasDevirt = true;
return true;
}
bool DevirtModule::tryUniqueRetValOpt(
- unsigned BitWidth, ArrayRef<VirtualCallTarget> TargetsForSlot,
+ unsigned BitWidth, MutableArrayRef<VirtualCallTarget> TargetsForSlot,
MutableArrayRef<VirtualCallSite> CallSites) {
// IsOne controls whether we look for a 0 or a 1.
auto tryUniqueRetValOptFor = [&](bool IsOne) {
- const TypeMemberInfo *UniqueMember = 0;
+ const TypeMemberInfo *UniqueMember = nullptr;
for (const VirtualCallTarget &Target : TargetsForSlot) {
if (Target.RetVal == (IsOne ? 1 : 0)) {
if (UniqueMember)
@@ -481,8 +540,14 @@ bool DevirtModule::tryUniqueRetValOpt(
OneAddr = B.CreateConstGEP1_64(OneAddr, UniqueMember->Offset);
Value *Cmp = B.CreateICmp(IsOne ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE,
Call.VTable, OneAddr);
- Call.replaceAndErase(Cmp);
+ Call.replaceAndErase("unique-ret-val", TargetsForSlot[0].Fn->getName(),
+ RemarksEnabled, Cmp);
}
+ // Update devirtualization statistics for targets.
+ if (RemarksEnabled)
+ for (auto &&Target : TargetsForSlot)
+ Target.WasDevirt = true;
+
return true;
};
@@ -590,6 +655,10 @@ bool DevirtModule::tryVirtualConstProp(
setAfterReturnValues(TargetsForSlot, AllocAfter, BitWidth, OffsetByte,
OffsetBit);
+ if (RemarksEnabled)
+ for (auto &&Target : TargetsForSlot)
+ Target.WasDevirt = true;
+
// Rewrite each call to a load from OffsetByte/OffsetBit.
for (auto Call : CSByConstantArg.second) {
IRBuilder<> B(Call.CS.getInstruction());
@@ -599,11 +668,15 @@ bool DevirtModule::tryVirtualConstProp(
Value *Bit = ConstantInt::get(Int8Ty, 1ULL << OffsetBit);
Value *BitsAndBit = B.CreateAnd(Bits, Bit);
auto IsBitSet = B.CreateICmpNE(BitsAndBit, ConstantInt::get(Int8Ty, 0));
- Call.replaceAndErase(IsBitSet);
+ Call.replaceAndErase("virtual-const-prop-1-bit",
+ TargetsForSlot[0].Fn->getName(),
+ RemarksEnabled, IsBitSet);
} else {
Value *ValAddr = B.CreateBitCast(Addr, RetType->getPointerTo());
Value *Val = B.CreateLoad(RetType, ValAddr);
- Call.replaceAndErase(Val);
+ Call.replaceAndErase("virtual-const-prop",
+ TargetsForSlot[0].Fn->getName(),
+ RemarksEnabled, Val);
}
}
}
@@ -655,6 +728,15 @@ void DevirtModule::rebuildGlobal(VTableBits &B) {
B.GV->eraseFromParent();
}
+bool DevirtModule::areRemarksEnabled() {
+ const auto &FL = M.getFunctionList();
+ if (FL.empty())
+ return false;
+ const Function &Fn = FL.front();
+ auto DI = OptimizationRemark(DEBUG_TYPE, Fn, DebugLoc(), "");
+ return DI.isEnabled();
+}
+
void DevirtModule::scanTypeTestUsers(Function *TypeTestFunc,
Function *AssumeFunc) {
// Find all virtual calls via a virtual table pointer %p under an assumption
@@ -806,6 +888,7 @@ bool DevirtModule::run() {
// For each (type, offset) pair:
bool DidVirtualConstProp = false;
+ std::map<std::string, Function*> DevirtTargets;
for (auto &S : CallSlots) {
// Search each of the members of the type identifier for the virtual
// function implementation at offset S.first.ByteOffset, and add to
@@ -815,10 +898,26 @@ bool DevirtModule::run() {
S.first.ByteOffset))
continue;
- if (trySingleImplDevirt(TargetsForSlot, S.second))
- continue;
+ if (!trySingleImplDevirt(TargetsForSlot, S.second) &&
+ tryVirtualConstProp(TargetsForSlot, S.second))
+ DidVirtualConstProp = true;
- DidVirtualConstProp |= tryVirtualConstProp(TargetsForSlot, S.second);
+    // Collect functions devirtualized for at least one call site, for stats.
+ if (RemarksEnabled)
+ for (const auto &T : TargetsForSlot)
+ if (T.WasDevirt)
+ DevirtTargets[T.Fn->getName()] = T.Fn;
+ }
+
+ if (RemarksEnabled) {
+ // Generate remarks for each devirtualized function.
+ for (const auto &DT : DevirtTargets) {
+ Function *F = DT.second;
+ DISubprogram *SP = F->getSubprogram();
+ DebugLoc DL = SP ? DebugLoc::get(SP->getScopeLine(), 0, SP) : DebugLoc();
+ emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, DL,
+ Twine("devirtualized ") + F->getName());
+ }
}
// If we were able to eliminate all unsafe uses for a type checked load,
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 221a220..2d34c1c 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1035,7 +1035,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
return replaceInstUsesWith(I, V);
if (Value *V = SimplifyAddInst(LHS, RHS, I.hasNoSignedWrap(),
- I.hasNoUnsignedWrap(), DL, TLI, DT, AC))
+ I.hasNoUnsignedWrap(), DL, &TLI, &DT, &AC))
return replaceInstUsesWith(I, V);
// (A*B)+(A*C) -> A*(B+C) etc
@@ -1047,6 +1047,28 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
// X + (signbit) --> X ^ signbit
if (Val->isSignBit())
return BinaryOperator::CreateXor(LHS, RHS);
+
+ // Is this add the last step in a convoluted sext?
+ Value *X;
+ const APInt *C;
+ if (match(LHS, m_ZExt(m_Xor(m_Value(X), m_APInt(C)))) &&
+ C->isMinSignedValue() &&
+ C->sext(LHS->getType()->getScalarSizeInBits()) == *Val) {
+ // add(zext(xor i16 X, -32768), -32768) --> sext X
+ return CastInst::Create(Instruction::SExt, X, LHS->getType());
+ }
+
+ if (Val->isNegative() &&
+ match(LHS, m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C)))) &&
+ Val->sge(-C->sext(Val->getBitWidth()))) {
+ // (add (zext (add nuw X, C)), Val) -> (zext (add nuw X, C+Val))
+ return CastInst::Create(
+ Instruction::ZExt,
+ Builder->CreateNUWAdd(
+ X, Constant::getIntegerValue(X->getType(),
+ *C + Val->trunc(C->getBitWidth()))),
+ I.getType());
+ }
}
// FIXME: Use the match above instead of dyn_cast to allow these transforms
@@ -1144,7 +1166,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
return replaceInstUsesWith(I, V);
// A+B --> A|B iff A and B have no bits set in common.
- if (haveNoCommonBitsSet(LHS, RHS, DL, AC, &I, DT))
+ if (haveNoCommonBitsSet(LHS, RHS, DL, &AC, &I, &DT))
return BinaryOperator::CreateOr(LHS, RHS);
if (Constant *CRHS = dyn_cast<Constant>(RHS)) {
@@ -1216,15 +1238,16 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
if (SExtInst *LHSConv = dyn_cast<SExtInst>(LHS)) {
// (add (sext x), cst) --> (sext (add x, cst'))
if (ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS)) {
- Constant *CI =
- ConstantExpr::getTrunc(RHSC, LHSConv->getOperand(0)->getType());
- if (LHSConv->hasOneUse() &&
- ConstantExpr::getSExt(CI, I.getType()) == RHSC &&
- WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI, I)) {
- // Insert the new, smaller add.
- Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
- CI, "addconv");
- return new SExtInst(NewAdd, I.getType());
+ if (LHSConv->hasOneUse()) {
+ Constant *CI =
+ ConstantExpr::getTrunc(RHSC, LHSConv->getOperand(0)->getType());
+ if (ConstantExpr::getSExt(CI, I.getType()) == RHSC &&
+ WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI, I)) {
+ // Insert the new, smaller add.
+ Value *NewAdd =
+ Builder->CreateNSWAdd(LHSConv->getOperand(0), CI, "addconv");
+ return new SExtInst(NewAdd, I.getType());
+ }
}
}
@@ -1246,6 +1269,44 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
}
}
+ // Check for (add (zext x), y), see if we can merge this into an
+ // integer add followed by a zext.
+ if (auto *LHSConv = dyn_cast<ZExtInst>(LHS)) {
+ // (add (zext x), cst) --> (zext (add x, cst'))
+ if (ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS)) {
+ if (LHSConv->hasOneUse()) {
+ Constant *CI =
+ ConstantExpr::getTrunc(RHSC, LHSConv->getOperand(0)->getType());
+ if (ConstantExpr::getZExt(CI, I.getType()) == RHSC &&
+ computeOverflowForUnsignedAdd(LHSConv->getOperand(0), CI, &I) ==
+ OverflowResult::NeverOverflows) {
+ // Insert the new, smaller add.
+ Value *NewAdd =
+ Builder->CreateNUWAdd(LHSConv->getOperand(0), CI, "addconv");
+ return new ZExtInst(NewAdd, I.getType());
+ }
+ }
+ }
+
+ // (add (zext x), (zext y)) --> (zext (add int x, y))
+ if (auto *RHSConv = dyn_cast<ZExtInst>(RHS)) {
+      // Only do this if x/y have the same type, if at least one of them has a
+ // single use (so we don't increase the number of zexts), and if the
+ // integer add will not overflow.
+ if (LHSConv->getOperand(0)->getType() ==
+ RHSConv->getOperand(0)->getType() &&
+ (LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
+ computeOverflowForUnsignedAdd(LHSConv->getOperand(0),
+ RHSConv->getOperand(0),
+ &I) == OverflowResult::NeverOverflows) {
+ // Insert the new integer add.
+ Value *NewAdd = Builder->CreateNUWAdd(
+ LHSConv->getOperand(0), RHSConv->getOperand(0), "addconv");
+ return new ZExtInst(NewAdd, I.getType());
+ }
+ }
+ }
+
// (add (xor A, B) (and A, B)) --> (or A, B)
{
Value *A = nullptr, *B = nullptr;
@@ -1307,18 +1368,12 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
return replaceInstUsesWith(I, V);
if (Value *V =
- SimplifyFAddInst(LHS, RHS, I.getFastMathFlags(), DL, TLI, DT, AC))
+ SimplifyFAddInst(LHS, RHS, I.getFastMathFlags(), DL, &TLI, &DT, &AC))
return replaceInstUsesWith(I, V);
- if (isa<Constant>(RHS)) {
- if (isa<PHINode>(LHS))
- if (Instruction *NV = FoldOpIntoPhi(I))
- return NV;
-
- if (SelectInst *SI = dyn_cast<SelectInst>(LHS))
- if (Instruction *NV = FoldOpIntoSelect(I, SI))
- return NV;
- }
+ if (isa<Constant>(RHS))
+ if (Instruction *FoldedFAdd = foldOpWithConstantIntoOperand(I))
+ return FoldedFAdd;
// -A + B --> B - A
// -A + -B --> -(A + B)
@@ -1483,7 +1538,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
return replaceInstUsesWith(I, V);
if (Value *V = SimplifySubInst(Op0, Op1, I.hasNoSignedWrap(),
- I.hasNoUnsignedWrap(), DL, TLI, DT, AC))
+ I.hasNoUnsignedWrap(), DL, &TLI, &DT, &AC))
return replaceInstUsesWith(I, V);
// (A*B)-(A*C) -> A*(B-C) etc
@@ -1544,34 +1599,35 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
return CastInst::CreateZExtOrBitCast(X, Op1->getType());
}
- if (ConstantInt *C = dyn_cast<ConstantInt>(Op0)) {
+ const APInt *Op0C;
+ if (match(Op0, m_APInt(Op0C))) {
+ unsigned BitWidth = I.getType()->getScalarSizeInBits();
+
// -(X >>u 31) -> (X >>s 31)
// -(X >>s 31) -> (X >>u 31)
- if (C->isZero()) {
+ if (*Op0C == 0) {
Value *X;
- ConstantInt *CI;
- if (match(Op1, m_LShr(m_Value(X), m_ConstantInt(CI))) &&
- // Verify we are shifting out everything but the sign bit.
- CI->getValue() == I.getType()->getPrimitiveSizeInBits() - 1)
- return BinaryOperator::CreateAShr(X, CI);
-
- if (match(Op1, m_AShr(m_Value(X), m_ConstantInt(CI))) &&
- // Verify we are shifting out everything but the sign bit.
- CI->getValue() == I.getType()->getPrimitiveSizeInBits() - 1)
- return BinaryOperator::CreateLShr(X, CI);
+ const APInt *ShAmt;
+ if (match(Op1, m_LShr(m_Value(X), m_APInt(ShAmt))) &&
+ *ShAmt == BitWidth - 1) {
+ Value *ShAmtOp = cast<Instruction>(Op1)->getOperand(1);
+ return BinaryOperator::CreateAShr(X, ShAmtOp);
+ }
+ if (match(Op1, m_AShr(m_Value(X), m_APInt(ShAmt))) &&
+ *ShAmt == BitWidth - 1) {
+ Value *ShAmtOp = cast<Instruction>(Op1)->getOperand(1);
+ return BinaryOperator::CreateLShr(X, ShAmtOp);
+ }
}
// Turn this into a xor if LHS is 2^n-1 and the remaining bits are known
// zero.
- APInt IntVal = C->getValue();
- if ((IntVal + 1).isPowerOf2()) {
- unsigned BitWidth = I.getType()->getScalarSizeInBits();
+ if ((*Op0C + 1).isPowerOf2()) {
APInt KnownZero(BitWidth, 0);
APInt KnownOne(BitWidth, 0);
computeKnownBits(&I, KnownZero, KnownOne, 0, &I);
- if ((IntVal | KnownZero).isAllOnesValue()) {
- return BinaryOperator::CreateXor(Op1, C);
- }
+ if ((*Op0C | KnownZero).isAllOnesValue())
+ return BinaryOperator::CreateXor(Op1, Op0);
}
}
@@ -1632,6 +1688,17 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
if (Value *XNeg = dyn_castNegVal(X))
return BinaryOperator::CreateShl(XNeg, Y);
+ // Subtracting -1/0 is the same as adding 1/0:
+ // sub [nsw] Op0, sext(bool Y) -> add [nsw] Op0, zext(bool Y)
+ // 'nuw' is dropped in favor of the canonical form.
+ if (match(Op1, m_SExt(m_Value(Y))) &&
+ Y->getType()->getScalarSizeInBits() == 1) {
+ Value *Zext = Builder->CreateZExt(Y, I.getType());
+ BinaryOperator *Add = BinaryOperator::CreateAdd(Op0, Zext);
+ Add->setHasNoSignedWrap(I.hasNoSignedWrap());
+ return Add;
+ }
+
// X - A*-B -> X + A*B
// X - -A*B -> X + A*B
Value *A, *B;
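
As a side note (illustrative only, not code from the diff), the sub-of-sext(bool) fold added above rests on the scalar identity that sext of an i1 yields 0 or -1 while zext yields 0 or 1, so subtracting the sext equals adding the zext:

#include <cassert>
#include <cstdint>

int32_t subSext(int32_t x, bool b) { return x - (b ? -1 : 0); } // x - sext(b)
int32_t addZext(int32_t x, bool b) { return x + (b ? 1 : 0); }  // x + zext(b)

int main() {
  for (bool b : {false, true})
    assert(subSext(42, b) == addZext(42, b));
  return 0;
}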
@@ -1682,7 +1749,7 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
return replaceInstUsesWith(I, V);
if (Value *V =
- SimplifyFSubInst(Op0, Op1, I.getFastMathFlags(), DL, TLI, DT, AC))
+ SimplifyFSubInst(Op0, Op1, I.getFastMathFlags(), DL, &TLI, &DT, &AC))
return replaceInstUsesWith(I, V);
// fsub nsz 0, X ==> fsub nsz -0.0, X
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 1a6459b..da5384a 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -98,12 +98,11 @@ Value *InstCombiner::SimplifyBSwap(BinaryOperator &I) {
IntegerType *ITy = dyn_cast<IntegerType>(I.getType());
// Can't do vectors.
- if (I.getType()->isVectorTy()) return nullptr;
+ if (I.getType()->isVectorTy())
+ return nullptr;
// Can only do bitwise ops.
- unsigned Op = I.getOpcode();
- if (Op != Instruction::And && Op != Instruction::Or &&
- Op != Instruction::Xor)
+ if (!I.isBitwiseLogicOp())
return nullptr;
Value *OldLHS = I.getOperand(0);
@@ -132,14 +131,7 @@ Value *InstCombiner::SimplifyBSwap(BinaryOperator &I) {
Value *NewRHS = IsBswapRHS ? IntrRHS->getOperand(0) :
Builder->getInt(ConstRHS->getValue().byteSwap());
- Value *BinOp = nullptr;
- if (Op == Instruction::And)
- BinOp = Builder->CreateAnd(NewLHS, NewRHS);
- else if (Op == Instruction::Or)
- BinOp = Builder->CreateOr(NewLHS, NewRHS);
- else //if (Op == Instruction::Xor)
- BinOp = Builder->CreateXor(NewLHS, NewRHS);
-
+ Value *BinOp = Builder->CreateBinOp(I.getOpcode(), NewLHS, NewRHS);
Function *F = Intrinsic::getDeclaration(I.getModule(), Intrinsic::bswap, ITy);
return Builder->CreateCall(F, BinOp);
}
@@ -283,51 +275,31 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op,
}
/// Emit a computation of: (V >= Lo && V < Hi) if Inside is true, otherwise
-/// (V < Lo || V >= Hi). In practice, we emit the more efficient
-/// (V-Lo) \<u Hi-Lo. This method expects that Lo <= Hi. isSigned indicates
-/// whether to treat the V, Lo and HI as signed or not. IB is the location to
-/// insert new instructions.
-Value *InstCombiner::InsertRangeTest(Value *V, Constant *Lo, Constant *Hi,
+/// (V < Lo || V >= Hi). This method expects that Lo <= Hi. IsSigned indicates
+/// whether to treat V, Lo, and Hi as signed or not.
+Value *InstCombiner::insertRangeTest(Value *V, const APInt &Lo, const APInt &Hi,
bool isSigned, bool Inside) {
- assert(cast<ConstantInt>(ConstantExpr::getICmp((isSigned ?
- ICmpInst::ICMP_SLE:ICmpInst::ICMP_ULE), Lo, Hi))->getZExtValue() &&
+ assert((isSigned ? Lo.sle(Hi) : Lo.ule(Hi)) &&
"Lo is not <= Hi in range emission code!");
- if (Inside) {
- if (Lo == Hi) // Trivially false.
- return Builder->getFalse();
-
- // V >= Min && V < Hi --> V < Hi
- if (cast<ConstantInt>(Lo)->isMinValue(isSigned)) {
- ICmpInst::Predicate pred = (isSigned ?
- ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT);
- return Builder->CreateICmp(pred, V, Hi);
- }
-
- // Emit V-Lo <u Hi-Lo
- Constant *NegLo = ConstantExpr::getNeg(Lo);
- Value *Add = Builder->CreateAdd(V, NegLo, V->getName()+".off");
- Constant *UpperBound = ConstantExpr::getAdd(NegLo, Hi);
- return Builder->CreateICmpULT(Add, UpperBound);
- }
-
- if (Lo == Hi) // Trivially true.
- return Builder->getTrue();
+ Type *Ty = V->getType();
+ if (Lo == Hi)
+ return Inside ? ConstantInt::getFalse(Ty) : ConstantInt::getTrue(Ty);
- // V < Min || V >= Hi -> V > Hi-1
- Hi = SubOne(cast<ConstantInt>(Hi));
- if (cast<ConstantInt>(Lo)->isMinValue(isSigned)) {
- ICmpInst::Predicate pred = (isSigned ?
- ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT);
- return Builder->CreateICmp(pred, V, Hi);
+ // V >= Min && V < Hi --> V < Hi
+ // V < Min || V >= Hi --> V >= Hi
+ ICmpInst::Predicate Pred = Inside ? ICmpInst::ICMP_ULT : ICmpInst::ICMP_UGE;
+ if (isSigned ? Lo.isMinSignedValue() : Lo.isMinValue()) {
+ Pred = isSigned ? ICmpInst::getSignedPredicate(Pred) : Pred;
+ return Builder->CreateICmp(Pred, V, ConstantInt::get(Ty, Hi));
}
- // Emit V-Lo >u Hi-1-Lo
- // Note that Hi has already had one subtracted from it, above.
- ConstantInt *NegLo = cast<ConstantInt>(ConstantExpr::getNeg(Lo));
- Value *Add = Builder->CreateAdd(V, NegLo, V->getName()+".off");
- Constant *LowerBound = ConstantExpr::getAdd(NegLo, Hi);
- return Builder->CreateICmpUGT(Add, LowerBound);
+ // V >= Lo && V < Hi --> V - Lo u< Hi - Lo
+ // V < Lo || V >= Hi --> V - Lo u>= Hi - Lo
+ Value *VMinusLo =
+ Builder->CreateSub(V, ConstantInt::get(Ty, Lo), V->getName() + ".off");
+ Constant *HiMinusLo = ConstantInt::get(Ty, Hi - Lo);
+ return Builder->CreateICmp(Pred, VMinusLo, HiMinusLo);
}
/// Returns true iff Val consists of one contiguous run of 1s with any number
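
For readers unfamiliar with the range-test trick used by insertRangeTest, here is a small standalone model (assuming unsigned 32-bit values and Lo <= Hi, mirroring the assert above); it is only an illustration, not code from the patch.

#include <cassert>
#include <cstdint>

bool inRangeNaive(uint32_t V, uint32_t Lo, uint32_t Hi) {
  return V >= Lo && V < Hi;              // the "Inside" form
}
bool inRangeFolded(uint32_t V, uint32_t Lo, uint32_t Hi) {
  return (V - Lo) < (Hi - Lo);           // single unsigned compare
}

int main() {
  const uint32_t Lo = 10, Hi = 20;       // requires Lo <= Hi
  for (uint32_t V = 0; V < 40; ++V)
    assert(inRangeNaive(V, Lo, Hi) == inRangeFolded(V, Lo, Hi));
  return 0;
}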
@@ -524,53 +496,6 @@ static unsigned conjugateICmpMask(unsigned Mask) {
return NewMask;
}
-/// Decompose an icmp into the form ((X & Y) pred Z) if possible.
-/// The returned predicate is either == or !=. Returns false if
-/// decomposition fails.
-static bool decomposeBitTestICmp(const ICmpInst *I, ICmpInst::Predicate &Pred,
- Value *&X, Value *&Y, Value *&Z) {
- ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1));
- if (!C)
- return false;
-
- switch (I->getPredicate()) {
- default:
- return false;
- case ICmpInst::ICMP_SLT:
- // X < 0 is equivalent to (X & SignBit) != 0.
- if (!C->isZero())
- return false;
- Y = ConstantInt::get(I->getContext(), APInt::getSignBit(C->getBitWidth()));
- Pred = ICmpInst::ICMP_NE;
- break;
- case ICmpInst::ICMP_SGT:
- // X > -1 is equivalent to (X & SignBit) == 0.
- if (!C->isAllOnesValue())
- return false;
- Y = ConstantInt::get(I->getContext(), APInt::getSignBit(C->getBitWidth()));
- Pred = ICmpInst::ICMP_EQ;
- break;
- case ICmpInst::ICMP_ULT:
- // X <u 2^n is equivalent to (X & ~(2^n-1)) == 0.
- if (!C->getValue().isPowerOf2())
- return false;
- Y = ConstantInt::get(I->getContext(), -C->getValue());
- Pred = ICmpInst::ICMP_EQ;
- break;
- case ICmpInst::ICMP_UGT:
- // X >u 2^n-1 is equivalent to (X & ~(2^n-1)) != 0.
- if (!(C->getValue() + 1).isPowerOf2())
- return false;
- Y = ConstantInt::get(I->getContext(), ~C->getValue());
- Pred = ICmpInst::ICMP_NE;
- break;
- }
-
- X = I->getOperand(0);
- Z = ConstantInt::getNullValue(C->getType());
- return true;
-}
-
/// Handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E)
/// Return the set of pattern classes (from MaskedICmpType)
/// that both LHS and RHS satisfy.
@@ -1001,7 +926,8 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
if (LHSCst == SubOne(RHSCst)) // (X != 13 & X u< 14) -> X < 13
return Builder->CreateICmpULT(Val, LHSCst);
if (LHSCst->isNullValue()) // (X != 0 & X u< 14) -> X-1 u< 13
- return InsertRangeTest(Val, AddOne(LHSCst), RHSCst, false, true);
+ return insertRangeTest(Val, LHSCst->getValue() + 1, RHSCst->getValue(),
+ false, true);
break; // (X != 13 & X u< 15) -> no change
case ICmpInst::ICMP_SLT:
if (LHSCst == SubOne(RHSCst)) // (X != 13 & X s< 14) -> X < 13
@@ -1065,7 +991,8 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
return Builder->CreateICmp(LHSCC, Val, RHSCst);
break; // (X u> 13 & X != 15) -> no change
case ICmpInst::ICMP_ULT: // (X u> 13 & X u< 15) -> (X-14) <u 1
- return InsertRangeTest(Val, AddOne(LHSCst), RHSCst, false, true);
+ return insertRangeTest(Val, LHSCst->getValue() + 1, RHSCst->getValue(),
+ false, true);
case ICmpInst::ICMP_SLT: // (X u> 13 & X s< 15) -> no change
break;
}
@@ -1083,7 +1010,8 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
return Builder->CreateICmp(LHSCC, Val, RHSCst);
break; // (X s> 13 & X != 15) -> no change
case ICmpInst::ICMP_SLT: // (X s> 13 & X s< 15) -> (X-14) s< 1
- return InsertRangeTest(Val, AddOne(LHSCst), RHSCst, true, true);
+ return insertRangeTest(Val, LHSCst->getValue() + 1, RHSCst->getValue(),
+ true, true);
case ICmpInst::ICMP_ULT: // (X s> 13 & X u< 15) -> no change
break;
}
@@ -1170,34 +1098,73 @@ static Instruction *matchDeMorgansLaws(BinaryOperator &I,
return BinaryOperator::CreateNot(LogicOp);
}
- // De Morgan's Law in disguise:
- // (zext(bool A) ^ 1) & (zext(bool B) ^ 1) -> zext(~(A | B))
- // (zext(bool A) ^ 1) | (zext(bool B) ^ 1) -> zext(~(A & B))
- Value *A = nullptr;
- Value *B = nullptr;
- ConstantInt *C1 = nullptr;
- if (match(Op0, m_OneUse(m_Xor(m_ZExt(m_Value(A)), m_ConstantInt(C1)))) &&
- match(Op1, m_OneUse(m_Xor(m_ZExt(m_Value(B)), m_Specific(C1))))) {
- // TODO: This check could be loosened to handle different type sizes.
- // Alternatively, we could fix the definition of m_Not to recognize a not
- // operation hidden by a zext?
- if (A->getType()->isIntegerTy(1) && B->getType()->isIntegerTy(1) &&
- C1->isOne()) {
- Value *LogicOp = Builder->CreateBinOp(Opcode, A, B,
- I.getName() + ".demorgan");
- Value *Not = Builder->CreateNot(LogicOp);
- return CastInst::CreateZExtOrBitCast(Not, I.getType());
+ return nullptr;
+}
+
+bool InstCombiner::shouldOptimizeCast(CastInst *CI) {
+ Value *CastSrc = CI->getOperand(0);
+
+ // Noop casts and casts of constants should be eliminated trivially.
+ if (CI->getSrcTy() == CI->getDestTy() || isa<Constant>(CastSrc))
+ return false;
+
+ // If this cast is paired with another cast that can be eliminated, we prefer
+ // to have it eliminated.
+ if (const auto *PrecedingCI = dyn_cast<CastInst>(CastSrc))
+ if (isEliminableCastPair(PrecedingCI, CI))
+ return false;
+
+ // If this is a vector sext from a compare, then we don't want to break the
+ // idiom where each element of the extended vector is either zero or all ones.
+ if (CI->getOpcode() == Instruction::SExt &&
+ isa<CmpInst>(CastSrc) && CI->getDestTy()->isVectorTy())
+ return false;
+
+ return true;
+}
+
+/// Fold {and,or,xor} (cast X), C.
+static Instruction *foldLogicCastConstant(BinaryOperator &Logic, CastInst *Cast,
+ InstCombiner::BuilderTy *Builder) {
+ Constant *C;
+ if (!match(Logic.getOperand(1), m_Constant(C)))
+ return nullptr;
+
+ auto LogicOpc = Logic.getOpcode();
+ Type *DestTy = Logic.getType();
+ Type *SrcTy = Cast->getSrcTy();
+
+ // If the first operand is bitcast, move the logic operation ahead of the
+ // bitcast (do the logic operation in the original type). This can eliminate
+ // bitcasts and allow combines that would otherwise be impeded by the bitcast.
+ Value *X;
+ if (match(Cast, m_BitCast(m_Value(X)))) {
+ Value *NewConstant = ConstantExpr::getBitCast(C, SrcTy);
+ Value *NewOp = Builder->CreateBinOp(LogicOpc, X, NewConstant);
+ return CastInst::CreateBitOrPointerCast(NewOp, DestTy);
+ }
+
+ // Similarly, move the logic operation ahead of a zext if the constant is
+ // unchanged in the smaller source type. Performing the logic in a smaller
+ // type may provide more information to later folds, and the smaller logic
+ // instruction may be cheaper (particularly in the case of vectors).
+ if (match(Cast, m_OneUse(m_ZExt(m_Value(X))))) {
+ Constant *TruncC = ConstantExpr::getTrunc(C, SrcTy);
+ Constant *ZextTruncC = ConstantExpr::getZExt(TruncC, DestTy);
+ if (ZextTruncC == C) {
+ // LogicOpc (zext X), C --> zext (LogicOpc X, C)
+ Value *NewOp = Builder->CreateBinOp(LogicOpc, X, TruncC);
+ return new ZExtInst(NewOp, DestTy);
}
}
return nullptr;
}
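
A quick scalar sanity check (illustrative only) of the zext case handled by foldLogicCastConstant above: the logic operation can be done in the narrow source type whenever truncating the constant and zero-extending it back reproduces the original constant.

#include <cassert>
#include <cstdint>

// (zext i8 x to i32) & C  ==  zext ((x & trunc C) to i32), provided
// zext(trunc(C)) == C, i.e. C fits in the narrow type.
uint32_t wideForm(uint8_t x, uint32_t C) { return uint32_t(x) & C; }
uint32_t narrowForm(uint8_t x, uint32_t C) {
  uint8_t TruncC = uint8_t(C);
  assert(uint32_t(TruncC) == C && "constant must survive the round-trip");
  return uint32_t(uint8_t(x & TruncC));
}

int main() {
  for (uint32_t x = 0; x <= 0xFF; ++x)
    assert(wideForm(uint8_t(x), 0x5Au) == narrowForm(uint8_t(x), 0x5Au));
  return 0;
}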
+/// Fold {and,or,xor} (cast X), Y.
Instruction *InstCombiner::foldCastedBitwiseLogic(BinaryOperator &I) {
auto LogicOpc = I.getOpcode();
- assert((LogicOpc == Instruction::And || LogicOpc == Instruction::Or ||
- LogicOpc == Instruction::Xor) &&
- "Unexpected opcode for bitwise logic folding");
+ assert(I.isBitwiseLogicOp() && "Unexpected opcode for bitwise logic folding");
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
CastInst *Cast0 = dyn_cast<CastInst>(Op0);
@@ -1211,18 +1178,8 @@ Instruction *InstCombiner::foldCastedBitwiseLogic(BinaryOperator &I) {
if (!SrcTy->isIntOrIntVectorTy())
return nullptr;
- // If one operand is a bitcast and the other is a constant, move the logic
- // operation ahead of the bitcast. That is, do the logic operation in the
- // original type. This can eliminate useless bitcasts and allow normal
- // combines that would otherwise be impeded by the bitcast. Canonicalization
- // ensures that if there is a constant operand, it will be the second operand.
- Value *BC = nullptr;
- Constant *C = nullptr;
- if ((match(Op0, m_BitCast(m_Value(BC))) && match(Op1, m_Constant(C)))) {
- Value *NewConstant = ConstantExpr::getBitCast(C, SrcTy);
- Value *NewOp = Builder->CreateBinOp(LogicOpc, BC, NewConstant, I.getName());
- return CastInst::CreateBitOrPointerCast(NewOp, DestTy);
- }
+ if (Instruction *Ret = foldLogicCastConstant(I, Cast0, Builder))
+ return Ret;
CastInst *Cast1 = dyn_cast<CastInst>(Op1);
if (!Cast1)
@@ -1237,12 +1194,8 @@ Instruction *InstCombiner::foldCastedBitwiseLogic(BinaryOperator &I) {
Value *Cast0Src = Cast0->getOperand(0);
Value *Cast1Src = Cast1->getOperand(0);
- // fold (logic (cast A), (cast B)) -> (cast (logic A, B))
-
- // Only do this if the casts both really cause code to be generated.
- if ((!isa<ICmpInst>(Cast0Src) || !isa<ICmpInst>(Cast1Src)) &&
- ShouldOptimizeCast(CastOpcode, Cast0Src, DestTy) &&
- ShouldOptimizeCast(CastOpcode, Cast1Src, DestTy)) {
+ // fold logic(cast(A), cast(B)) -> cast(logic(A, B))
+ if (shouldOptimizeCast(Cast0) && shouldOptimizeCast(Cast1)) {
Value *NewOp = Builder->CreateBinOp(LogicOpc, Cast0Src, Cast1Src,
I.getName());
return CastInst::Create(CastOpcode, NewOp, DestTy);
@@ -1301,10 +1254,13 @@ static Instruction *foldBoolSextMaskToSelect(BinaryOperator &I) {
Value *Zero = Constant::getNullValue(Op0->getType());
return SelectInst::Create(X, Zero, Op1);
}
-
+
return nullptr;
}
+// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
+// here. We should standardize that construct where it is needed or choose some
+// other way to ensure that commutated variants of patterns are not missed.
Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
bool Changed = SimplifyAssociativeOrCommutative(I);
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
@@ -1312,7 +1268,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
if (Value *V = SimplifyVectorOp(I))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifyAndInst(Op0, Op1, DL, TLI, DT, AC))
+ if (Value *V = SimplifyAndInst(Op0, Op1, DL, &TLI, &DT, &AC))
return replaceInstUsesWith(I, V);
// (A|B)&(A|C) -> A|(B&C) etc
@@ -1426,13 +1382,8 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
}
}
- // Try to fold constant and into select arguments.
- if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
- if (Instruction *R = FoldOpIntoSelect(I, SI))
- return R;
- if (isa<PHINode>(Op0))
- if (Instruction *NV = FoldOpIntoPhi(I))
- return NV;
+ if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
+ return FoldedLogic;
}
if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder))
@@ -1503,8 +1454,9 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
return BinaryOperator::CreateAnd(A, B);
// ((~A) ^ B) & (A | B) -> (A & B)
+ // ((~A) ^ B) & (B | A) -> (A & B)
if (match(Op0, m_Xor(m_Not(m_Value(A)), m_Value(B))) &&
- match(Op1, m_Or(m_Specific(A), m_Specific(B))))
+ match(Op1, m_c_Or(m_Specific(A), m_Specific(B))))
return BinaryOperator::CreateAnd(A, B);
}
@@ -1697,17 +1649,17 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
Value *Mask = nullptr;
Value *Masked = nullptr;
if (LAnd->getOperand(0) == RAnd->getOperand(0) &&
- isKnownToBeAPowerOfTwo(LAnd->getOperand(1), DL, false, 0, AC, CxtI,
- DT) &&
- isKnownToBeAPowerOfTwo(RAnd->getOperand(1), DL, false, 0, AC, CxtI,
- DT)) {
+ isKnownToBeAPowerOfTwo(LAnd->getOperand(1), DL, false, 0, &AC, CxtI,
+ &DT) &&
+ isKnownToBeAPowerOfTwo(RAnd->getOperand(1), DL, false, 0, &AC, CxtI,
+ &DT)) {
Mask = Builder->CreateOr(LAnd->getOperand(1), RAnd->getOperand(1));
Masked = Builder->CreateAnd(LAnd->getOperand(0), Mask);
} else if (LAnd->getOperand(1) == RAnd->getOperand(1) &&
- isKnownToBeAPowerOfTwo(LAnd->getOperand(0), DL, false, 0, AC,
- CxtI, DT) &&
- isKnownToBeAPowerOfTwo(RAnd->getOperand(0), DL, false, 0, AC,
- CxtI, DT)) {
+ isKnownToBeAPowerOfTwo(LAnd->getOperand(0), DL, false, 0, &AC,
+ CxtI, &DT) &&
+ isKnownToBeAPowerOfTwo(RAnd->getOperand(0), DL, false, 0, &AC,
+ CxtI, &DT)) {
Mask = Builder->CreateOr(LAnd->getOperand(0), RAnd->getOperand(0));
Masked = Builder->CreateAnd(LAnd->getOperand(1), Mask);
}
@@ -1825,7 +1777,7 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
// E.g. (icmp sgt x, n) | (icmp slt x, 0) --> icmp ugt x, n
if (Value *V = simplifyRangeCheck(RHS, LHS, /*Inverted=*/true))
return V;
-
+
// This only handles icmp of constants: (icmp1 A, C1) | (icmp2 B, C2).
if (!LHSCst || !RHSCst) return nullptr;
@@ -1943,7 +1895,8 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
// this can cause overflow.
if (RHSCst->isMaxValue(false))
return LHS;
- return InsertRangeTest(Val, LHSCst, AddOne(RHSCst), false, false);
+ return insertRangeTest(Val, LHSCst->getValue(), RHSCst->getValue() + 1,
+ false, false);
case ICmpInst::ICMP_SGT: // (X u< 13 | X s> 15) -> no change
break;
case ICmpInst::ICMP_NE: // (X u< 13 | X != 15) -> X != 15
@@ -1963,7 +1916,8 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
// this can cause overflow.
if (RHSCst->isMaxValue(true))
return LHS;
- return InsertRangeTest(Val, LHSCst, AddOne(RHSCst), true, false);
+ return insertRangeTest(Val, LHSCst->getValue(), RHSCst->getValue() + 1,
+ true, false);
case ICmpInst::ICMP_UGT: // (X s< 13 | X u> 15) -> no change
break;
case ICmpInst::ICMP_NE: // (X s< 13 | X != 15) -> X != 15
@@ -2119,6 +2073,9 @@ Instruction *InstCombiner::FoldXorWithConstants(BinaryOperator &I, Value *Op,
return nullptr;
}
+// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
+// here. We should standardize that construct where it is needed or choose some
+// other way to ensure that commutated variants of patterns are not missed.
Instruction *InstCombiner::visitOr(BinaryOperator &I) {
bool Changed = SimplifyAssociativeOrCommutative(I);
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
@@ -2126,7 +2083,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
if (Value *V = SimplifyVectorOp(I))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifyOrInst(Op0, Op1, DL, TLI, DT, AC))
+ if (Value *V = SimplifyOrInst(Op0, Op1, DL, &TLI, &DT, &AC))
return replaceInstUsesWith(I, V);
// (A&B)|(A&C) -> A&(B|C) etc
@@ -2163,14 +2120,8 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
Builder->getInt(C1->getValue() & ~RHS->getValue()));
}
- // Try to fold constant and into select arguments.
- if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
- if (Instruction *R = FoldOpIntoSelect(I, SI))
- return R;
-
- if (isa<PHINode>(Op0))
- if (Instruction *NV = FoldOpIntoPhi(I))
- return NV;
+ if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
+ return FoldedLogic;
}
// Given an OR instruction, check to see if this is a bswap.
@@ -2208,14 +2159,17 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
match(Op1, m_Not(m_Specific(A))))
return BinaryOperator::CreateOr(Builder->CreateNot(A), B);
- // (A & (~B)) | (A ^ B) -> (A ^ B)
- if (match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) &&
+ // (A & ~B) | (A ^ B) -> (A ^ B)
+ // (~B & A) | (A ^ B) -> (A ^ B)
+ if (match(Op0, m_c_And(m_Value(A), m_Not(m_Value(B)))) &&
match(Op1, m_Xor(m_Specific(A), m_Specific(B))))
return BinaryOperator::CreateXor(A, B);
- // (A ^ B) | ( A & (~B)) -> (A ^ B)
- if (match(Op0, m_Xor(m_Value(A), m_Value(B))) &&
- match(Op1, m_And(m_Specific(A), m_Not(m_Specific(B)))))
+ // Commute the 'or' operands.
+ // (A ^ B) | (A & ~B) -> (A ^ B)
+ // (A ^ B) | (~B & A) -> (A ^ B)
+ if (match(Op1, m_c_And(m_Value(A), m_Not(m_Value(B)))) &&
+ match(Op0, m_Xor(m_Specific(A), m_Specific(B))))
return BinaryOperator::CreateXor(A, B);
// (A & C)|(B & D)
@@ -2385,14 +2339,15 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
return BinaryOperator::CreateOr(Not, Op0);
}
- // (A & B) | ((~A) ^ B) -> (~A ^ B)
- if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
- match(Op1, m_Xor(m_Not(m_Specific(A)), m_Specific(B))))
- return BinaryOperator::CreateXor(Builder->CreateNot(A), B);
-
- // ((~A) ^ B) | (A & B) -> (~A ^ B)
- if (match(Op0, m_Xor(m_Not(m_Value(A)), m_Value(B))) &&
- match(Op1, m_And(m_Specific(A), m_Specific(B))))
+ // (A & B) | (~A ^ B) -> (~A ^ B)
+ // (A & B) | (B ^ ~A) -> (~A ^ B)
+ // (B & A) | (~A ^ B) -> (~A ^ B)
+ // (B & A) | (B ^ ~A) -> (~A ^ B)
+ // The match order is important: match the xor first because the 'not'
+ // operation defines 'A'. We do not need to match the xor as Op0 because the
+ // xor was canonicalized to Op1 above.
+ if (match(Op1, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) &&
+ match(Op0, m_c_And(m_Specific(A), m_Specific(B))))
return BinaryOperator::CreateXor(Builder->CreateNot(A), B);
if (SwappedForXor)
@@ -2472,6 +2427,9 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
return Changed ? &I : nullptr;
}
+// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
+// here. We should standardize that construct where it is needed or choose some
+// other way to ensure that commutated variants of patterns are not missed.
Instruction *InstCombiner::visitXor(BinaryOperator &I) {
bool Changed = SimplifyAssociativeOrCommutative(I);
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
@@ -2479,7 +2437,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
if (Value *V = SimplifyVectorOp(I))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifyXorInst(Op0, Op1, DL, TLI, DT, AC))
+ if (Value *V = SimplifyXorInst(Op0, Op1, DL, &TLI, &DT, &AC))
return replaceInstUsesWith(I, V);
// (A&B)^(A&C) -> A&(B^C) etc
@@ -2625,13 +2583,8 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
}
}
- // Try to fold constant and into select arguments.
- if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
- if (Instruction *R = FoldOpIntoSelect(I, SI))
- return R;
- if (isa<PHINode>(Op0))
- if (Instruction *NV = FoldOpIntoPhi(I))
- return NV;
+ if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
+ return FoldedLogic;
}
BinaryOperator *Op1I = dyn_cast<BinaryOperator>(Op1);
@@ -2694,20 +2647,22 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
return BinaryOperator::CreateXor(A, B);
}
// (A | ~B) ^ (~A | B) -> A ^ B
- if (match(Op0I, m_Or(m_Value(A), m_Not(m_Value(B)))) &&
- match(Op1I, m_Or(m_Not(m_Specific(A)), m_Specific(B)))) {
+ // (~B | A) ^ (~A | B) -> A ^ B
+ if (match(Op0I, m_c_Or(m_Value(A), m_Not(m_Value(B)))) &&
+ match(Op1I, m_Or(m_Not(m_Specific(A)), m_Specific(B))))
return BinaryOperator::CreateXor(A, B);
- }
+
// (~A | B) ^ (A | ~B) -> A ^ B
if (match(Op0I, m_Or(m_Not(m_Value(A)), m_Value(B))) &&
match(Op1I, m_Or(m_Specific(A), m_Not(m_Specific(B))))) {
return BinaryOperator::CreateXor(A, B);
}
// (A & ~B) ^ (~A & B) -> A ^ B
- if (match(Op0I, m_And(m_Value(A), m_Not(m_Value(B)))) &&
- match(Op1I, m_And(m_Not(m_Specific(A)), m_Specific(B)))) {
+ // (~B & A) ^ (~A & B) -> A ^ B
+ if (match(Op0I, m_c_And(m_Value(A), m_Not(m_Value(B)))) &&
+ match(Op1I, m_And(m_Not(m_Specific(A)), m_Specific(B))))
return BinaryOperator::CreateXor(A, B);
- }
+
// (~A & B) ^ (A & ~B) -> A ^ B
if (match(Op0I, m_And(m_Not(m_Value(A)), m_Value(B))) &&
match(Op1I, m_And(m_Specific(A), m_Not(m_Specific(B))))) {
@@ -2743,9 +2698,10 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
return BinaryOperator::CreateOr(A, B);
}
- Value *A = nullptr, *B = nullptr;
- // (A & ~B) ^ (~A) -> ~(A & B)
- if (match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) &&
+ // (A & ~B) ^ ~A -> ~(A & B)
+ // (~B & A) ^ ~A -> ~(A & B)
+ Value *A, *B;
+ if (match(Op0, m_c_And(m_Value(A), m_Not(m_Value(B)))) &&
match(Op1, m_Not(m_Specific(A))))
return BinaryOperator::CreateNot(Builder->CreateAnd(A, B));
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 8acff91..2ef82ba 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -12,17 +12,47 @@
//===----------------------------------------------------------------------===//
#include "InstCombineInternal.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
-#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Statepoint.h"
-#include "llvm/Transforms/Utils/BuildLibCalls.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <vector>
+
using namespace llvm;
using namespace PatternMatch;
@@ -79,8 +109,8 @@ static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) {
}
Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
- unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, MI, AC, DT);
- unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, MI, AC, DT);
+ unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, MI, &AC, &DT);
+ unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, MI, &AC, &DT);
unsigned MinAlign = std::min(DstAlign, SrcAlign);
unsigned CopyAlign = MI->getAlignment();
@@ -162,10 +192,17 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
L->setAlignment(SrcAlign);
if (CopyMD)
L->setMetadata(LLVMContext::MD_tbaa, CopyMD);
+ MDNode *LoopMemParallelMD =
+ MI->getMetadata(LLVMContext::MD_mem_parallel_loop_access);
+ if (LoopMemParallelMD)
+ L->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
+
StoreInst *S = Builder->CreateStore(L, Dest, MI->isVolatile());
S->setAlignment(DstAlign);
if (CopyMD)
S->setMetadata(LLVMContext::MD_tbaa, CopyMD);
+ if (LoopMemParallelMD)
+ S->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
// Set the size of the copy to 0, it will be deleted on the next iteration.
MI->setArgOperand(2, Constant::getNullValue(MemOpLength->getType()));
@@ -173,7 +210,7 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
}
Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
- unsigned Alignment = getKnownAlignment(MI->getDest(), DL, MI, AC, DT);
+ unsigned Alignment = getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT);
if (MI->getAlignment() < Alignment) {
MI->setAlignment(ConstantInt::get(MI->getAlignmentType(),
Alignment, false));
@@ -221,8 +258,7 @@ static Value *simplifyX86immShift(const IntrinsicInst &II,
bool ShiftLeft = false;
switch (II.getIntrinsicID()) {
- default:
- return nullptr;
+ default: llvm_unreachable("Unexpected intrinsic!");
case Intrinsic::x86_sse2_psra_d:
case Intrinsic::x86_sse2_psra_w:
case Intrinsic::x86_sse2_psrai_d:
@@ -231,6 +267,16 @@ static Value *simplifyX86immShift(const IntrinsicInst &II,
case Intrinsic::x86_avx2_psra_w:
case Intrinsic::x86_avx2_psrai_d:
case Intrinsic::x86_avx2_psrai_w:
+ case Intrinsic::x86_avx512_psra_q_128:
+ case Intrinsic::x86_avx512_psrai_q_128:
+ case Intrinsic::x86_avx512_psra_q_256:
+ case Intrinsic::x86_avx512_psrai_q_256:
+ case Intrinsic::x86_avx512_psra_d_512:
+ case Intrinsic::x86_avx512_psra_q_512:
+ case Intrinsic::x86_avx512_psra_w_512:
+ case Intrinsic::x86_avx512_psrai_d_512:
+ case Intrinsic::x86_avx512_psrai_q_512:
+ case Intrinsic::x86_avx512_psrai_w_512:
LogicalShift = false; ShiftLeft = false;
break;
case Intrinsic::x86_sse2_psrl_d:
@@ -245,6 +291,12 @@ static Value *simplifyX86immShift(const IntrinsicInst &II,
case Intrinsic::x86_avx2_psrli_d:
case Intrinsic::x86_avx2_psrli_q:
case Intrinsic::x86_avx2_psrli_w:
+ case Intrinsic::x86_avx512_psrl_d_512:
+ case Intrinsic::x86_avx512_psrl_q_512:
+ case Intrinsic::x86_avx512_psrl_w_512:
+ case Intrinsic::x86_avx512_psrli_d_512:
+ case Intrinsic::x86_avx512_psrli_q_512:
+ case Intrinsic::x86_avx512_psrli_w_512:
LogicalShift = true; ShiftLeft = false;
break;
case Intrinsic::x86_sse2_psll_d:
@@ -259,6 +311,12 @@ static Value *simplifyX86immShift(const IntrinsicInst &II,
case Intrinsic::x86_avx2_pslli_d:
case Intrinsic::x86_avx2_pslli_q:
case Intrinsic::x86_avx2_pslli_w:
+ case Intrinsic::x86_avx512_psll_d_512:
+ case Intrinsic::x86_avx512_psll_q_512:
+ case Intrinsic::x86_avx512_psll_w_512:
+ case Intrinsic::x86_avx512_pslli_d_512:
+ case Intrinsic::x86_avx512_pslli_q_512:
+ case Intrinsic::x86_avx512_pslli_w_512:
LogicalShift = true; ShiftLeft = true;
break;
}
@@ -334,10 +392,16 @@ static Value *simplifyX86varShift(const IntrinsicInst &II,
bool ShiftLeft = false;
switch (II.getIntrinsicID()) {
- default:
- return nullptr;
+ default: llvm_unreachable("Unexpected intrinsic!");
case Intrinsic::x86_avx2_psrav_d:
case Intrinsic::x86_avx2_psrav_d_256:
+ case Intrinsic::x86_avx512_psrav_q_128:
+ case Intrinsic::x86_avx512_psrav_q_256:
+ case Intrinsic::x86_avx512_psrav_d_512:
+ case Intrinsic::x86_avx512_psrav_q_512:
+ case Intrinsic::x86_avx512_psrav_w_128:
+ case Intrinsic::x86_avx512_psrav_w_256:
+ case Intrinsic::x86_avx512_psrav_w_512:
LogicalShift = false;
ShiftLeft = false;
break;
@@ -345,6 +409,11 @@ static Value *simplifyX86varShift(const IntrinsicInst &II,
case Intrinsic::x86_avx2_psrlv_d_256:
case Intrinsic::x86_avx2_psrlv_q:
case Intrinsic::x86_avx2_psrlv_q_256:
+ case Intrinsic::x86_avx512_psrlv_d_512:
+ case Intrinsic::x86_avx512_psrlv_q_512:
+ case Intrinsic::x86_avx512_psrlv_w_128:
+ case Intrinsic::x86_avx512_psrlv_w_256:
+ case Intrinsic::x86_avx512_psrlv_w_512:
LogicalShift = true;
ShiftLeft = false;
break;
@@ -352,6 +421,11 @@ static Value *simplifyX86varShift(const IntrinsicInst &II,
case Intrinsic::x86_avx2_psllv_d_256:
case Intrinsic::x86_avx2_psllv_q:
case Intrinsic::x86_avx2_psllv_q_256:
+ case Intrinsic::x86_avx512_psllv_d_512:
+ case Intrinsic::x86_avx512_psllv_q_512:
+ case Intrinsic::x86_avx512_psllv_w_128:
+ case Intrinsic::x86_avx512_psllv_w_256:
+ case Intrinsic::x86_avx512_psllv_w_512:
LogicalShift = true;
ShiftLeft = true;
break;
@@ -400,7 +474,7 @@ static Value *simplifyX86varShift(const IntrinsicInst &II,
// If all elements out of range or UNDEF, return vector of zeros/undefs.
// ArithmeticShift should only hit this if they are all UNDEF.
auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
- if (llvm::all_of(ShiftAmts, OutOfRange)) {
+ if (all_of(ShiftAmts, OutOfRange)) {
SmallVector<Constant *, 8> ConstantVec;
for (int Idx : ShiftAmts) {
if (Idx < 0) {
@@ -547,7 +621,7 @@ static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
// See if we're dealing with constant values.
Constant *C0 = dyn_cast<Constant>(Op0);
ConstantInt *CI0 =
- C0 ? dyn_cast<ConstantInt>(C0->getAggregateElement((unsigned)0))
+ C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
: nullptr;
// Attempt to constant fold.
@@ -630,7 +704,6 @@ static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
APInt APLength, APInt APIndex,
InstCombiner::BuilderTy &Builder) {
-
// From AMD documentation: "The bit index and field length are each six bits
// in length other bits of the field are ignored."
APIndex = APIndex.zextOrTrunc(6);
@@ -686,10 +759,10 @@ static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
Constant *C0 = dyn_cast<Constant>(Op0);
Constant *C1 = dyn_cast<Constant>(Op1);
ConstantInt *CI00 =
- C0 ? dyn_cast<ConstantInt>(C0->getAggregateElement((unsigned)0))
+ C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
: nullptr;
ConstantInt *CI10 =
- C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)0))
+ C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
: nullptr;
// Constant Fold - insert bottom Length bits starting at the Index'th bit.
@@ -732,11 +805,11 @@ static Value *simplifyX86pshufb(const IntrinsicInst &II,
auto *VecTy = cast<VectorType>(II.getType());
auto *MaskEltTy = Type::getInt32Ty(II.getContext());
unsigned NumElts = VecTy->getNumElements();
- assert((NumElts == 16 || NumElts == 32) &&
+ assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
"Unexpected number of elements in shuffle mask!");
// Construct a shuffle mask from constant integers or UNDEFs.
- Constant *Indexes[32] = {NULL};
+ Constant *Indexes[64] = {nullptr};
// Each byte in the shuffle control mask forms an index to permute the
// corresponding byte in the destination operand.
@@ -776,12 +849,15 @@ static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
if (!V)
return nullptr;
+ auto *VecTy = cast<VectorType>(II.getType());
auto *MaskEltTy = Type::getInt32Ty(II.getContext());
- unsigned NumElts = cast<VectorType>(V->getType())->getNumElements();
- assert(NumElts == 8 || NumElts == 4 || NumElts == 2);
+ unsigned NumElts = VecTy->getVectorNumElements();
+ bool IsPD = VecTy->getScalarType()->isDoubleTy();
+ unsigned NumLaneElts = IsPD ? 2 : 4;
+ assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
// Construct a shuffle mask from constant integers or UNDEFs.
- Constant *Indexes[8] = {NULL};
+ Constant *Indexes[16] = {nullptr};
// The intrinsics only read one or two bits, clear the rest.
for (unsigned I = 0; I < NumElts; ++I) {
@@ -799,18 +875,13 @@ static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
    // The PD variants use bit 1 to select the per-lane element index, so
// shift down to convert to generic shuffle mask index.
- if (II.getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_pd ||
- II.getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_pd_256)
+ if (IsPD)
Index = Index.lshr(1);
// The _256 variants are a bit trickier since the mask bits always index
// into the corresponding 128 half. In order to convert to a generic
// shuffle, we have to make that explicit.
- if ((II.getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_ps_256 ||
- II.getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_pd_256) &&
- ((NumElts / 2) <= I)) {
- Index += APInt(32, NumElts / 2);
- }
+ Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
Indexes[I] = ConstantInt::get(MaskEltTy, Index);
}
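
A hedged sketch (not from the patch) of the generic-shuffle index computation introduced above: each mask element selects within its own 128-bit lane, so the lane base (I / NumLaneElts) * NumLaneElts is added to turn it into a whole-vector index. The concrete values below are illustrative.

#include <cassert>

// For a 256-bit float vector: 8 elements, 4 per 128-bit lane.
unsigned genericIndex(unsigned I, unsigned MaskBits, unsigned NumLaneElts) {
  unsigned Index = MaskBits & (NumLaneElts - 1);   // low bits pick within the lane
  return (I / NumLaneElts) * NumLaneElts + Index;  // add the lane base
}

int main() {
  // Element 5 (second lane) with a mask selecting lane element 2 maps to element 6.
  assert(genericIndex(5, 2, 4) == 6);
  // Element 1 (first lane) with the same mask stays in the first lane.
  assert(genericIndex(1, 2, 4) == 2);
  return 0;
}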
@@ -831,10 +902,11 @@ static Value *simplifyX86vpermv(const IntrinsicInst &II,
auto *VecTy = cast<VectorType>(II.getType());
auto *MaskEltTy = Type::getInt32Ty(II.getContext());
unsigned Size = VecTy->getNumElements();
- assert(Size == 8 && "Unexpected shuffle mask size");
+ assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
+ "Unexpected shuffle mask size");
// Construct a shuffle mask from constant integers or UNDEFs.
- Constant *Indexes[8] = {NULL};
+ Constant *Indexes[64] = {nullptr};
for (unsigned I = 0; I < Size; ++I) {
Constant *COp = V->getAggregateElement(I);
@@ -846,8 +918,8 @@ static Value *simplifyX86vpermv(const IntrinsicInst &II,
continue;
}
- APInt Index = cast<ConstantInt>(COp)->getValue();
- Index = Index.zextOrTrunc(32).getLoBits(3);
+ uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
+ Index &= Size - 1;
Indexes[I] = ConstantInt::get(MaskEltTy, Index);
}
@@ -962,6 +1034,36 @@ static Value *simplifyX86vpcom(const IntrinsicInst &II,
return nullptr;
}
+// Emit a select instruction and appropriate bitcasts to help simplify
+// masked intrinsics.
+static Value *emitX86MaskSelect(Value *Mask, Value *Op0, Value *Op1,
+ InstCombiner::BuilderTy &Builder) {
+ unsigned VWidth = Op0->getType()->getVectorNumElements();
+
+ // If the mask is all ones we don't need the select. But we need to check
+ // only the bits that will be used in case VWidth is less than 8.
+ if (auto *C = dyn_cast<ConstantInt>(Mask))
+ if (C->getValue().zextOrTrunc(VWidth).isAllOnesValue())
+ return Op0;
+
+ auto *MaskTy = VectorType::get(Builder.getInt1Ty(),
+ cast<IntegerType>(Mask->getType())->getBitWidth());
+ Mask = Builder.CreateBitCast(Mask, MaskTy);
+
+ // If we have less than 8 elements, then the starting mask was an i8 and
+ // we need to extract down to the right number of elements.
+ if (VWidth < 8) {
+ uint32_t Indices[4];
+ for (unsigned i = 0; i != VWidth; ++i)
+ Indices[i] = i;
+ Mask = Builder.CreateShuffleVector(Mask, Mask,
+ makeArrayRef(Indices, VWidth),
+ "extract");
+ }
+
+ return Builder.CreateSelect(Mask, Op0, Op1);
+}
+
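
Conceptually (a standalone model, not part of the patch), the mask select emitted above chooses per element between the computed result and the passthrough operand based on the corresponding mask bit:

#include <array>
#include <cassert>
#include <cstdint>

// Scalar model of the masked select for a 4-element vector and an i8 mask:
// only the low VWidth bits of the mask matter.
std::array<float, 4> maskSelect(uint8_t Mask, std::array<float, 4> Op0,
                                std::array<float, 4> Op1) {
  std::array<float, 4> R{};
  for (unsigned i = 0; i != 4; ++i)
    R[i] = (Mask >> i) & 1 ? Op0[i] : Op1[i];
  return R;
}

int main() {
  auto R = maskSelect(0b0101, {1, 2, 3, 4}, {9, 9, 9, 9});
  assert(R[0] == 1 && R[1] == 9 && R[2] == 3 && R[3] == 9);
  return 0;
}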
static Value *simplifyMinnumMaxnum(const IntrinsicInst &II) {
Value *Arg0 = II.getArgOperand(0);
Value *Arg1 = II.getArgOperand(1);
@@ -1104,6 +1206,50 @@ static Instruction *simplifyMaskedScatter(IntrinsicInst &II, InstCombiner &IC) {
return nullptr;
}
+static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) {
+ assert((II.getIntrinsicID() == Intrinsic::cttz ||
+ II.getIntrinsicID() == Intrinsic::ctlz) &&
+ "Expected cttz or ctlz intrinsic");
+ Value *Op0 = II.getArgOperand(0);
+ // FIXME: Try to simplify vectors of integers.
+ auto *IT = dyn_cast<IntegerType>(Op0->getType());
+ if (!IT)
+ return nullptr;
+
+ unsigned BitWidth = IT->getBitWidth();
+ APInt KnownZero(BitWidth, 0);
+ APInt KnownOne(BitWidth, 0);
+ IC.computeKnownBits(Op0, KnownZero, KnownOne, 0, &II);
+
+ // Create a mask for bits above (ctlz) or below (cttz) the first known one.
+ bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz;
+ unsigned NumMaskBits = IsTZ ? KnownOne.countTrailingZeros()
+ : KnownOne.countLeadingZeros();
+ APInt Mask = IsTZ ? APInt::getLowBitsSet(BitWidth, NumMaskBits)
+ : APInt::getHighBitsSet(BitWidth, NumMaskBits);
+
+ // If all bits above (ctlz) or below (cttz) the first known one are known
+ // zero, this value is constant.
+ // FIXME: This should be in InstSimplify because we're replacing an
+ // instruction with a constant.
+ if ((Mask & KnownZero) == Mask) {
+ auto *C = ConstantInt::get(IT, APInt(BitWidth, NumMaskBits));
+ return IC.replaceInstUsesWith(II, C);
+ }
+
+ // If the input to cttz/ctlz is known to be non-zero,
+ // then change the 'ZeroIsUndef' parameter to 'true'
+ // because we know the zero behavior can't affect the result.
+ if (KnownOne != 0 || isKnownNonZero(Op0, IC.getDataLayout())) {
+ if (!match(II.getArgOperand(1), m_One())) {
+ II.setOperand(1, IC.Builder->getTrue());
+ return &II;
+ }
+ }
+
+ return nullptr;
+}
+
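
As an aside (illustrative only), the constant-fold case in foldCttzCtlz above can be checked with plain arithmetic: if all bits below the lowest known-one bit are known zero, cttz is fixed regardless of the unknown high bits.

#include <cassert>
#include <cstdint>

unsigned cttz32(uint32_t X) {
  unsigned N = 0;
  while (X && !(X & 1)) { X >>= 1; ++N; }
  return N;
}

int main() {
  // Model: bit 3 is known one and bits 0..2 are known zero, so cttz == 3
  // no matter what the unknown high bits are.
  for (uint32_t High : {0u, 1u, 7u, 123u})
    assert(cttz32((High << 4) | 0x8u) == 3u);
  return 0;
}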
// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
@@ -1243,16 +1389,15 @@ Instruction *InstCombiner::visitVACopyInst(VACopyInst &I) {
Instruction *InstCombiner::visitCallInst(CallInst &CI) {
auto Args = CI.arg_operands();
if (Value *V = SimplifyCall(CI.getCalledValue(), Args.begin(), Args.end(), DL,
- TLI, DT, AC))
+ &TLI, &DT, &AC))
return replaceInstUsesWith(CI, V);
- if (isFreeCall(&CI, TLI))
+ if (isFreeCall(&CI, &TLI))
return visitFree(CI);
// If the caller function is nounwind, mark the call as nounwind, even if the
// callee isn't.
- if (CI.getParent()->getParent()->doesNotThrow() &&
- !CI.doesNotThrow()) {
+ if (CI.getFunction()->doesNotThrow() && !CI.doesNotThrow()) {
CI.setDoesNotThrow();
return &CI;
}
@@ -1323,26 +1468,15 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
};
- auto SimplifyDemandedVectorEltsHigh = [this](Value *Op, unsigned Width,
- unsigned DemandedWidth) {
- APInt UndefElts(Width, 0);
- APInt DemandedElts = APInt::getHighBitsSet(Width, DemandedWidth);
- return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
- };
switch (II->getIntrinsicID()) {
default: break;
- case Intrinsic::objectsize: {
- uint64_t Size;
- if (getObjectSize(II->getArgOperand(0), Size, DL, TLI)) {
- APInt APSize(II->getType()->getIntegerBitWidth(), Size);
- // Equality check to be sure that `Size` can fit in a value of type
- // `II->getType()`
- if (APSize == Size)
- return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), APSize));
- }
+ case Intrinsic::objectsize:
+ if (ConstantInt *N =
+ lowerObjectSizeCall(II, DL, &TLI, /*MustSucceed=*/false))
+ return replaceInstUsesWith(CI, N);
return nullptr;
- }
+
case Intrinsic::bswap: {
Value *IIOperand = II->getArgOperand(0);
Value *X = nullptr;
@@ -1397,41 +1531,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
II->getArgOperand(0));
}
break;
- case Intrinsic::cttz: {
- // If all bits below the first known one are known zero,
- // this value is constant.
- IntegerType *IT = dyn_cast<IntegerType>(II->getArgOperand(0)->getType());
- // FIXME: Try to simplify vectors of integers.
- if (!IT) break;
- uint32_t BitWidth = IT->getBitWidth();
- APInt KnownZero(BitWidth, 0);
- APInt KnownOne(BitWidth, 0);
- computeKnownBits(II->getArgOperand(0), KnownZero, KnownOne, 0, II);
- unsigned TrailingZeros = KnownOne.countTrailingZeros();
- APInt Mask(APInt::getLowBitsSet(BitWidth, TrailingZeros));
- if ((Mask & KnownZero) == Mask)
- return replaceInstUsesWith(CI, ConstantInt::get(IT,
- APInt(BitWidth, TrailingZeros)));
-
- }
- break;
- case Intrinsic::ctlz: {
- // If all bits above the first known one are known zero,
- // this value is constant.
- IntegerType *IT = dyn_cast<IntegerType>(II->getArgOperand(0)->getType());
- // FIXME: Try to simplify vectors of integers.
- if (!IT) break;
- uint32_t BitWidth = IT->getBitWidth();
- APInt KnownZero(BitWidth, 0);
- APInt KnownOne(BitWidth, 0);
- computeKnownBits(II->getArgOperand(0), KnownZero, KnownOne, 0, II);
- unsigned LeadingZeros = KnownOne.countLeadingZeros();
- APInt Mask(APInt::getHighBitsSet(BitWidth, LeadingZeros));
- if ((Mask & KnownZero) == Mask)
- return replaceInstUsesWith(CI, ConstantInt::get(IT,
- APInt(BitWidth, LeadingZeros)));
- }
+ case Intrinsic::cttz:
+ case Intrinsic::ctlz:
+ if (auto *I = foldCttzCtlz(*II, *this))
+ return I;
break;
case Intrinsic::uadd_with_overflow:
@@ -1446,7 +1550,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
II->setArgOperand(1, LHS);
return II;
}
- // fall through
+ LLVM_FALLTHROUGH;
case Intrinsic::usub_with_overflow:
case Intrinsic::ssub_with_overflow: {
@@ -1477,11 +1581,77 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return replaceInstUsesWith(*II, V);
break;
}
+ case Intrinsic::fma:
+ case Intrinsic::fmuladd: {
+ Value *Src0 = II->getArgOperand(0);
+ Value *Src1 = II->getArgOperand(1);
+
+ // Canonicalize constants into the RHS.
+ if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
+ II->setArgOperand(0, Src1);
+ II->setArgOperand(1, Src0);
+ std::swap(Src0, Src1);
+ }
+
+ Value *LHS = nullptr;
+ Value *RHS = nullptr;
+
+ // fma fneg(x), fneg(y), z -> fma x, y, z
+ if (match(Src0, m_FNeg(m_Value(LHS))) &&
+ match(Src1, m_FNeg(m_Value(RHS)))) {
+ II->setArgOperand(0, LHS);
+ II->setArgOperand(1, RHS);
+ return II;
+ }
+
+ // fma fabs(x), fabs(x), z -> fma x, x, z
+ if (match(Src0, m_Intrinsic<Intrinsic::fabs>(m_Value(LHS))) &&
+ match(Src1, m_Intrinsic<Intrinsic::fabs>(m_Value(RHS))) && LHS == RHS) {
+ II->setArgOperand(0, LHS);
+ II->setArgOperand(1, RHS);
+ return II;
+ }
+
+ // fma x, 1, z -> fadd x, z
+ if (match(Src1, m_FPOne())) {
+ Instruction *RI = BinaryOperator::CreateFAdd(Src0, II->getArgOperand(2));
+ RI->copyFastMathFlags(II);
+ return RI;
+ }
+
+ break;
+ }
+ case Intrinsic::fabs: {
+ Value *Cond;
+ Constant *LHS, *RHS;
+ if (match(II->getArgOperand(0),
+ m_Select(m_Value(Cond), m_Constant(LHS), m_Constant(RHS)))) {
+ CallInst *Call0 = Builder->CreateCall(II->getCalledFunction(), {LHS});
+ CallInst *Call1 = Builder->CreateCall(II->getCalledFunction(), {RHS});
+ return SelectInst::Create(Cond, Call0, Call1);
+ }
+
+ break;
+ }
+ case Intrinsic::cos:
+ case Intrinsic::amdgcn_cos: {
+ Value *SrcSrc;
+ Value *Src = II->getArgOperand(0);
+ if (match(Src, m_FNeg(m_Value(SrcSrc))) ||
+ match(Src, m_Intrinsic<Intrinsic::fabs>(m_Value(SrcSrc)))) {
+ // cos(-x) -> cos(x)
+ // cos(fabs(x)) -> cos(x)
+ II->setArgOperand(0, SrcSrc);
+ return II;
+ }
+
+ break;
+ }
case Intrinsic::ppc_altivec_lvx:
case Intrinsic::ppc_altivec_lvxl:
// Turn PPC lvx -> load if the pointer is known aligned.
- if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, AC, DT) >=
- 16) {
+ if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC,
+ &DT) >= 16) {
Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0),
PointerType::getUnqual(II->getType()));
return new LoadInst(Ptr);
@@ -1497,8 +1667,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::ppc_altivec_stvx:
case Intrinsic::ppc_altivec_stvxl:
// Turn stvx -> store if the pointer is known aligned.
- if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, AC, DT) >=
- 16) {
+ if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC,
+ &DT) >= 16) {
Type *OpPtrTy =
PointerType::getUnqual(II->getArgOperand(0)->getType());
Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy);
@@ -1514,8 +1684,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
}
case Intrinsic::ppc_qpx_qvlfs:
// Turn PPC QPX qvlfs -> load if the pointer is known aligned.
- if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, AC, DT) >=
- 16) {
+ if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC,
+ &DT) >= 16) {
Type *VTy = VectorType::get(Builder->getFloatTy(),
II->getType()->getVectorNumElements());
Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0),
@@ -1526,8 +1696,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
case Intrinsic::ppc_qpx_qvlfd:
// Turn PPC QPX qvlfd -> load if the pointer is known aligned.
- if (getOrEnforceKnownAlignment(II->getArgOperand(0), 32, DL, II, AC, DT) >=
- 32) {
+ if (getOrEnforceKnownAlignment(II->getArgOperand(0), 32, DL, II, &AC,
+ &DT) >= 32) {
Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0),
PointerType::getUnqual(II->getType()));
return new LoadInst(Ptr);
@@ -1535,8 +1705,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
case Intrinsic::ppc_qpx_qvstfs:
// Turn PPC QPX qvstfs -> store if the pointer is known aligned.
- if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, AC, DT) >=
- 16) {
+ if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC,
+ &DT) >= 16) {
Type *VTy = VectorType::get(Builder->getFloatTy(),
II->getArgOperand(0)->getType()->getVectorNumElements());
Value *TOp = Builder->CreateFPTrunc(II->getArgOperand(0), VTy);
@@ -1547,8 +1717,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
case Intrinsic::ppc_qpx_qvstfd:
// Turn PPC QPX qvstfd -> store if the pointer is known aligned.
- if (getOrEnforceKnownAlignment(II->getArgOperand(1), 32, DL, II, AC, DT) >=
- 32) {
+ if (getOrEnforceKnownAlignment(II->getArgOperand(1), 32, DL, II, &AC,
+ &DT) >= 32) {
Type *OpPtrTy =
PointerType::getUnqual(II->getArgOperand(0)->getType());
Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy);
@@ -1607,7 +1777,23 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_sse2_cvtsd2si:
case Intrinsic::x86_sse2_cvtsd2si64:
case Intrinsic::x86_sse2_cvttsd2si:
- case Intrinsic::x86_sse2_cvttsd2si64: {
+ case Intrinsic::x86_sse2_cvttsd2si64:
+ case Intrinsic::x86_avx512_vcvtss2si32:
+ case Intrinsic::x86_avx512_vcvtss2si64:
+ case Intrinsic::x86_avx512_vcvtss2usi32:
+ case Intrinsic::x86_avx512_vcvtss2usi64:
+ case Intrinsic::x86_avx512_vcvtsd2si32:
+ case Intrinsic::x86_avx512_vcvtsd2si64:
+ case Intrinsic::x86_avx512_vcvtsd2usi32:
+ case Intrinsic::x86_avx512_vcvtsd2usi64:
+ case Intrinsic::x86_avx512_cvttss2si:
+ case Intrinsic::x86_avx512_cvttss2si64:
+ case Intrinsic::x86_avx512_cvttss2usi:
+ case Intrinsic::x86_avx512_cvttss2usi64:
+ case Intrinsic::x86_avx512_cvttsd2si:
+ case Intrinsic::x86_avx512_cvttsd2si64:
+ case Intrinsic::x86_avx512_cvttsd2usi:
+ case Intrinsic::x86_avx512_cvttsd2usi64: {
// These intrinsics only demand the 0th element of their input vectors. If
// we can simplify the input based on that, do so now.
Value *Arg = II->getArgOperand(0);
@@ -1654,7 +1840,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_sse2_ucomigt_sd:
case Intrinsic::x86_sse2_ucomile_sd:
case Intrinsic::x86_sse2_ucomilt_sd:
- case Intrinsic::x86_sse2_ucomineq_sd: {
+ case Intrinsic::x86_sse2_ucomineq_sd:
+ case Intrinsic::x86_avx512_vcomi_ss:
+ case Intrinsic::x86_avx512_vcomi_sd:
+ case Intrinsic::x86_avx512_mask_cmp_ss:
+ case Intrinsic::x86_avx512_mask_cmp_sd: {
// These intrinsics only demand the 0th element of their input vectors. If
// we can simplify the input based on that, do so now.
bool MadeChange = false;
@@ -1674,50 +1864,155 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
- case Intrinsic::x86_sse_add_ss:
- case Intrinsic::x86_sse_sub_ss:
- case Intrinsic::x86_sse_mul_ss:
- case Intrinsic::x86_sse_div_ss:
+ case Intrinsic::x86_avx512_mask_add_ps_512:
+ case Intrinsic::x86_avx512_mask_div_ps_512:
+ case Intrinsic::x86_avx512_mask_mul_ps_512:
+ case Intrinsic::x86_avx512_mask_sub_ps_512:
+ case Intrinsic::x86_avx512_mask_add_pd_512:
+ case Intrinsic::x86_avx512_mask_div_pd_512:
+ case Intrinsic::x86_avx512_mask_mul_pd_512:
+ case Intrinsic::x86_avx512_mask_sub_pd_512:
+ // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
+ // IR operations.
+ if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(4))) {
+ if (R->getValue() == 4) {
+ Value *Arg0 = II->getArgOperand(0);
+ Value *Arg1 = II->getArgOperand(1);
+
+ Value *V;
+ switch (II->getIntrinsicID()) {
+ default: llvm_unreachable("Case stmts out of sync!");
+ case Intrinsic::x86_avx512_mask_add_ps_512:
+ case Intrinsic::x86_avx512_mask_add_pd_512:
+ V = Builder->CreateFAdd(Arg0, Arg1);
+ break;
+ case Intrinsic::x86_avx512_mask_sub_ps_512:
+ case Intrinsic::x86_avx512_mask_sub_pd_512:
+ V = Builder->CreateFSub(Arg0, Arg1);
+ break;
+ case Intrinsic::x86_avx512_mask_mul_ps_512:
+ case Intrinsic::x86_avx512_mask_mul_pd_512:
+ V = Builder->CreateFMul(Arg0, Arg1);
+ break;
+ case Intrinsic::x86_avx512_mask_div_ps_512:
+ case Intrinsic::x86_avx512_mask_div_pd_512:
+ V = Builder->CreateFDiv(Arg0, Arg1);
+ break;
+ }
+
+ // Create a select for the masking.
+ V = emitX86MaskSelect(II->getArgOperand(3), V, II->getArgOperand(2),
+ *Builder);
+ return replaceInstUsesWith(*II, V);
+ }
+ }
+ break;
+
+ case Intrinsic::x86_avx512_mask_add_ss_round:
+ case Intrinsic::x86_avx512_mask_div_ss_round:
+ case Intrinsic::x86_avx512_mask_mul_ss_round:
+ case Intrinsic::x86_avx512_mask_sub_ss_round:
+ case Intrinsic::x86_avx512_mask_add_sd_round:
+ case Intrinsic::x86_avx512_mask_div_sd_round:
+ case Intrinsic::x86_avx512_mask_mul_sd_round:
+ case Intrinsic::x86_avx512_mask_sub_sd_round:
+ // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
+ // IR operations.
+ if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(4))) {
+ if (R->getValue() == 4) {
+ // Extract the element as scalars.
+ Value *Arg0 = II->getArgOperand(0);
+ Value *Arg1 = II->getArgOperand(1);
+ Value *LHS = Builder->CreateExtractElement(Arg0, (uint64_t)0);
+ Value *RHS = Builder->CreateExtractElement(Arg1, (uint64_t)0);
+
+ Value *V;
+ switch (II->getIntrinsicID()) {
+ default: llvm_unreachable("Case stmts out of sync!");
+ case Intrinsic::x86_avx512_mask_add_ss_round:
+ case Intrinsic::x86_avx512_mask_add_sd_round:
+ V = Builder->CreateFAdd(LHS, RHS);
+ break;
+ case Intrinsic::x86_avx512_mask_sub_ss_round:
+ case Intrinsic::x86_avx512_mask_sub_sd_round:
+ V = Builder->CreateFSub(LHS, RHS);
+ break;
+ case Intrinsic::x86_avx512_mask_mul_ss_round:
+ case Intrinsic::x86_avx512_mask_mul_sd_round:
+ V = Builder->CreateFMul(LHS, RHS);
+ break;
+ case Intrinsic::x86_avx512_mask_div_ss_round:
+ case Intrinsic::x86_avx512_mask_div_sd_round:
+ V = Builder->CreateFDiv(LHS, RHS);
+ break;
+ }
+
+ // Handle the masking aspect of the intrinsic.
+ Value *Mask = II->getArgOperand(3);
+ auto *C = dyn_cast<ConstantInt>(Mask);
+ // We don't need a select if we know the mask bit is a 1.
+ if (!C || !C->getValue()[0]) {
+ // Cast the mask to an i1 vector and then extract the lowest element.
+ auto *MaskTy = VectorType::get(Builder->getInt1Ty(),
+ cast<IntegerType>(Mask->getType())->getBitWidth());
+ Mask = Builder->CreateBitCast(Mask, MaskTy);
+ Mask = Builder->CreateExtractElement(Mask, (uint64_t)0);
+ // Extract the lowest element from the passthru operand.
+ Value *Passthru = Builder->CreateExtractElement(II->getArgOperand(2),
+ (uint64_t)0);
+ V = Builder->CreateSelect(Mask, V, Passthru);
+ }
+
+ // Insert the result back into the original argument 0.
+ V = Builder->CreateInsertElement(Arg0, V, (uint64_t)0);
+
+ return replaceInstUsesWith(*II, V);
+ }
+ }
+ LLVM_FALLTHROUGH;
+
+ // X86 scalar intrinsics simplified with SimplifyDemandedVectorElts.
+ case Intrinsic::x86_avx512_mask_max_ss_round:
+ case Intrinsic::x86_avx512_mask_min_ss_round:
+ case Intrinsic::x86_avx512_mask_max_sd_round:
+ case Intrinsic::x86_avx512_mask_min_sd_round:
+ case Intrinsic::x86_avx512_mask_vfmadd_ss:
+ case Intrinsic::x86_avx512_mask_vfmadd_sd:
+ case Intrinsic::x86_avx512_maskz_vfmadd_ss:
+ case Intrinsic::x86_avx512_maskz_vfmadd_sd:
+ case Intrinsic::x86_avx512_mask3_vfmadd_ss:
+ case Intrinsic::x86_avx512_mask3_vfmadd_sd:
+ case Intrinsic::x86_avx512_mask3_vfmsub_ss:
+ case Intrinsic::x86_avx512_mask3_vfmsub_sd:
+ case Intrinsic::x86_avx512_mask3_vfnmsub_ss:
+ case Intrinsic::x86_avx512_mask3_vfnmsub_sd:
+ case Intrinsic::x86_fma_vfmadd_ss:
+ case Intrinsic::x86_fma_vfmsub_ss:
+ case Intrinsic::x86_fma_vfnmadd_ss:
+ case Intrinsic::x86_fma_vfnmsub_ss:
+ case Intrinsic::x86_fma_vfmadd_sd:
+ case Intrinsic::x86_fma_vfmsub_sd:
+ case Intrinsic::x86_fma_vfnmadd_sd:
+ case Intrinsic::x86_fma_vfnmsub_sd:
+ case Intrinsic::x86_sse_cmp_ss:
case Intrinsic::x86_sse_min_ss:
case Intrinsic::x86_sse_max_ss:
- case Intrinsic::x86_sse_cmp_ss:
- case Intrinsic::x86_sse2_add_sd:
- case Intrinsic::x86_sse2_sub_sd:
- case Intrinsic::x86_sse2_mul_sd:
- case Intrinsic::x86_sse2_div_sd:
+ case Intrinsic::x86_sse2_cmp_sd:
case Intrinsic::x86_sse2_min_sd:
case Intrinsic::x86_sse2_max_sd:
- case Intrinsic::x86_sse2_cmp_sd: {
- // These intrinsics only demand the lowest element of the second input
- // vector.
- Value *Arg1 = II->getArgOperand(1);
- unsigned VWidth = Arg1->getType()->getVectorNumElements();
- if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
- II->setArgOperand(1, V);
- return II;
- }
- break;
- }
-
case Intrinsic::x86_sse41_round_ss:
- case Intrinsic::x86_sse41_round_sd: {
- // These intrinsics demand the upper elements of the first input vector and
- // the lowest element of the second input vector.
- bool MadeChange = false;
- Value *Arg0 = II->getArgOperand(0);
- Value *Arg1 = II->getArgOperand(1);
- unsigned VWidth = Arg0->getType()->getVectorNumElements();
- if (Value *V = SimplifyDemandedVectorEltsHigh(Arg0, VWidth, VWidth - 1)) {
- II->setArgOperand(0, V);
- MadeChange = true;
- }
- if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
- II->setArgOperand(1, V);
- MadeChange = true;
- }
- if (MadeChange)
- return II;
- break;
+ case Intrinsic::x86_sse41_round_sd:
+ case Intrinsic::x86_xop_vfrcz_ss:
+ case Intrinsic::x86_xop_vfrcz_sd: {
+ unsigned VWidth = II->getType()->getVectorNumElements();
+ APInt UndefElts(VWidth, 0);
+ APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+ if (Value *V = SimplifyDemandedVectorElts(II, AllOnesEltMask, UndefElts)) {
+ if (V != II)
+ return replaceInstUsesWith(*II, V);
+ return II;
+ }
+ break;
}
// Constant fold ashr( <A x Bi>, Ci ).
@@ -1727,18 +2022,29 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_sse2_psrai_w:
case Intrinsic::x86_avx2_psrai_d:
case Intrinsic::x86_avx2_psrai_w:
+ case Intrinsic::x86_avx512_psrai_q_128:
+ case Intrinsic::x86_avx512_psrai_q_256:
+ case Intrinsic::x86_avx512_psrai_d_512:
+ case Intrinsic::x86_avx512_psrai_q_512:
+ case Intrinsic::x86_avx512_psrai_w_512:
case Intrinsic::x86_sse2_psrli_d:
case Intrinsic::x86_sse2_psrli_q:
case Intrinsic::x86_sse2_psrli_w:
case Intrinsic::x86_avx2_psrli_d:
case Intrinsic::x86_avx2_psrli_q:
case Intrinsic::x86_avx2_psrli_w:
+ case Intrinsic::x86_avx512_psrli_d_512:
+ case Intrinsic::x86_avx512_psrli_q_512:
+ case Intrinsic::x86_avx512_psrli_w_512:
case Intrinsic::x86_sse2_pslli_d:
case Intrinsic::x86_sse2_pslli_q:
case Intrinsic::x86_sse2_pslli_w:
case Intrinsic::x86_avx2_pslli_d:
case Intrinsic::x86_avx2_pslli_q:
case Intrinsic::x86_avx2_pslli_w:
+ case Intrinsic::x86_avx512_pslli_d_512:
+ case Intrinsic::x86_avx512_pslli_q_512:
+ case Intrinsic::x86_avx512_pslli_w_512:
if (Value *V = simplifyX86immShift(*II, *Builder))
return replaceInstUsesWith(*II, V);
break;
@@ -1747,18 +2053,29 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_sse2_psra_w:
case Intrinsic::x86_avx2_psra_d:
case Intrinsic::x86_avx2_psra_w:
+ case Intrinsic::x86_avx512_psra_q_128:
+ case Intrinsic::x86_avx512_psra_q_256:
+ case Intrinsic::x86_avx512_psra_d_512:
+ case Intrinsic::x86_avx512_psra_q_512:
+ case Intrinsic::x86_avx512_psra_w_512:
case Intrinsic::x86_sse2_psrl_d:
case Intrinsic::x86_sse2_psrl_q:
case Intrinsic::x86_sse2_psrl_w:
case Intrinsic::x86_avx2_psrl_d:
case Intrinsic::x86_avx2_psrl_q:
case Intrinsic::x86_avx2_psrl_w:
+ case Intrinsic::x86_avx512_psrl_d_512:
+ case Intrinsic::x86_avx512_psrl_q_512:
+ case Intrinsic::x86_avx512_psrl_w_512:
case Intrinsic::x86_sse2_psll_d:
case Intrinsic::x86_sse2_psll_q:
case Intrinsic::x86_sse2_psll_w:
case Intrinsic::x86_avx2_psll_d:
case Intrinsic::x86_avx2_psll_q:
- case Intrinsic::x86_avx2_psll_w: {
+ case Intrinsic::x86_avx2_psll_w:
+ case Intrinsic::x86_avx512_psll_d_512:
+ case Intrinsic::x86_avx512_psll_q_512:
+ case Intrinsic::x86_avx512_psll_w_512: {
if (Value *V = simplifyX86immShift(*II, *Builder))
return replaceInstUsesWith(*II, V);
@@ -1780,16 +2097,50 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_avx2_psllv_d_256:
case Intrinsic::x86_avx2_psllv_q:
case Intrinsic::x86_avx2_psllv_q_256:
+ case Intrinsic::x86_avx512_psllv_d_512:
+ case Intrinsic::x86_avx512_psllv_q_512:
+ case Intrinsic::x86_avx512_psllv_w_128:
+ case Intrinsic::x86_avx512_psllv_w_256:
+ case Intrinsic::x86_avx512_psllv_w_512:
case Intrinsic::x86_avx2_psrav_d:
case Intrinsic::x86_avx2_psrav_d_256:
+ case Intrinsic::x86_avx512_psrav_q_128:
+ case Intrinsic::x86_avx512_psrav_q_256:
+ case Intrinsic::x86_avx512_psrav_d_512:
+ case Intrinsic::x86_avx512_psrav_q_512:
+ case Intrinsic::x86_avx512_psrav_w_128:
+ case Intrinsic::x86_avx512_psrav_w_256:
+ case Intrinsic::x86_avx512_psrav_w_512:
case Intrinsic::x86_avx2_psrlv_d:
case Intrinsic::x86_avx2_psrlv_d_256:
case Intrinsic::x86_avx2_psrlv_q:
case Intrinsic::x86_avx2_psrlv_q_256:
+ case Intrinsic::x86_avx512_psrlv_d_512:
+ case Intrinsic::x86_avx512_psrlv_q_512:
+ case Intrinsic::x86_avx512_psrlv_w_128:
+ case Intrinsic::x86_avx512_psrlv_w_256:
+ case Intrinsic::x86_avx512_psrlv_w_512:
if (Value *V = simplifyX86varShift(*II, *Builder))
return replaceInstUsesWith(*II, V);
break;
+ case Intrinsic::x86_sse2_pmulu_dq:
+ case Intrinsic::x86_sse41_pmuldq:
+ case Intrinsic::x86_avx2_pmul_dq:
+ case Intrinsic::x86_avx2_pmulu_dq:
+ case Intrinsic::x86_avx512_pmul_dq_512:
+ case Intrinsic::x86_avx512_pmulu_dq_512: {
+ unsigned VWidth = II->getType()->getVectorNumElements();
+ APInt UndefElts(VWidth, 0);
+ APInt DemandedElts = APInt::getAllOnesValue(VWidth);
+ if (Value *V = SimplifyDemandedVectorElts(II, DemandedElts, UndefElts)) {
+ if (V != II)
+ return replaceInstUsesWith(*II, V);
+ return II;
+ }
+ break;
+ }
+
case Intrinsic::x86_sse41_insertps:
if (Value *V = simplifyX86insertps(*II, *Builder))
return replaceInstUsesWith(*II, V);
@@ -1807,10 +2158,10 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// See if we're dealing with constant values.
Constant *C1 = dyn_cast<Constant>(Op1);
ConstantInt *CILength =
- C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)0))
+ C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
: nullptr;
ConstantInt *CIIndex =
- C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)1))
+ C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
: nullptr;
// Attempt to simplify to a constant, shuffle vector or EXTRQI call.
@@ -1870,7 +2221,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// See if we're dealing with constant values.
Constant *C1 = dyn_cast<Constant>(Op1);
ConstantInt *CI11 =
- C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)1))
+ C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
: nullptr;
// Attempt to simplify to a constant, shuffle vector or INSERTQI call.
@@ -1964,14 +2315,17 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_ssse3_pshuf_b_128:
case Intrinsic::x86_avx2_pshuf_b:
+ case Intrinsic::x86_avx512_pshuf_b_512:
if (Value *V = simplifyX86pshufb(*II, *Builder))
return replaceInstUsesWith(*II, V);
break;
case Intrinsic::x86_avx_vpermilvar_ps:
case Intrinsic::x86_avx_vpermilvar_ps_256:
+ case Intrinsic::x86_avx512_vpermilvar_ps_512:
case Intrinsic::x86_avx_vpermilvar_pd:
case Intrinsic::x86_avx_vpermilvar_pd_256:
+ case Intrinsic::x86_avx512_vpermilvar_pd_512:
if (Value *V = simplifyX86vpermilvar(*II, *Builder))
return replaceInstUsesWith(*II, V);
break;
@@ -1982,6 +2336,28 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return replaceInstUsesWith(*II, V);
break;
+ case Intrinsic::x86_avx512_mask_permvar_df_256:
+ case Intrinsic::x86_avx512_mask_permvar_df_512:
+ case Intrinsic::x86_avx512_mask_permvar_di_256:
+ case Intrinsic::x86_avx512_mask_permvar_di_512:
+ case Intrinsic::x86_avx512_mask_permvar_hi_128:
+ case Intrinsic::x86_avx512_mask_permvar_hi_256:
+ case Intrinsic::x86_avx512_mask_permvar_hi_512:
+ case Intrinsic::x86_avx512_mask_permvar_qi_128:
+ case Intrinsic::x86_avx512_mask_permvar_qi_256:
+ case Intrinsic::x86_avx512_mask_permvar_qi_512:
+ case Intrinsic::x86_avx512_mask_permvar_sf_256:
+ case Intrinsic::x86_avx512_mask_permvar_sf_512:
+ case Intrinsic::x86_avx512_mask_permvar_si_256:
+ case Intrinsic::x86_avx512_mask_permvar_si_512:
+ if (Value *V = simplifyX86vpermv(*II, *Builder)) {
+ // We simplified the permuting, now create a select for the masking.
+ V = emitX86MaskSelect(II->getArgOperand(3), V, II->getArgOperand(2),
+ *Builder);
+ return replaceInstUsesWith(*II, V);
+ }
+ break;
+
case Intrinsic::x86_avx_vperm2f128_pd_256:
case Intrinsic::x86_avx_vperm2f128_ps_256:
case Intrinsic::x86_avx_vperm2f128_si_256:
@@ -2104,7 +2480,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::arm_neon_vst2lane:
case Intrinsic::arm_neon_vst3lane:
case Intrinsic::arm_neon_vst4lane: {
- unsigned MemAlign = getKnownAlignment(II->getArgOperand(0), DL, II, AC, DT);
+ unsigned MemAlign =
+ getKnownAlignment(II->getArgOperand(0), DL, II, &AC, &DT);
unsigned AlignArg = II->getNumArgOperands() - 1;
ConstantInt *IntrAlign = dyn_cast<ConstantInt>(II->getArgOperand(AlignArg));
if (IntrAlign && IntrAlign->getZExtValue() < MemAlign) {
@@ -2194,6 +2571,85 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
+ case Intrinsic::amdgcn_class: {
+ enum {
+ S_NAN = 1 << 0, // Signaling NaN
+ Q_NAN = 1 << 1, // Quiet NaN
+ N_INFINITY = 1 << 2, // Negative infinity
+ N_NORMAL = 1 << 3, // Negative normal
+ N_SUBNORMAL = 1 << 4, // Negative subnormal
+ N_ZERO = 1 << 5, // Negative zero
+ P_ZERO = 1 << 6, // Positive zero
+ P_SUBNORMAL = 1 << 7, // Positive subnormal
+ P_NORMAL = 1 << 8, // Positive normal
+ P_INFINITY = 1 << 9 // Positive infinity
+ };
+
+ const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
+ N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL | P_NORMAL | P_INFINITY;
+
+ Value *Src0 = II->getArgOperand(0);
+ Value *Src1 = II->getArgOperand(1);
+ const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
+ if (!CMask) {
+ if (isa<UndefValue>(Src0))
+ return replaceInstUsesWith(*II, UndefValue::get(II->getType()));
+
+ if (isa<UndefValue>(Src1))
+ return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false));
+ break;
+ }
+
+ uint32_t Mask = CMask->getZExtValue();
+
+    // If all tests are made, the result is true regardless of the value.
+ if ((Mask & FullMask) == FullMask)
+ return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), true));
+
+ if ((Mask & FullMask) == 0)
+ return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false));
+
+ if (Mask == (S_NAN | Q_NAN)) {
+ // Equivalent of isnan. Replace with standard fcmp.
+ Value *FCmp = Builder->CreateFCmpUNO(Src0, Src0);
+ FCmp->takeName(II);
+ return replaceInstUsesWith(*II, FCmp);
+ }
+
+ const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
+ if (!CVal) {
+ if (isa<UndefValue>(Src0))
+ return replaceInstUsesWith(*II, UndefValue::get(II->getType()));
+
+ // Clamp mask to used bits
+ if ((Mask & FullMask) != Mask) {
+ CallInst *NewCall = Builder->CreateCall(II->getCalledFunction(),
+ { Src0, ConstantInt::get(Src1->getType(), Mask & FullMask) }
+ );
+
+ NewCall->takeName(II);
+ return replaceInstUsesWith(*II, NewCall);
+ }
+
+ break;
+ }
+
+ const APFloat &Val = CVal->getValueAPF();
+
+ bool Result =
+ ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
+ ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
+ ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
+ ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
+ ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
+ ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
+ ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
+ ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
+ ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
+ ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());
+
+ return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result));
+ }
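
For context on the constant fold above: the mask bits select IEEE-754 value classes, and when the operand is a known constant the intrinsic reduces to evaluating that class test at compile time. A minimal standalone sketch of the same semantics in plain C++ (<cmath> classification on double rather than the APFloat API used in the patch; fpclassify cannot separate signaling from quiet NaNs, so the two NaN bits are treated together here):

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    // Mask bits mirroring the enum in the patch above.
    enum : uint32_t {
      S_NAN = 1u << 0, Q_NAN = 1u << 1, N_INFINITY = 1u << 2, N_NORMAL = 1u << 3,
      N_SUBNORMAL = 1u << 4, N_ZERO = 1u << 5, P_ZERO = 1u << 6,
      P_SUBNORMAL = 1u << 7, P_NORMAL = 1u << 8, P_INFINITY = 1u << 9
    };

    // Evaluate the class test on a known constant, as the fold does.
    // Note: this sketch cannot distinguish S_NAN from Q_NAN.
    bool classOf(double Val, uint32_t Mask) {
      bool Neg = std::signbit(Val);
      switch (std::fpclassify(Val)) {
      case FP_NAN:       return Mask & (S_NAN | Q_NAN);
      case FP_INFINITE:  return Mask & (Neg ? N_INFINITY : P_INFINITY);
      case FP_NORMAL:    return Mask & (Neg ? N_NORMAL : P_NORMAL);
      case FP_SUBNORMAL: return Mask & (Neg ? N_SUBNORMAL : P_SUBNORMAL);
      case FP_ZERO:      return Mask & (Neg ? N_ZERO : P_ZERO);
      }
      return false;
    }

    int main() {
      assert(classOf(-0.0, N_ZERO) && !classOf(-0.0, P_ZERO));
      assert(classOf(1.0, P_NORMAL) && !classOf(1.0, N_NORMAL | P_INFINITY));
      return 0;
    }
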
case Intrinsic::stackrestore: {
// If the save is right next to the restore, remove the restore. This can
// happen when variable allocas are DCE'd.
@@ -2243,6 +2699,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
case Intrinsic::lifetime_start:
+    // ASan needs to poison memory to detect invalid accesses, which are
+    // possible even for an empty lifetime range.
+ if (II->getFunction()->hasFnAttribute(Attribute::SanitizeAddress))
+ break;
+
if (removeTriviallyEmptyRange(*II, Intrinsic::lifetime_start,
Intrinsic::lifetime_end, *this))
return nullptr;
@@ -2274,24 +2735,20 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// assume( (load addr) != null ) -> add 'nonnull' metadata to load
// (if assume is valid at the load)
- if (ICmpInst* ICmp = dyn_cast<ICmpInst>(IIOperand)) {
- Value *LHS = ICmp->getOperand(0);
- Value *RHS = ICmp->getOperand(1);
- if (ICmpInst::ICMP_NE == ICmp->getPredicate() &&
- isa<LoadInst>(LHS) &&
- isa<Constant>(RHS) &&
- RHS->getType()->isPointerTy() &&
- cast<Constant>(RHS)->isNullValue()) {
- LoadInst* LI = cast<LoadInst>(LHS);
- if (isValidAssumeForContext(II, LI, DT)) {
- MDNode *MD = MDNode::get(II->getContext(), None);
- LI->setMetadata(LLVMContext::MD_nonnull, MD);
- return eraseInstFromFunction(*II);
- }
- }
+ CmpInst::Predicate Pred;
+ Instruction *LHS;
+ if (match(IIOperand, m_ICmp(Pred, m_Instruction(LHS), m_Zero())) &&
+ Pred == ICmpInst::ICMP_NE && LHS->getOpcode() == Instruction::Load &&
+ LHS->getType()->isPointerTy() &&
+ isValidAssumeForContext(II, LHS, &DT)) {
+ MDNode *MD = MDNode::get(II->getContext(), None);
+ LHS->setMetadata(LLVMContext::MD_nonnull, MD);
+ return eraseInstFromFunction(*II);
+
// TODO: apply nonnull return attributes to calls and invokes
// TODO: apply range metadata for range check patterns?
}
+
// If there is a dominating assume with the same condition as this one,
// then this one is redundant, and should be removed.
APInt KnownZero(1, 0), KnownOne(1, 0);
@@ -2299,6 +2756,9 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
if (KnownOne.isAllOnesValue())
return eraseInstFromFunction(*II);
+ // Update the cache of affected values for this assumption (we might be
+ // here because we just simplified the condition).
+ AC.updateAffectedValues(II);
break;
}
case Intrinsic::experimental_gc_relocate: {
@@ -2329,7 +2789,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return replaceInstUsesWith(*II, ConstantPointerNull::get(PT));
// isKnownNonNull -> nonnull attribute
- if (isKnownNonNullAt(DerivedPtr, II, DT))
+ if (isKnownNonNullAt(DerivedPtr, II, &DT))
II->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
}
@@ -2389,7 +2849,7 @@ Instruction *InstCombiner::tryOptimizeCall(CallInst *CI) {
auto InstCombineRAUW = [this](Instruction *From, Value *With) {
replaceInstUsesWith(*From, With);
};
- LibCallSimplifier Simplifier(DL, TLI, InstCombineRAUW);
+ LibCallSimplifier Simplifier(DL, &TLI, InstCombineRAUW);
if (Value *With = Simplifier.optimizeCall(CI)) {
++NumSimplified;
return CI->use_empty() ? CI : replaceInstUsesWith(*CI, With);
@@ -2477,8 +2937,7 @@ static IntrinsicInst *findInitTrampoline(Value *Callee) {
/// Improvements for call and invoke instructions.
Instruction *InstCombiner::visitCallSite(CallSite CS) {
-
- if (isAllocLikeFn(CS.getInstruction(), TLI))
+ if (isAllocLikeFn(CS.getInstruction(), &TLI))
return visitAllocSite(*CS.getInstruction());
bool Changed = false;
@@ -2492,7 +2951,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
for (Value *V : CS.args()) {
if (V->getType()->isPointerTy() &&
!CS.paramHasAttr(ArgNo + 1, Attribute::NonNull) &&
- isKnownNonNullAt(V, CS.getInstruction(), DT))
+ isKnownNonNullAt(V, CS.getInstruction(), &DT))
Indices.push_back(ArgNo + 1);
ArgNo++;
}
@@ -2613,14 +3072,14 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
/// If the callee is a constexpr cast of a function, attempt to move the cast to
/// the arguments of the call/invoke.
bool InstCombiner::transformConstExprCastCall(CallSite CS) {
- Function *Callee =
- dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts());
+ auto *Callee = dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts());
if (!Callee)
return false;
- // The prototype of thunks are a lie, don't try to directly call such
- // functions.
+
+ // The prototype of a thunk is a lie. Don't directly call such a function.
if (Callee->hasFnAttribute("thunk"))
return false;
+
Instruction *Caller = CS.getInstruction();
const AttributeSet &CallerPAL = CS.getAttributes();
@@ -2842,8 +3301,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
CallInst *CI = cast<CallInst>(Caller);
NC = Builder->CreateCall(Callee, Args, OpBundles);
NC->takeName(CI);
- if (CI->isTailCall())
- cast<CallInst>(NC)->setTailCall();
+ cast<CallInst>(NC)->setTailCallKind(CI->getTailCallKind());
cast<CallInst>(NC)->setCallingConv(CI->getCallingConv());
cast<CallInst>(NC)->setAttributes(NewCallerPAL);
}
@@ -2966,7 +3424,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
++Idx;
++I;
- } while (1);
+ } while (true);
}
// Add any function attributes.
@@ -3001,7 +3459,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
++Idx;
++I;
- } while (1);
+ } while (true);
}
// Replace the trampoline call with a direct call. Let the generic
@@ -3027,10 +3485,10 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
cast<InvokeInst>(NewCaller)->setAttributes(NewPAL);
} else {
NewCaller = CallInst::Create(NewCallee, NewArgs, OpBundles);
- if (cast<CallInst>(Caller)->isTailCall())
- cast<CallInst>(NewCaller)->setTailCall();
- cast<CallInst>(NewCaller)->
- setCallingConv(cast<CallInst>(Caller)->getCallingConv());
+ cast<CallInst>(NewCaller)->setTailCallKind(
+ cast<CallInst>(Caller)->getTailCallKind());
+ cast<CallInst>(NewCaller)->setCallingConv(
+ cast<CallInst>(Caller)->getCallingConv());
cast<CallInst>(NewCaller)->setAttributes(NewPAL);
}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 2055615..e74b590 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "InstCombineInternal.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/PatternMatch.h"
@@ -161,8 +162,8 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty,
if (Constant *C = dyn_cast<Constant>(V)) {
C = ConstantExpr::getIntegerCast(C, Ty, isSigned /*Sext or ZExt*/);
// If we got a constantexpr back, try to simplify it with DL info.
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
- C = ConstantFoldConstantExpression(CE, DL, TLI);
+ if (Constant *FoldedC = ConstantFoldConstant(C, DL, &TLI))
+ C = FoldedC;
return C;
}
@@ -227,20 +228,14 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty,
return InsertNewInstWith(Res, *I);
}
+Instruction::CastOps InstCombiner::isEliminableCastPair(const CastInst *CI1,
+ const CastInst *CI2) {
+ Type *SrcTy = CI1->getSrcTy();
+ Type *MidTy = CI1->getDestTy();
+ Type *DstTy = CI2->getDestTy();
-/// This function is a wrapper around CastInst::isEliminableCastPair. It
-/// simply extracts arguments and returns what that function returns.
-static Instruction::CastOps
-isEliminableCastPair(const CastInst *CI, ///< First cast instruction
- unsigned opcode, ///< Opcode for the second cast
- Type *DstTy, ///< Target type for the second cast
- const DataLayout &DL) {
- Type *SrcTy = CI->getOperand(0)->getType(); // A from above
- Type *MidTy = CI->getType(); // B from above
-
- // Get the opcodes of the two Cast instructions
- Instruction::CastOps firstOp = Instruction::CastOps(CI->getOpcode());
- Instruction::CastOps secondOp = Instruction::CastOps(opcode);
+ Instruction::CastOps firstOp = Instruction::CastOps(CI1->getOpcode());
+ Instruction::CastOps secondOp = Instruction::CastOps(CI2->getOpcode());
Type *SrcIntPtrTy =
SrcTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(SrcTy) : nullptr;
Type *MidIntPtrTy =
@@ -260,54 +255,28 @@ isEliminableCastPair(const CastInst *CI, ///< First cast instruction
return Instruction::CastOps(Res);
}
-/// Return true if the cast from "V to Ty" actually results in any code being
-/// generated and is interesting to optimize out.
-/// If the cast can be eliminated by some other simple transformation, we prefer
-/// to do the simplification first.
-bool InstCombiner::ShouldOptimizeCast(Instruction::CastOps opc, const Value *V,
- Type *Ty) {
- // Noop casts and casts of constants should be eliminated trivially.
- if (V->getType() == Ty || isa<Constant>(V)) return false;
-
- // If this is another cast that can be eliminated, we prefer to have it
- // eliminated.
- if (const CastInst *CI = dyn_cast<CastInst>(V))
- if (isEliminableCastPair(CI, opc, Ty, DL))
- return false;
-
- // If this is a vector sext from a compare, then we don't want to break the
- // idiom where each element of the extended vector is either zero or all ones.
- if (opc == Instruction::SExt && isa<CmpInst>(V) && Ty->isVectorTy())
- return false;
-
- return true;
-}
-
-
/// @brief Implement the transforms common to all CastInst visitors.
Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {
Value *Src = CI.getOperand(0);
- // Many cases of "cast of a cast" are eliminable. If it's eliminable we just
- // eliminate it now.
- if (CastInst *CSrc = dyn_cast<CastInst>(Src)) { // A->B->C cast
- if (Instruction::CastOps opc =
- isEliminableCastPair(CSrc, CI.getOpcode(), CI.getType(), DL)) {
+ // Try to eliminate a cast of a cast.
+ if (auto *CSrc = dyn_cast<CastInst>(Src)) { // A->B->C cast
+ if (Instruction::CastOps NewOpc = isEliminableCastPair(CSrc, &CI)) {
// The first cast (CSrc) is eliminable so we need to fix up or replace
// the second cast (CI). CSrc will then have a good chance of being dead.
- return CastInst::Create(opc, CSrc->getOperand(0), CI.getType());
+ return CastInst::Create(NewOpc, CSrc->getOperand(0), CI.getType());
}
}
- // If we are casting a select then fold the cast into the select
- if (SelectInst *SI = dyn_cast<SelectInst>(Src))
+ // If we are casting a select, then fold the cast into the select.
+ if (auto *SI = dyn_cast<SelectInst>(Src))
if (Instruction *NV = FoldOpIntoSelect(CI, SI))
return NV;
- // If we are casting a PHI then fold the cast into the PHI
+ // If we are casting a PHI, then fold the cast into the PHI.
if (isa<PHINode>(Src)) {
- // We don't do this if this would create a PHI node with an illegal type if
- // it is currently legal.
+ // Don't do this if it would create a PHI node with an illegal type from a
+ // legal type.
if (!Src->getType()->isIntegerTy() || !CI.getType()->isIntegerTy() ||
ShouldChangeType(CI.getType(), Src->getType()))
if (Instruction *NV = FoldOpIntoPhi(CI))
@@ -474,19 +443,39 @@ static Instruction *foldVecTruncToExtElt(TruncInst &Trunc, InstCombiner &IC,
return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt));
}
+/// Try to narrow the width of bitwise logic instructions with constants.
+Instruction *InstCombiner::shrinkBitwiseLogic(TruncInst &Trunc) {
+ Type *SrcTy = Trunc.getSrcTy();
+ Type *DestTy = Trunc.getType();
+ if (isa<IntegerType>(SrcTy) && !ShouldChangeType(SrcTy, DestTy))
+ return nullptr;
+
+ BinaryOperator *LogicOp;
+ Constant *C;
+ if (!match(Trunc.getOperand(0), m_OneUse(m_BinOp(LogicOp))) ||
+ !LogicOp->isBitwiseLogicOp() ||
+ !match(LogicOp->getOperand(1), m_Constant(C)))
+ return nullptr;
+
+ // trunc (logic X, C) --> logic (trunc X, C')
+ Constant *NarrowC = ConstantExpr::getTrunc(C, DestTy);
+ Value *NarrowOp0 = Builder->CreateTrunc(LogicOp->getOperand(0), DestTy);
+ return BinaryOperator::Create(LogicOp->getOpcode(), NarrowOp0, NarrowC);
+}
+
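
The narrowing above is sound because truncation distributes over bitwise logic: dropping the high bits before or after an and/or/xor yields the same low bits. A quick standalone check in plain C++ (uint64_t and uint32_t stand in for the wide and narrow IR types; not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t C = 0x00FF00FF12345678ULL; // an arbitrary wide constant
      for (uint64_t X : {0ULL, 1ULL, 0xFFFFFFFFFFFFFFFFULL, 0x8000000000000001ULL}) {
        // trunc (and X, C) == and (trunc X), (trunc C), and likewise for or/xor.
        assert(uint32_t(X & C) == (uint32_t(X) & uint32_t(C)));
        assert(uint32_t(X | C) == (uint32_t(X) | uint32_t(C)));
        assert(uint32_t(X ^ C) == (uint32_t(X) ^ uint32_t(C)));
      }
      return 0;
    }
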
Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
if (Instruction *Result = commonCastTransforms(CI))
return Result;
// Test if the trunc is the user of a select which is part of a
// minimum or maximum operation. If so, don't do any more simplification.
- // Even simplifying demanded bits can break the canonical form of a
+ // Even simplifying demanded bits can break the canonical form of a
// min/max.
Value *LHS, *RHS;
if (SelectInst *SI = dyn_cast<SelectInst>(CI.getOperand(0)))
if (matchSelectPattern(SI, LHS, RHS).Flavor != SPF_UNKNOWN)
return nullptr;
-
+
// See if we can simplify any instructions used by the input whose sole
// purpose is to compute bits we don't care about.
if (SimplifyDemandedInstructionBits(CI))
@@ -562,14 +551,26 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
}
}
- // Transform "trunc (and X, cst)" -> "and (trunc X), cst" so long as the dest
- // type isn't non-native.
+ if (Instruction *I = shrinkBitwiseLogic(CI))
+ return I;
+
if (Src->hasOneUse() && isa<IntegerType>(SrcTy) &&
- ShouldChangeType(SrcTy, DestTy) &&
- match(Src, m_And(m_Value(A), m_ConstantInt(Cst)))) {
- Value *NewTrunc = Builder->CreateTrunc(A, DestTy, A->getName() + ".tr");
- return BinaryOperator::CreateAnd(NewTrunc,
- ConstantExpr::getTrunc(Cst, DestTy));
+ ShouldChangeType(SrcTy, DestTy)) {
+ // Transform "trunc (shl X, cst)" -> "shl (trunc X), cst" so long as the
+ // dest type is native and cst < dest size.
+ if (match(Src, m_Shl(m_Value(A), m_ConstantInt(Cst))) &&
+ !match(A, m_Shr(m_Value(), m_Constant()))) {
+      // Skip when the shifted value is itself a shift by a constant: doing the
+      // transform would undo a combine in FoldShiftByConstant, and shl-of-shr
+      // is the extend-in-register pattern.
+ const unsigned DestSize = DestTy->getScalarSizeInBits();
+ if (Cst->getValue().ult(DestSize)) {
+ Value *NewTrunc = Builder->CreateTrunc(A, DestTy, A->getName() + ".tr");
+
+ return BinaryOperator::Create(
+ Instruction::Shl, NewTrunc,
+ ConstantInt::get(DestTy, Cst->getValue().trunc(DestSize)));
+ }
+ }
}
if (Instruction *I = foldVecTruncToExtElt(CI, *this, DL))
@@ -578,10 +579,8 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
return nullptr;
}
-/// Transform (zext icmp) to bitwise / integer operations in order to eliminate
-/// the icmp.
-Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,
- bool DoXform) {
+Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI,
+ bool DoTransform) {
// If we are just checking for a icmp eq of a single bit and zext'ing it
// to an integer, then shift the bit to the appropriate place and then
// cast to integer to avoid the comparison.
@@ -592,7 +591,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,
// zext (x >s -1) to i32 --> (x>>u31)^1 true if signbit clear.
if ((ICI->getPredicate() == ICmpInst::ICMP_SLT && Op1CV == 0) ||
(ICI->getPredicate() == ICmpInst::ICMP_SGT && Op1CV.isAllOnesValue())) {
- if (!DoXform) return ICI;
+ if (!DoTransform) return ICI;
Value *In = ICI->getOperand(0);
Value *Sh = ConstantInt::get(In->getType(),
@@ -627,7 +626,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,
APInt KnownZeroMask(~KnownZero);
if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1?
- if (!DoXform) return ICI;
+ if (!DoTransform) return ICI;
bool isNE = ICI->getPredicate() == ICmpInst::ICMP_NE;
if (Op1CV != 0 && (Op1CV != KnownZeroMask)) {
@@ -655,7 +654,9 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,
if (CI.getType() == In->getType())
return replaceInstUsesWith(CI, In);
- return CastInst::CreateIntegerCast(In, CI.getType(), false/*ZExt*/);
+
+ Value *IntCast = Builder->CreateIntCast(In, CI.getType(), false);
+ return replaceInstUsesWith(CI, IntCast);
}
}
}
@@ -678,7 +679,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,
APInt KnownBits = KnownZeroLHS | KnownOneLHS;
APInt UnknownBit = ~KnownBits;
if (UnknownBit.countPopulation() == 1) {
- if (!DoXform) return ICI;
+ if (!DoTransform) return ICI;
Value *Result = Builder->CreateXor(LHS, RHS);
@@ -760,9 +761,7 @@ static bool canEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear,
// If the operation is an AND/OR/XOR and the bits to clear are zero in the
// other side, BitsToClear is ok.
- if (Tmp == 0 &&
- (Opc == Instruction::And || Opc == Instruction::Or ||
- Opc == Instruction::Xor)) {
+ if (Tmp == 0 && I->isBitwiseLogicOp()) {
// We use MaskedValueIsZero here for generality, but the case we care
// about the most is constant RHS.
unsigned VSize = V->getType()->getScalarSizeInBits();
@@ -922,16 +921,26 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
BinaryOperator *SrcI = dyn_cast<BinaryOperator>(Src);
if (SrcI && SrcI->getOpcode() == Instruction::Or) {
- // zext (or icmp, icmp) --> or (zext icmp), (zext icmp) if at least one
- // of the (zext icmp) will be transformed.
+ // zext (or icmp, icmp) -> or (zext icmp), (zext icmp) if at least one
+    // of the (zext icmp) can be eliminated. If so, perform that elimination
+    // immediately.
ICmpInst *LHS = dyn_cast<ICmpInst>(SrcI->getOperand(0));
ICmpInst *RHS = dyn_cast<ICmpInst>(SrcI->getOperand(1));
if (LHS && RHS && LHS->hasOneUse() && RHS->hasOneUse() &&
(transformZExtICmp(LHS, CI, false) ||
transformZExtICmp(RHS, CI, false))) {
+ // zext (or icmp, icmp) -> or (zext icmp), (zext icmp)
Value *LCast = Builder->CreateZExt(LHS, CI.getType(), LHS->getName());
Value *RCast = Builder->CreateZExt(RHS, CI.getType(), RHS->getName());
- return BinaryOperator::Create(Instruction::Or, LCast, RCast);
+ BinaryOperator *Or = BinaryOperator::Create(Instruction::Or, LCast, RCast);
+
+ // Perform the elimination.
+ if (auto *LZExt = dyn_cast<ZExtInst>(LCast))
+ transformZExtICmp(LHS, *LZExt);
+ if (auto *RZExt = dyn_cast<ZExtInst>(RCast))
+ transformZExtICmp(RHS, *RZExt);
+
+ return Or;
}
}
@@ -952,14 +961,6 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
return BinaryOperator::CreateXor(Builder->CreateAnd(X, ZC), ZC);
}
- // zext (xor i1 X, true) to i32 --> xor (zext i1 X to i32), 1
- if (SrcI && SrcI->hasOneUse() &&
- SrcI->getType()->getScalarType()->isIntegerTy(1) &&
- match(SrcI, m_Not(m_Value(X))) && (!X->hasOneUse() || !isa<CmpInst>(X))) {
- Value *New = Builder->CreateZExt(X, CI.getType());
- return BinaryOperator::CreateXor(New, ConstantInt::get(CI.getType(), 1));
- }
-
return nullptr;
}
@@ -1132,7 +1133,7 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
Type *SrcTy = Src->getType(), *DestTy = CI.getType();
// If we know that the value being extended is positive, we can use a zext
- // instead.
+ // instead.
bool KnownZero, KnownOne;
ComputeSignBit(Src, KnownZero, KnownOne, 0, &CI);
if (KnownZero) {
@@ -1238,14 +1239,14 @@ static Value *lookThroughFPExtensions(Value *V) {
if (CFP->getType() == Type::getPPC_FP128Ty(V->getContext()))
return V; // No constant folding of this.
// See if the value can be truncated to half and then reextended.
- if (Value *V = fitsInFPType(CFP, APFloat::IEEEhalf))
+ if (Value *V = fitsInFPType(CFP, APFloat::IEEEhalf()))
return V;
// See if the value can be truncated to float and then reextended.
- if (Value *V = fitsInFPType(CFP, APFloat::IEEEsingle))
+ if (Value *V = fitsInFPType(CFP, APFloat::IEEEsingle()))
return V;
if (CFP->getType()->isDoubleTy())
return V; // Won't shrink.
- if (Value *V = fitsInFPType(CFP, APFloat::IEEEdouble))
+ if (Value *V = fitsInFPType(CFP, APFloat::IEEEdouble()))
return V;
// Don't try to shrink to various long double types.
}
@@ -1789,6 +1790,205 @@ static Instruction *canonicalizeBitCastExtElt(BitCastInst &BitCast,
return ExtractElementInst::Create(NewBC, ExtElt->getIndexOperand());
}
+/// Change the type of a bitwise logic operation if we can eliminate a bitcast.
+static Instruction *foldBitCastBitwiseLogic(BitCastInst &BitCast,
+ InstCombiner::BuilderTy &Builder) {
+ Type *DestTy = BitCast.getType();
+ BinaryOperator *BO;
+ if (!DestTy->getScalarType()->isIntegerTy() ||
+ !match(BitCast.getOperand(0), m_OneUse(m_BinOp(BO))) ||
+ !BO->isBitwiseLogicOp())
+ return nullptr;
+
+ // FIXME: This transform is restricted to vector types to avoid backend
+ // problems caused by creating potentially illegal operations. If a fix-up is
+ // added to handle that situation, we can remove this check.
+ if (!DestTy->isVectorTy() || !BO->getType()->isVectorTy())
+ return nullptr;
+
+ Value *X;
+ if (match(BO->getOperand(0), m_OneUse(m_BitCast(m_Value(X)))) &&
+ X->getType() == DestTy && !isa<Constant>(X)) {
+ // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
+ Value *CastedOp1 = Builder.CreateBitCast(BO->getOperand(1), DestTy);
+ return BinaryOperator::Create(BO->getOpcode(), X, CastedOp1);
+ }
+
+ if (match(BO->getOperand(1), m_OneUse(m_BitCast(m_Value(X)))) &&
+ X->getType() == DestTy && !isa<Constant>(X)) {
+ // bitcast(logic(Y, bitcast(X))) --> logic'(bitcast(Y), X)
+ Value *CastedOp0 = Builder.CreateBitCast(BO->getOperand(0), DestTy);
+ return BinaryOperator::Create(BO->getOpcode(), CastedOp0, X);
+ }
+
+ return nullptr;
+}
+
+/// Change the type of a select if we can eliminate a bitcast.
+static Instruction *foldBitCastSelect(BitCastInst &BitCast,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Cond, *TVal, *FVal;
+ if (!match(BitCast.getOperand(0),
+ m_OneUse(m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal)))))
+ return nullptr;
+
+ // A vector select must maintain the same number of elements in its operands.
+ Type *CondTy = Cond->getType();
+ Type *DestTy = BitCast.getType();
+ if (CondTy->isVectorTy()) {
+ if (!DestTy->isVectorTy())
+ return nullptr;
+ if (DestTy->getVectorNumElements() != CondTy->getVectorNumElements())
+ return nullptr;
+ }
+
+ // FIXME: This transform is restricted from changing the select between
+ // scalars and vectors to avoid backend problems caused by creating
+ // potentially illegal operations. If a fix-up is added to handle that
+ // situation, we can remove this check.
+ if (DestTy->isVectorTy() != TVal->getType()->isVectorTy())
+ return nullptr;
+
+ auto *Sel = cast<Instruction>(BitCast.getOperand(0));
+ Value *X;
+ if (match(TVal, m_OneUse(m_BitCast(m_Value(X)))) && X->getType() == DestTy &&
+ !isa<Constant>(X)) {
+ // bitcast(select(Cond, bitcast(X), Y)) --> select'(Cond, X, bitcast(Y))
+ Value *CastedVal = Builder.CreateBitCast(FVal, DestTy);
+ return SelectInst::Create(Cond, X, CastedVal, "", nullptr, Sel);
+ }
+
+ if (match(FVal, m_OneUse(m_BitCast(m_Value(X)))) && X->getType() == DestTy &&
+ !isa<Constant>(X)) {
+ // bitcast(select(Cond, Y, bitcast(X))) --> select'(Cond, bitcast(Y), X)
+ Value *CastedVal = Builder.CreateBitCast(TVal, DestTy);
+ return SelectInst::Create(Cond, CastedVal, X, "", nullptr, Sel);
+ }
+
+ return nullptr;
+}
+
+/// Check if all users of CI are StoreInsts.
+static bool hasStoreUsersOnly(CastInst &CI) {
+ for (User *U : CI.users()) {
+ if (!isa<StoreInst>(U))
+ return false;
+ }
+ return true;
+}
+
+/// This function handles the following case:
+///
+/// A -> B cast
+/// PHI
+/// B -> A cast
+///
+/// All the related PHI nodes can be replaced by new PHI nodes with type A.
+/// The uses of \p CI can be changed to the new PHI node corresponding to \p PN.
+Instruction *InstCombiner::optimizeBitCastFromPhi(CastInst &CI, PHINode *PN) {
+ // BitCast used by Store can be handled in InstCombineLoadStoreAlloca.cpp.
+ if (hasStoreUsersOnly(CI))
+ return nullptr;
+
+ Value *Src = CI.getOperand(0);
+ Type *SrcTy = Src->getType(); // Type B
+ Type *DestTy = CI.getType(); // Type A
+
+ SmallVector<PHINode *, 4> PhiWorklist;
+ SmallSetVector<PHINode *, 4> OldPhiNodes;
+
+ // Find all of the A->B casts and PHI nodes.
+  // We need to inspect all related PHI nodes, but PHIs can be cyclic, so
+  // OldPhiNodes is used to track all known PHI nodes; before adding a new
+  // PHI to PhiWorklist, it is checked against and added to OldPhiNodes first.
+ PhiWorklist.push_back(PN);
+ OldPhiNodes.insert(PN);
+ while (!PhiWorklist.empty()) {
+ auto *OldPN = PhiWorklist.pop_back_val();
+ for (Value *IncValue : OldPN->incoming_values()) {
+ if (isa<Constant>(IncValue))
+ continue;
+
+ if (auto *LI = dyn_cast<LoadInst>(IncValue)) {
+        // If there is a sequence of one or more load instructions, each loaded
+        // value is used as the address of a later load, and a bitcast is
+        // needed to change the value type; don't optimize that case. For
+        // simplicity we give up if the load address comes from another load.
+ Value *Addr = LI->getOperand(0);
+ if (Addr == &CI || isa<LoadInst>(Addr))
+ return nullptr;
+ if (LI->hasOneUse() && LI->isSimple())
+ continue;
+ // If a LoadInst has more than one use, changing the type of loaded
+ // value may create another bitcast.
+ return nullptr;
+ }
+
+ if (auto *PNode = dyn_cast<PHINode>(IncValue)) {
+ if (OldPhiNodes.insert(PNode))
+ PhiWorklist.push_back(PNode);
+ continue;
+ }
+
+ auto *BCI = dyn_cast<BitCastInst>(IncValue);
+ // We can't handle other instructions.
+ if (!BCI)
+ return nullptr;
+
+      // Verify it's an A->B cast.
+ Type *TyA = BCI->getOperand(0)->getType();
+ Type *TyB = BCI->getType();
+ if (TyA != DestTy || TyB != SrcTy)
+ return nullptr;
+ }
+ }
+
+  // For each old PHI node, create a corresponding new PHI node with type A.
+ SmallDenseMap<PHINode *, PHINode *> NewPNodes;
+ for (auto *OldPN : OldPhiNodes) {
+ Builder->SetInsertPoint(OldPN);
+ PHINode *NewPN = Builder->CreatePHI(DestTy, OldPN->getNumOperands());
+ NewPNodes[OldPN] = NewPN;
+ }
+
+ // Fill in the operands of new PHI nodes.
+ for (auto *OldPN : OldPhiNodes) {
+ PHINode *NewPN = NewPNodes[OldPN];
+ for (unsigned j = 0, e = OldPN->getNumOperands(); j != e; ++j) {
+ Value *V = OldPN->getOperand(j);
+ Value *NewV = nullptr;
+ if (auto *C = dyn_cast<Constant>(V)) {
+ NewV = ConstantExpr::getBitCast(C, DestTy);
+ } else if (auto *LI = dyn_cast<LoadInst>(V)) {
+ Builder->SetInsertPoint(LI->getNextNode());
+ NewV = Builder->CreateBitCast(LI, DestTy);
+ Worklist.Add(LI);
+ } else if (auto *BCI = dyn_cast<BitCastInst>(V)) {
+ NewV = BCI->getOperand(0);
+ } else if (auto *PrevPN = dyn_cast<PHINode>(V)) {
+ NewV = NewPNodes[PrevPN];
+ }
+ assert(NewV);
+ NewPN->addIncoming(NewV, OldPN->getIncomingBlock(j));
+ }
+ }
+
+ // If there is a store with type B, change it to type A.
+ for (User *U : PN->users()) {
+ auto *SI = dyn_cast<StoreInst>(U);
+ if (SI && SI->isSimple() && SI->getOperand(0) == PN) {
+ Builder->SetInsertPoint(SI);
+ auto *NewBC =
+ cast<BitCastInst>(Builder->CreateBitCast(NewPNodes[PN], SrcTy));
+ SI->setOperand(0, NewBC);
+ Worklist.Add(SI);
+ assert(hasStoreUsersOnly(*NewBC));
+ }
+ }
+
+ return replaceInstUsesWith(CI, NewPNodes[PN]);
+}
+
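
The cyclic-PHI handling in optimizeBitCastFromPhi is the usual visited-set-guarded worklist walk: a node is recorded as seen before it is ever pushed, so cycles terminate. A minimal standalone sketch of that traversal shape in plain C++ (generic graph nodes instead of PHINodes; all names here are illustrative):

    #include <unordered_set>
    #include <vector>

    struct Node { std::vector<Node*> Preds; }; // stand-in for a PHI's incoming values

    // Visit every node reachable through Preds exactly once, even with cycles:
    // a node is inserted into Visited before it is ever pushed onto the worklist.
    std::unordered_set<Node*> collectReachable(Node *Root) {
      std::unordered_set<Node*> Visited;
      std::vector<Node*> Worklist;
      Visited.insert(Root);
      Worklist.push_back(Root);
      while (!Worklist.empty()) {
        Node *N = Worklist.back();
        Worklist.pop_back();
        for (Node *P : N->Preds)
          if (Visited.insert(P).second) // not seen before
            Worklist.push_back(P);
      }
      return Visited;
    }
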
Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
// If the operands are integer typed then apply the integer transforms,
// otherwise just apply the common ones.
@@ -1912,9 +2112,20 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
}
}
+  // Handle the A->B->A cast pattern when there is an intervening PHI node.
+ if (PHINode *PN = dyn_cast<PHINode>(Src))
+ if (Instruction *I = optimizeBitCastFromPhi(CI, PN))
+ return I;
+
if (Instruction *I = canonicalizeBitCastExtElt(CI, *this, DL))
return I;
+ if (Instruction *I = foldBitCastBitwiseLogic(CI, *Builder))
+ return I;
+
+ if (Instruction *I = foldBitCastSelect(CI, *Builder))
+ return I;
+
if (SrcTy->isPointerTy())
return commonPointerCastTransforms(CI);
return commonCastTransforms(CI);
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 961497f..428f94b 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -35,17 +35,12 @@ using namespace PatternMatch;
// How many times is a select replaced by one of its operands?
STATISTIC(NumSel, "Number of select opts");
-// Initialization Routines
-static ConstantInt *getOne(Constant *C) {
- return ConstantInt::get(cast<IntegerType>(C->getType()), 1);
-}
-
-static ConstantInt *ExtractElement(Constant *V, Constant *Idx) {
+static ConstantInt *extractElement(Constant *V, Constant *Idx) {
return cast<ConstantInt>(ConstantExpr::getExtractElement(V, Idx));
}
-static bool HasAddOverflow(ConstantInt *Result,
+static bool hasAddOverflow(ConstantInt *Result,
ConstantInt *In1, ConstantInt *In2,
bool IsSigned) {
if (!IsSigned)
@@ -58,28 +53,28 @@ static bool HasAddOverflow(ConstantInt *Result,
/// Compute Result = In1+In2, returning true if the result overflowed for this
/// type.
-static bool AddWithOverflow(Constant *&Result, Constant *In1,
+static bool addWithOverflow(Constant *&Result, Constant *In1,
Constant *In2, bool IsSigned = false) {
Result = ConstantExpr::getAdd(In1, In2);
if (VectorType *VTy = dyn_cast<VectorType>(In1->getType())) {
for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) {
Constant *Idx = ConstantInt::get(Type::getInt32Ty(In1->getContext()), i);
- if (HasAddOverflow(ExtractElement(Result, Idx),
- ExtractElement(In1, Idx),
- ExtractElement(In2, Idx),
+ if (hasAddOverflow(extractElement(Result, Idx),
+ extractElement(In1, Idx),
+ extractElement(In2, Idx),
IsSigned))
return true;
}
return false;
}
- return HasAddOverflow(cast<ConstantInt>(Result),
+ return hasAddOverflow(cast<ConstantInt>(Result),
cast<ConstantInt>(In1), cast<ConstantInt>(In2),
IsSigned);
}
-static bool HasSubOverflow(ConstantInt *Result,
+static bool hasSubOverflow(ConstantInt *Result,
ConstantInt *In1, ConstantInt *In2,
bool IsSigned) {
if (!IsSigned)
@@ -93,23 +88,23 @@ static bool HasSubOverflow(ConstantInt *Result,
/// Compute Result = In1-In2, returning true if the result overflowed for this
/// type.
-static bool SubWithOverflow(Constant *&Result, Constant *In1,
+static bool subWithOverflow(Constant *&Result, Constant *In1,
Constant *In2, bool IsSigned = false) {
Result = ConstantExpr::getSub(In1, In2);
if (VectorType *VTy = dyn_cast<VectorType>(In1->getType())) {
for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) {
Constant *Idx = ConstantInt::get(Type::getInt32Ty(In1->getContext()), i);
- if (HasSubOverflow(ExtractElement(Result, Idx),
- ExtractElement(In1, Idx),
- ExtractElement(In2, Idx),
+ if (hasSubOverflow(extractElement(Result, Idx),
+ extractElement(In1, Idx),
+ extractElement(In2, Idx),
IsSigned))
return true;
}
return false;
}
- return HasSubOverflow(cast<ConstantInt>(Result),
+ return hasSubOverflow(cast<ConstantInt>(Result),
cast<ConstantInt>(In1), cast<ConstantInt>(In2),
IsSigned);
}
@@ -126,26 +121,26 @@ static bool isBranchOnSignBitCheck(ICmpInst &I, bool isSignBit) {
/// Given an exploded icmp instruction, return true if the comparison only
/// checks the sign bit. If it only checks the sign bit, set TrueIfSigned if the
/// result of the comparison is true when the input value is signed.
-static bool isSignBitCheck(ICmpInst::Predicate Pred, ConstantInt *RHS,
+static bool isSignBitCheck(ICmpInst::Predicate Pred, const APInt &RHS,
bool &TrueIfSigned) {
switch (Pred) {
case ICmpInst::ICMP_SLT: // True if LHS s< 0
TrueIfSigned = true;
- return RHS->isZero();
+ return RHS == 0;
case ICmpInst::ICMP_SLE: // True if LHS s<= RHS and RHS == -1
TrueIfSigned = true;
- return RHS->isAllOnesValue();
+ return RHS.isAllOnesValue();
case ICmpInst::ICMP_SGT: // True if LHS s> -1
TrueIfSigned = false;
- return RHS->isAllOnesValue();
+ return RHS.isAllOnesValue();
case ICmpInst::ICMP_UGT:
// True if LHS u> RHS and RHS == high-bit-mask - 1
TrueIfSigned = true;
- return RHS->isMaxValue(true);
+ return RHS.isMaxSignedValue();
case ICmpInst::ICMP_UGE:
// True if LHS u>= RHS and RHS == high-bit-mask (2^7, 2^15, 2^31, etc)
TrueIfSigned = true;
- return RHS->getValue().isSignBit();
+ return RHS.isSignBit();
default:
return false;
}
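
The ICMP_UGT row above treats a compare against the maximum signed value as a sign-bit test. A quick standalone C++ check of that equivalence (two's-complement conversion assumed, which C++20 guarantees):

    #include <cassert>
    #include <cstdint>

    int main() {
      // "LHS u> 0x7fffffff" is the same as "LHS s< 0": both test the sign bit.
      for (uint32_t x : {0u, 1u, 0x7fffffffu, 0x80000000u, 0xffffffffu}) {
        bool ugtMaxSigned = x > 0x7fffffffu;
        bool signBitSet = int32_t(x) < 0;
        assert(ugtMaxSigned == signBitSet);
      }
      return 0;
    }
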
@@ -154,19 +149,20 @@ static bool isSignBitCheck(ICmpInst::Predicate Pred, ConstantInt *RHS,
/// Returns true if the exploded icmp can be expressed as a signed comparison
/// to zero and updates the predicate accordingly.
/// The signedness of the comparison is preserved.
-static bool isSignTest(ICmpInst::Predicate &Pred, const ConstantInt *RHS) {
+/// TODO: Refactor with decomposeBitTestICmp()?
+static bool isSignTest(ICmpInst::Predicate &Pred, const APInt &C) {
if (!ICmpInst::isSigned(Pred))
return false;
- if (RHS->isZero())
+ if (C == 0)
return ICmpInst::isRelational(Pred);
- if (RHS->isOne()) {
+ if (C == 1) {
if (Pred == ICmpInst::ICMP_SLT) {
Pred = ICmpInst::ICMP_SLE;
return true;
}
- } else if (RHS->isAllOnesValue()) {
+ } else if (C.isAllOnesValue()) {
if (Pred == ICmpInst::ICMP_SGT) {
Pred = ICmpInst::ICMP_SGE;
return true;
@@ -176,16 +172,10 @@ static bool isSignTest(ICmpInst::Predicate &Pred, const ConstantInt *RHS) {
return false;
}
-/// Return true if the constant is of the form 1+0+. This is the same as
-/// lowones(~X).
-static bool isHighOnes(const ConstantInt *CI) {
- return (~CI->getValue() + 1).isPowerOf2();
-}
-
/// Given a signed integer type and a set of known zero and one bits, compute
/// the maximum and minimum values that could have the specified known zero and
/// known one bits, returning them in Min/Max.
-static void ComputeSignedMinMaxValuesFromKnownBits(const APInt &KnownZero,
+static void computeSignedMinMaxValuesFromKnownBits(const APInt &KnownZero,
const APInt &KnownOne,
APInt &Min, APInt &Max) {
assert(KnownZero.getBitWidth() == KnownOne.getBitWidth() &&
@@ -208,7 +198,7 @@ static void ComputeSignedMinMaxValuesFromKnownBits(const APInt &KnownZero,
/// Given an unsigned integer type and a set of known zero and one bits, compute
/// the maximum and minimum values that could have the specified known zero and
/// known one bits, returning them in Min/Max.
-static void ComputeUnsignedMinMaxValuesFromKnownBits(const APInt &KnownZero,
+static void computeUnsignedMinMaxValuesFromKnownBits(const APInt &KnownZero,
const APInt &KnownOne,
APInt &Min, APInt &Max) {
assert(KnownZero.getBitWidth() == KnownOne.getBitWidth() &&
@@ -231,9 +221,10 @@ static void ComputeUnsignedMinMaxValuesFromKnownBits(const APInt &KnownZero,
///
/// If AndCst is non-null, then the loaded value is masked with that constant
/// before doing the comparison. This handles cases like "A[i]&4 == 0".
-Instruction *InstCombiner::
-FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
- CmpInst &ICI, ConstantInt *AndCst) {
+Instruction *InstCombiner::foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
+ GlobalVariable *GV,
+ CmpInst &ICI,
+ ConstantInt *AndCst) {
Constant *Init = GV->getInitializer();
if (!isa<ConstantArray>(Init) && !isa<ConstantDataArray>(Init))
return nullptr;
@@ -319,7 +310,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
// Find out if the comparison would be true or false for the i'th element.
Constant *C = ConstantFoldCompareInstOperands(ICI.getPredicate(), Elt,
- CompareRHS, DL, TLI);
+ CompareRHS, DL, &TLI);
// If the result is undef for this element, ignore it.
if (isa<UndefValue>(C)) {
// Extend range state machines to cover this element in case there is an
@@ -509,7 +500,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
///
/// If we can't emit an optimized form for this expression, this returns null.
///
-static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC,
+static Value *evaluateGEPOffsetExpression(User *GEP, InstCombiner &IC,
const DataLayout &DL) {
gep_type_iterator GTI = gep_type_begin(GEP);
@@ -526,7 +517,7 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC,
if (CI->isZero()) continue;
// Handle a struct index, which adds its field offset to the pointer.
- if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+ if (StructType *STy = GTI.getStructTypeOrNull()) {
Offset += DL.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
} else {
uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType());
@@ -556,7 +547,7 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC,
if (CI->isZero()) continue;
// Handle a struct index, which adds its field offset to the pointer.
- if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+ if (StructType *STy = GTI.getStructTypeOrNull()) {
Offset += DL.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
} else {
uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType());
@@ -893,6 +884,10 @@ static Instruction *transformToIndexedCompare(GEPOperator *GEPLHS, Value *RHS,
if (!GEPLHS->hasAllConstantIndices())
return nullptr;
+ // Make sure the pointers have the same type.
+ if (GEPLHS->getType() != RHS->getType())
+ return nullptr;
+
Value *PtrBase, *Index;
std::tie(PtrBase, Index) = getAsConstantIndexedAddress(GEPLHS, DL);
@@ -919,7 +914,7 @@ static Instruction *transformToIndexedCompare(GEPOperator *GEPLHS, Value *RHS,
/// Fold comparisons between a GEP instruction and something else. At this point
/// we know that the GEP is on the LHS of the comparison.
-Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
+Instruction *InstCombiner::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
ICmpInst::Predicate Cond,
Instruction &I) {
// Don't transform signed compares of GEPs into index compares. Even if the
@@ -941,7 +936,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
// This transformation (ignoring the base and scales) is valid because we
// know pointers can't overflow since the gep is inbounds. See if we can
// output an optimized form.
- Value *Offset = EvaluateGEPOffsetExpression(GEPLHS, *this, DL);
+ Value *Offset = evaluateGEPOffsetExpression(GEPLHS, *this, DL);
// If not, synthesize the offset the hard way.
if (!Offset)
@@ -1003,12 +998,12 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
// If one of the GEPs has all zero indices, recurse.
if (GEPLHS->hasAllZeroIndices())
- return FoldGEPICmp(GEPRHS, GEPLHS->getOperand(0),
+ return foldGEPICmp(GEPRHS, GEPLHS->getOperand(0),
ICmpInst::getSwappedPredicate(Cond), I);
// If the other GEP has all zero indices, recurse.
if (GEPRHS->hasAllZeroIndices())
- return FoldGEPICmp(GEPLHS, GEPRHS->getOperand(0), Cond, I);
+ return foldGEPICmp(GEPLHS, GEPRHS->getOperand(0), Cond, I);
bool GEPsInBounds = GEPLHS->isInBounds() && GEPRHS->isInBounds();
if (GEPLHS->getNumOperands() == GEPRHS->getNumOperands()) {
@@ -1056,8 +1051,9 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
return transformToIndexedCompare(GEPLHS, RHS, Cond, DL);
}
-Instruction *InstCombiner::FoldAllocaCmp(ICmpInst &ICI, AllocaInst *Alloca,
- Value *Other) {
+Instruction *InstCombiner::foldAllocaCmp(ICmpInst &ICI,
+ const AllocaInst *Alloca,
+ const Value *Other) {
assert(ICI.isEquality() && "Cannot fold non-equality comparison.");
// It would be tempting to fold away comparisons between allocas and any
@@ -1076,8 +1072,8 @@ Instruction *InstCombiner::FoldAllocaCmp(ICmpInst &ICI, AllocaInst *Alloca,
unsigned MaxIter = 32; // Break cycles and bound to constant-time.
- SmallVector<Use *, 32> Worklist;
- for (Use &U : Alloca->uses()) {
+ SmallVector<const Use *, 32> Worklist;
+ for (const Use &U : Alloca->uses()) {
if (Worklist.size() >= MaxIter)
return nullptr;
Worklist.push_back(&U);
@@ -1086,8 +1082,8 @@ Instruction *InstCombiner::FoldAllocaCmp(ICmpInst &ICI, AllocaInst *Alloca,
unsigned NumCmps = 0;
while (!Worklist.empty()) {
assert(Worklist.size() <= MaxIter);
- Use *U = Worklist.pop_back_val();
- Value *V = U->getUser();
+ const Use *U = Worklist.pop_back_val();
+ const Value *V = U->getUser();
--MaxIter;
if (isa<BitCastInst>(V) || isa<GetElementPtrInst>(V) || isa<PHINode>(V) ||
@@ -1096,7 +1092,7 @@ Instruction *InstCombiner::FoldAllocaCmp(ICmpInst &ICI, AllocaInst *Alloca,
} else if (isa<LoadInst>(V)) {
// Loading from the pointer doesn't escape it.
continue;
- } else if (auto *SI = dyn_cast<StoreInst>(V)) {
+ } else if (const auto *SI = dyn_cast<StoreInst>(V)) {
// Storing *to* the pointer is fine, but storing the pointer escapes it.
if (SI->getValueOperand() == U->get())
return nullptr;
@@ -1105,7 +1101,7 @@ Instruction *InstCombiner::FoldAllocaCmp(ICmpInst &ICI, AllocaInst *Alloca,
if (NumCmps++)
return nullptr; // Found more than one cmp.
continue;
- } else if (auto *Intrin = dyn_cast<IntrinsicInst>(V)) {
+ } else if (const auto *Intrin = dyn_cast<IntrinsicInst>(V)) {
switch (Intrin->getIntrinsicID()) {
// These intrinsics don't escape or compare the pointer. Memset is safe
// because we don't allow ptrtoint. Memcpy and memmove are safe because
@@ -1120,7 +1116,7 @@ Instruction *InstCombiner::FoldAllocaCmp(ICmpInst &ICI, AllocaInst *Alloca,
} else {
return nullptr;
}
- for (Use &U : V->uses()) {
+ for (const Use &U : V->uses()) {
if (Worklist.size() >= MaxIter)
return nullptr;
Worklist.push_back(&U);
@@ -1134,9 +1130,9 @@ Instruction *InstCombiner::FoldAllocaCmp(ICmpInst &ICI, AllocaInst *Alloca,
}
/// Fold "icmp pred (X+CI), X".
-Instruction *InstCombiner::FoldICmpAddOpCst(Instruction &ICI,
- Value *X, ConstantInt *CI,
- ICmpInst::Predicate Pred) {
+Instruction *InstCombiner::foldICmpAddOpConst(Instruction &ICI,
+ Value *X, ConstantInt *CI,
+ ICmpInst::Predicate Pred) {
// From this point on, we know that (X+C <= X) --> (X+C < X) because C != 0,
// so the values can never be equal. Similarly for all other "or equals"
// operators.
@@ -1181,52 +1177,995 @@ Instruction *InstCombiner::FoldICmpAddOpCst(Instruction &ICI,
return new ICmpInst(ICmpInst::ICMP_SLT, X, ConstantExpr::getSub(SMax, C));
}
-/// Fold "icmp pred, ([su]div X, DivRHS), CmpRHS" where DivRHS and CmpRHS are
-/// both known to be integer constants.
-Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,
- ConstantInt *DivRHS) {
- ConstantInt *CmpRHS = cast<ConstantInt>(ICI.getOperand(1));
- const APInt &CmpRHSV = CmpRHS->getValue();
+/// Handle "(icmp eq/ne (ashr/lshr AP2, A), AP1)" ->
+/// (icmp eq/ne A, Log2(AP2/AP1)) ->
+/// (icmp eq/ne A, Log2(AP2) - Log2(AP1)).
+Instruction *InstCombiner::foldICmpShrConstConst(ICmpInst &I, Value *A,
+ const APInt &AP1,
+ const APInt &AP2) {
+ assert(I.isEquality() && "Cannot fold icmp gt/lt");
+
+ auto getICmp = [&I](CmpInst::Predicate Pred, Value *LHS, Value *RHS) {
+ if (I.getPredicate() == I.ICMP_NE)
+ Pred = CmpInst::getInversePredicate(Pred);
+ return new ICmpInst(Pred, LHS, RHS);
+ };
+
+ // Don't bother doing any work for cases which InstSimplify handles.
+ if (AP2 == 0)
+ return nullptr;
+
+ bool IsAShr = isa<AShrOperator>(I.getOperand(0));
+ if (IsAShr) {
+ if (AP2.isAllOnesValue())
+ return nullptr;
+ if (AP2.isNegative() != AP1.isNegative())
+ return nullptr;
+ if (AP2.sgt(AP1))
+ return nullptr;
+ }
+
+ if (!AP1)
+ // 'A' must be large enough to shift out the highest set bit.
+ return getICmp(I.ICMP_UGT, A,
+ ConstantInt::get(A->getType(), AP2.logBase2()));
+
+ if (AP1 == AP2)
+ return getICmp(I.ICMP_EQ, A, ConstantInt::getNullValue(A->getType()));
+
+ int Shift;
+ if (IsAShr && AP1.isNegative())
+ Shift = AP1.countLeadingOnes() - AP2.countLeadingOnes();
+ else
+ Shift = AP1.countLeadingZeros() - AP2.countLeadingZeros();
+
+ if (Shift > 0) {
+ if (IsAShr && AP1 == AP2.ashr(Shift)) {
+ // There are multiple solutions if we are comparing against -1 and the LHS
+ // of the ashr is not a power of two.
+ if (AP1.isAllOnesValue() && !AP2.isPowerOf2())
+ return getICmp(I.ICMP_UGE, A, ConstantInt::get(A->getType(), Shift));
+ return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift));
+ } else if (AP1 == AP2.lshr(Shift)) {
+ return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift));
+ }
+ }
+
+ // Shifting const2 will never be equal to const1.
+ // FIXME: This should always be handled by InstSimplify?
+ auto *TorF = ConstantInt::get(I.getType(), I.getPredicate() == I.ICMP_NE);
+ return replaceInstUsesWith(I, TorF);
+}
+
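
The shift amount recovered above comes from the difference of leading-zero (or, for negative ashr constants, leading-one) counts. A small standalone check of the lshr case in plain C++ (std::countl_zero from C++20 <bit>; the constants are arbitrary examples, not taken from the patch):

    #include <bit>
    #include <cassert>
    #include <cstdint>

    int main() {
      // (icmp eq (lshr AP2, A), AP1) with AP1 != 0 has the single solution
      // A = countl_zero(AP1) - countl_zero(AP2), provided AP2 >> A really is AP1.
      const uint32_t AP2 = 0x50, AP1 = 0x5; // 0x50 >> 4 == 0x5
      const int Shift = std::countl_zero(AP1) - std::countl_zero(AP2);
      assert(Shift == 4 && (AP2 >> Shift) == AP1);
      return 0;
    }
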
+/// Handle "(icmp eq/ne (shl AP2, A), AP1)" ->
+/// (icmp eq/ne A, TrailingZeros(AP1) - TrailingZeros(AP2)).
+Instruction *InstCombiner::foldICmpShlConstConst(ICmpInst &I, Value *A,
+ const APInt &AP1,
+ const APInt &AP2) {
+ assert(I.isEquality() && "Cannot fold icmp gt/lt");
+
+ auto getICmp = [&I](CmpInst::Predicate Pred, Value *LHS, Value *RHS) {
+ if (I.getPredicate() == I.ICMP_NE)
+ Pred = CmpInst::getInversePredicate(Pred);
+ return new ICmpInst(Pred, LHS, RHS);
+ };
+
+ // Don't bother doing any work for cases which InstSimplify handles.
+ if (AP2 == 0)
+ return nullptr;
+
+ unsigned AP2TrailingZeros = AP2.countTrailingZeros();
+
+ if (!AP1 && AP2TrailingZeros != 0)
+ return getICmp(
+ I.ICMP_UGE, A,
+ ConstantInt::get(A->getType(), AP2.getBitWidth() - AP2TrailingZeros));
+
+ if (AP1 == AP2)
+ return getICmp(I.ICMP_EQ, A, ConstantInt::getNullValue(A->getType()));
+
+ // Get the distance between the lowest bits that are set.
+ int Shift = AP1.countTrailingZeros() - AP2TrailingZeros;
+
+ if (Shift > 0 && AP2.shl(Shift) == AP1)
+ return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift));
+
+ // Shifting const2 will never be equal to const1.
+ // FIXME: This should always be handled by InstSimplify?
+ auto *TorF = ConstantInt::get(I.getType(), I.getPredicate() == I.ICMP_NE);
+ return replaceInstUsesWith(I, TorF);
+}
+
+/// The caller has matched a pattern of the form:
+/// I = icmp ugt (add (add A, B), CI2), CI1
+/// If this is of the form:
+/// sum = a + b
+/// if (sum+128 >u 255)
+/// Then replace it with llvm.sadd.with.overflow.i8.
+///
+static Instruction *processUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B,
+ ConstantInt *CI2, ConstantInt *CI1,
+ InstCombiner &IC) {
+ // The transformation we're trying to do here is to transform this into an
+ // llvm.sadd.with.overflow. To do this, we have to replace the original add
+ // with a narrower add, and discard the add-with-constant that is part of the
+ // range check (if we can't eliminate it, this isn't profitable).
+
+  // In order to eliminate the add-with-constant, the compare must be its only
+  // use.
+ Instruction *AddWithCst = cast<Instruction>(I.getOperand(0));
+ if (!AddWithCst->hasOneUse())
+ return nullptr;
+
+ // If CI2 is 2^7, 2^15, 2^31, then it might be an sadd.with.overflow.
+ if (!CI2->getValue().isPowerOf2())
+ return nullptr;
+ unsigned NewWidth = CI2->getValue().countTrailingZeros();
+ if (NewWidth != 7 && NewWidth != 15 && NewWidth != 31)
+ return nullptr;
+
+ // The width of the new add formed is 1 more than the bias.
+ ++NewWidth;
+
+ // Check to see that CI1 is an all-ones value with NewWidth bits.
+ if (CI1->getBitWidth() == NewWidth ||
+ CI1->getValue() != APInt::getLowBitsSet(CI1->getBitWidth(), NewWidth))
+ return nullptr;
+
+ // This is only really a signed overflow check if the inputs have been
+ // sign-extended; check for that condition. For example, if CI2 is 2^31 and
+ // the operands of the add are 64 bits wide, we need at least 33 sign bits.
+ unsigned NeededSignBits = CI1->getBitWidth() - NewWidth + 1;
+ if (IC.ComputeNumSignBits(A, 0, &I) < NeededSignBits ||
+ IC.ComputeNumSignBits(B, 0, &I) < NeededSignBits)
+ return nullptr;
+
+ // In order to replace the original add with a narrower
+ // llvm.sadd.with.overflow, the only uses allowed are the add-with-constant
+ // and truncates that discard the high bits of the add. Verify that this is
+ // the case.
+ Instruction *OrigAdd = cast<Instruction>(AddWithCst->getOperand(0));
+ for (User *U : OrigAdd->users()) {
+ if (U == AddWithCst)
+ continue;
+
+ // Only accept truncates for now. We would really like a nice recursive
+ // predicate like SimplifyDemandedBits, but one that walks down the use-def
+ // chain to see which bits of a value are actually demanded. If the
+ // original add had another add which was then immediately truncated, we
+ // could still do the transformation.
+ TruncInst *TI = dyn_cast<TruncInst>(U);
+ if (!TI || TI->getType()->getPrimitiveSizeInBits() > NewWidth)
+ return nullptr;
+ }
+
+ // If the pattern matches, truncate the inputs to the narrower type and
+ // use the sadd_with_overflow intrinsic to efficiently compute both the
+ // result and the overflow bit.
+ Type *NewType = IntegerType::get(OrigAdd->getContext(), NewWidth);
+ Value *F = Intrinsic::getDeclaration(I.getModule(),
+ Intrinsic::sadd_with_overflow, NewType);
+
+ InstCombiner::BuilderTy *Builder = IC.Builder;
+
+ // Put the new code above the original add, in case there are any uses of the
+ // add between the add and the compare.
+ Builder->SetInsertPoint(OrigAdd);
+
+ Value *TruncA = Builder->CreateTrunc(A, NewType, A->getName() + ".trunc");
+ Value *TruncB = Builder->CreateTrunc(B, NewType, B->getName() + ".trunc");
+ CallInst *Call = Builder->CreateCall(F, {TruncA, TruncB}, "sadd");
+ Value *Add = Builder->CreateExtractValue(Call, 0, "sadd.result");
+ Value *ZExt = Builder->CreateZExt(Add, OrigAdd->getType());
+
+ // The inner add was the result of the narrow add, zero extended to the
+ // wider type. Replace it with the result computed by the intrinsic.
+ IC.replaceInstUsesWith(*OrigAdd, ZExt);
+
+ // The original icmp gets replaced with the overflow value.
+ return ExtractValueInst::Create(Call, 1, "sadd.overflow");
+}
+
+// Fold icmp Pred X, C.
+Instruction *InstCombiner::foldICmpWithConstant(ICmpInst &Cmp) {
+ CmpInst::Predicate Pred = Cmp.getPredicate();
+ Value *X = Cmp.getOperand(0);
+
+ const APInt *C;
+ if (!match(Cmp.getOperand(1), m_APInt(C)))
+ return nullptr;
+
+ Value *A = nullptr, *B = nullptr;
+
+ // Match the following pattern, which is a common idiom when writing
+ // overflow-safe integer arithmetic functions. The source performs an addition
+ // in a wider type and explicitly checks for overflow using comparisons against
+ // INT_MIN and INT_MAX. Simplify by using the sadd_with_overflow intrinsic.
+ //
+ // TODO: This could probably be generalized to handle other overflow-safe
+ // operations if we worked out the formulas to compute the appropriate magic
+ // constants.
+ //
+ // sum = a + b
+ // if (sum+128 >u 255) ... -> llvm.sadd.with.overflow.i8
+ {
+ ConstantInt *CI2; // I = icmp ugt (add (add A, B), CI2), CI
+ if (Pred == ICmpInst::ICMP_UGT &&
+ match(X, m_Add(m_Add(m_Value(A), m_Value(B)), m_ConstantInt(CI2))))
+ if (Instruction *Res = processUGT_ADDCST_ADD(
+ Cmp, A, B, CI2, cast<ConstantInt>(Cmp.getOperand(1)), *this))
+ return Res;
+ }
+
+ // (icmp sgt smin(PosA, B) 0) -> (icmp sgt B 0)
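+ // (smin is positive only when both operands are positive, so a known-positive
+ // operand can be dropped from the test.)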
+ if (*C == 0 && Pred == ICmpInst::ICMP_SGT) {
+ SelectPatternResult SPR = matchSelectPattern(X, A, B);
+ if (SPR.Flavor == SPF_SMIN) {
+ if (isKnownPositive(A, DL))
+ return new ICmpInst(Pred, B, Cmp.getOperand(1));
+ if (isKnownPositive(B, DL))
+ return new ICmpInst(Pred, A, Cmp.getOperand(1));
+ }
+ }
+
+ // FIXME: Use m_APInt to allow folds for splat constants.
+ ConstantInt *CI = dyn_cast<ConstantInt>(Cmp.getOperand(1));
+ if (!CI)
+ return nullptr;
+
+ // Canonicalize icmp instructions based on dominating conditions.
+ BasicBlock *Parent = Cmp.getParent();
+ BasicBlock *Dom = Parent->getSinglePredecessor();
+ auto *BI = Dom ? dyn_cast<BranchInst>(Dom->getTerminator()) : nullptr;
+ ICmpInst::Predicate Pred2;
+ BasicBlock *TrueBB, *FalseBB;
+ ConstantInt *CI2;
+ if (BI && match(BI, m_Br(m_ICmp(Pred2, m_Specific(X), m_ConstantInt(CI2)),
+ TrueBB, FalseBB)) &&
+ TrueBB != FalseBB) {
+ ConstantRange CR =
+ ConstantRange::makeAllowedICmpRegion(Pred, CI->getValue());
+ ConstantRange DominatingCR =
+ (Parent == TrueBB)
+ ? ConstantRange::makeExactICmpRegion(Pred2, CI2->getValue())
+ : ConstantRange::makeExactICmpRegion(
+ CmpInst::getInversePredicate(Pred2), CI2->getValue());
+ ConstantRange Intersection = DominatingCR.intersectWith(CR);
+ ConstantRange Difference = DominatingCR.difference(CR);
+ if (Intersection.isEmptySet())
+ return replaceInstUsesWith(Cmp, Builder->getFalse());
+ if (Difference.isEmptySet())
+ return replaceInstUsesWith(Cmp, Builder->getTrue());
+
+ // If this is a normal comparison, it demands all bits. If it is a sign
+ // bit comparison, it only demands the sign bit.
+ bool UnusedBit;
+ bool IsSignBit = isSignBitCheck(Pred, CI->getValue(), UnusedBit);
+
+ // Canonicalizing a sign bit comparison that gets used in a branch
+ // pessimizes codegen by generating a branch-on-zero instruction instead
+ // of a test-and-branch, so we avoid canonicalizing in such situations
+ // because a test-and-branch instruction has better branch displacement
+ // than a compare-and-branch instruction.
+ if (!isBranchOnSignBitCheck(Cmp, IsSignBit) && !Cmp.isEquality()) {
+ if (auto *AI = Intersection.getSingleElement())
+ return new ICmpInst(ICmpInst::ICMP_EQ, X, Builder->getInt(*AI));
+ if (auto *AD = Difference.getSingleElement())
+ return new ICmpInst(ICmpInst::ICMP_NE, X, Builder->getInt(*AD));
+ }
+ }
+
+ return nullptr;
+}
+
+/// Fold icmp (trunc X, Y), C.
+Instruction *InstCombiner::foldICmpTruncConstant(ICmpInst &Cmp,
+ Instruction *Trunc,
+ const APInt *C) {
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ Value *X = Trunc->getOperand(0);
+ if (*C == 1 && C->getBitWidth() > 1) {
+ // icmp slt trunc(signum(V)) 1 --> icmp slt V, 1
+ Value *V = nullptr;
+ if (Pred == ICmpInst::ICMP_SLT && match(X, m_Signum(m_Value(V))))
+ return new ICmpInst(ICmpInst::ICMP_SLT, V,
+ ConstantInt::get(V->getType(), 1));
+ }
+
+ if (Cmp.isEquality() && Trunc->hasOneUse()) {
+ // Simplify icmp eq (trunc x to i8), 42 -> icmp eq x, 42|highbits if all
+ // of the high bits truncated out of x are known.
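+ // E.g. if the 24 bits truncated from an i32 x are all known to be one,
+ // icmp eq (trunc x to i8), 42 becomes icmp eq x, 0xFFFFFF2A.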
+ unsigned DstBits = Trunc->getType()->getScalarSizeInBits(),
+ SrcBits = X->getType()->getScalarSizeInBits();
+ APInt KnownZero(SrcBits, 0), KnownOne(SrcBits, 0);
+ computeKnownBits(X, KnownZero, KnownOne, 0, &Cmp);
+
+ // If all the high bits are known, we can do this xform.
+ if ((KnownZero | KnownOne).countLeadingOnes() >= SrcBits - DstBits) {
+ // Pull in the high bits from known-ones set.
+ APInt NewRHS = C->zext(SrcBits);
+ NewRHS |= KnownOne & APInt::getHighBitsSet(SrcBits, SrcBits - DstBits);
+ return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), NewRHS));
+ }
+ }
+
+ return nullptr;
+}
+
+/// Fold icmp (xor X, Y), C.
+Instruction *InstCombiner::foldICmpXorConstant(ICmpInst &Cmp,
+ BinaryOperator *Xor,
+ const APInt *C) {
+ Value *X = Xor->getOperand(0);
+ Value *Y = Xor->getOperand(1);
+ const APInt *XorC;
+ if (!match(Y, m_APInt(XorC)))
+ return nullptr;
+
+ // If this is a comparison that tests the sign bit, (X < 0) or (X > -1),
+ // fold the xor.
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ if ((Pred == ICmpInst::ICMP_SLT && *C == 0) ||
+ (Pred == ICmpInst::ICMP_SGT && C->isAllOnesValue())) {
+
+ // If the sign bit of XorC is not set, there is no change to
+ // the operation, just stop using the Xor.
+ if (!XorC->isNegative()) {
+ Cmp.setOperand(0, X);
+ Worklist.Add(Xor);
+ return &Cmp;
+ }
+
+ // Was the old condition true if the operand is positive?
+ bool isTrueIfPositive = Pred == ICmpInst::ICMP_SGT;
+
+ // If so, the new one isn't.
+ isTrueIfPositive ^= true;
+
+ Constant *CmpConstant = cast<Constant>(Cmp.getOperand(1));
+ if (isTrueIfPositive)
+ return new ICmpInst(ICmpInst::ICMP_SGT, X, SubOne(CmpConstant));
+ else
+ return new ICmpInst(ICmpInst::ICMP_SLT, X, AddOne(CmpConstant));
+ }
+
+ if (Xor->hasOneUse()) {
+ // (icmp u/s (xor X SignBit), C) -> (icmp s/u X, (xor C SignBit))
+ if (!Cmp.isEquality() && XorC->isSignBit()) {
+ Pred = Cmp.isSigned() ? Cmp.getUnsignedPredicate()
+ : Cmp.getSignedPredicate();
+ return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), *C ^ *XorC));
+ }
+
+ // (icmp u/s (xor X ~SignBit), C) -> (icmp s/u X, (xor C ~SignBit))
+ if (!Cmp.isEquality() && XorC->isMaxSignedValue()) {
+ Pred = Cmp.isSigned() ? Cmp.getUnsignedPredicate()
+ : Cmp.getSignedPredicate();
+ Pred = Cmp.getSwappedPredicate(Pred);
+ return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), *C ^ *XorC));
+ }
+ }
+
+ // (icmp ugt (xor X, C), ~C) -> (icmp ult X, C)
+ // iff -C is a power of 2
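+ // E.g. on i8 with C = 0xF8: (X ^ 0xF8) >u 7 --> X <u 0xF8, since X ^ 0xF8
+ // is <=u 7 exactly when X is in [0xF8, 0xFF].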
+ if (Pred == ICmpInst::ICMP_UGT && *XorC == ~(*C) && (*C + 1).isPowerOf2())
+ return new ICmpInst(ICmpInst::ICMP_ULT, X, Y);
+
+ // (icmp ult (xor X, C), -C) -> (icmp uge X, C)
+ // iff -C is a power of 2
+ if (Pred == ICmpInst::ICMP_ULT && *XorC == -(*C) && C->isPowerOf2())
+ return new ICmpInst(ICmpInst::ICMP_UGE, X, Y);
+
+ return nullptr;
+}
+
+/// Fold icmp (and (sh X, Y), C2), C1.
+Instruction *InstCombiner::foldICmpAndShift(ICmpInst &Cmp, BinaryOperator *And,
+ const APInt *C1, const APInt *C2) {
+ BinaryOperator *Shift = dyn_cast<BinaryOperator>(And->getOperand(0));
+ if (!Shift || !Shift->isShift())
+ return nullptr;
+
+ // If this is: (X >> C3) & C2 != C1 (where any shift and any compare could
+ // exist), turn it into (X & (C2 << C3)) != (C1 << C3). This happens a LOT in
+ // code produced by the clang front-end, for bitfield access.
+ // This seemingly simple opportunity to fold away a shift turns out to be
+ // rather complicated. See PR17827 for details.
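+ // E.g. the bitfield test ((X >> 3) & 1) == 1 becomes (X & 8) == 8.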
+ unsigned ShiftOpcode = Shift->getOpcode();
+ bool IsShl = ShiftOpcode == Instruction::Shl;
+ const APInt *C3;
+ if (match(Shift->getOperand(1), m_APInt(C3))) {
+ bool CanFold = false;
+ if (ShiftOpcode == Instruction::AShr) {
+ // There may be some constraints that make this possible, but nothing
+ // simple has been discovered yet.
+ CanFold = false;
+ } else if (ShiftOpcode == Instruction::Shl) {
+ // For a left shift, we can fold if the comparison is not signed. We can
+ // also fold a signed comparison if the mask value and comparison value
+ // are not negative. These constraints may not be obvious, but we can
+ // prove that they are correct using an SMT solver.
+ if (!Cmp.isSigned() || (!C2->isNegative() && !C1->isNegative()))
+ CanFold = true;
+ } else if (ShiftOpcode == Instruction::LShr) {
+ // For a logical right shift, we can fold if the comparison is not signed.
+ // We can also fold a signed comparison if the shifted mask value and the
+ // shifted comparison value are not negative. These constraints may not be
+ // obvious, but we can prove that they are correct using an SMT solver.
+ if (!Cmp.isSigned() ||
+ (!C2->shl(*C3).isNegative() && !C1->shl(*C3).isNegative()))
+ CanFold = true;
+ }
+
+ if (CanFold) {
+ APInt NewCst = IsShl ? C1->lshr(*C3) : C1->shl(*C3);
+ APInt SameAsC1 = IsShl ? NewCst.shl(*C3) : NewCst.lshr(*C3);
+ // Check to see if we are shifting out any of the bits being compared.
+ if (SameAsC1 != *C1) {
+ // If we shifted bits out, the fold is not going to work out. As a
+ // special case, check to see if this means that the result is always
+ // true or false now.
+ if (Cmp.getPredicate() == ICmpInst::ICMP_EQ)
+ return replaceInstUsesWith(Cmp, ConstantInt::getFalse(Cmp.getType()));
+ if (Cmp.getPredicate() == ICmpInst::ICMP_NE)
+ return replaceInstUsesWith(Cmp, ConstantInt::getTrue(Cmp.getType()));
+ } else {
+ Cmp.setOperand(1, ConstantInt::get(And->getType(), NewCst));
+ APInt NewAndCst = IsShl ? C2->lshr(*C3) : C2->shl(*C3);
+ And->setOperand(1, ConstantInt::get(And->getType(), NewAndCst));
+ And->setOperand(0, Shift->getOperand(0));
+ Worklist.Add(Shift); // Shift is dead.
+ return &Cmp;
+ }
+ }
+ }
+
+ // Turn ((X >> Y) & C2) == 0 into (X & (C2 << Y)) == 0. The latter is
+ // preferable because it allows the C2 << Y expression to be hoisted out of a
+ // loop if Y is invariant and X is not.
+ if (Shift->hasOneUse() && *C1 == 0 && Cmp.isEquality() &&
+ !Shift->isArithmeticShift() && !isa<Constant>(Shift->getOperand(0))) {
+ // Compute C2 << Y.
+ Value *NewShift =
+ IsShl ? Builder->CreateLShr(And->getOperand(1), Shift->getOperand(1))
+ : Builder->CreateShl(And->getOperand(1), Shift->getOperand(1));
+
+ // Compute X & (C2 << Y).
+ Value *NewAnd = Builder->CreateAnd(Shift->getOperand(0), NewShift);
+ Cmp.setOperand(0, NewAnd);
+ return &Cmp;
+ }
+
+ return nullptr;
+}
+
+/// Fold icmp (and X, C2), C1.
+Instruction *InstCombiner::foldICmpAndConstConst(ICmpInst &Cmp,
+ BinaryOperator *And,
+ const APInt *C1) {
+ const APInt *C2;
+ if (!match(And->getOperand(1), m_APInt(C2)))
+ return nullptr;
+
+ if (!And->hasOneUse() || !And->getOperand(0)->hasOneUse())
+ return nullptr;
+
+ // If the LHS is an 'and' of a truncate and we can widen the and/compare to
+ // the input width without changing the value produced, eliminate the cast:
+ //
+ // icmp (and (trunc W), C2), C1 -> icmp (and W, C2'), C1'
+ //
+ // We can do this transformation if the constants do not have their sign bits
+ // set or if it is an equality comparison. Extending a relational comparison
+ // when we're checking the sign bit would not work.
+ Value *W;
+ if (match(And->getOperand(0), m_Trunc(m_Value(W))) &&
+ (Cmp.isEquality() || (!C1->isNegative() && !C2->isNegative()))) {
+ // TODO: Is this a good transform for vectors? Wider types may reduce
+ // throughput. Should this transform be limited (even for scalars) by using
+ // ShouldChangeType()?
+ if (!Cmp.getType()->isVectorTy()) {
+ Type *WideType = W->getType();
+ unsigned WideScalarBits = WideType->getScalarSizeInBits();
+ Constant *ZextC1 = ConstantInt::get(WideType, C1->zext(WideScalarBits));
+ Constant *ZextC2 = ConstantInt::get(WideType, C2->zext(WideScalarBits));
+ Value *NewAnd = Builder->CreateAnd(W, ZextC2, And->getName());
+ return new ICmpInst(Cmp.getPredicate(), NewAnd, ZextC1);
+ }
+ }
+
+ if (Instruction *I = foldICmpAndShift(Cmp, And, C1, C2))
+ return I;
+
+ // (icmp pred (and (or (lshr A, B), A), 1), 0) -->
+ // (icmp pred (and A, (or (shl 1, B), 1)), 0)
+ //
+ // iff pred isn't signed
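+ // E.g. (((A >> 5) | A) & 1) != 0 becomes (A & 0x21) != 0.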
+ if (!Cmp.isSigned() && *C1 == 0 && match(And->getOperand(1), m_One())) {
+ Constant *One = cast<Constant>(And->getOperand(1));
+ Value *Or = And->getOperand(0);
+ Value *A, *B, *LShr;
+ if (match(Or, m_Or(m_Value(LShr), m_Value(A))) &&
+ match(LShr, m_LShr(m_Specific(A), m_Value(B)))) {
+ unsigned UsesRemoved = 0;
+ if (And->hasOneUse())
+ ++UsesRemoved;
+ if (Or->hasOneUse())
+ ++UsesRemoved;
+ if (LShr->hasOneUse())
+ ++UsesRemoved;
+
+ // Compute A & ((1 << B) | 1)
+ Value *NewOr = nullptr;
+ if (auto *C = dyn_cast<Constant>(B)) {
+ if (UsesRemoved >= 1)
+ NewOr = ConstantExpr::getOr(ConstantExpr::getNUWShl(One, C), One);
+ } else {
+ if (UsesRemoved >= 3)
+ NewOr = Builder->CreateOr(Builder->CreateShl(One, B, LShr->getName(),
+ /*HasNUW=*/true),
+ One, Or->getName());
+ }
+ if (NewOr) {
+ Value *NewAnd = Builder->CreateAnd(A, NewOr, And->getName());
+ Cmp.setOperand(0, NewAnd);
+ return &Cmp;
+ }
+ }
+ }
+
+ // (X & C2) > C1 --> (X & C2) != 0, if any bit set in (X & C2) will produce a
+ // result greater than C1.
+ unsigned NumTZ = C2->countTrailingZeros();
+ if (Cmp.getPredicate() == ICmpInst::ICMP_UGT && NumTZ < C2->getBitWidth() &&
+ APInt::getOneBitSet(C2->getBitWidth(), NumTZ).ugt(*C1)) {
+ Constant *Zero = Constant::getNullValue(And->getType());
+ return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
+ }
+
+ return nullptr;
+}
+
+/// Fold icmp (and X, Y), C.
+Instruction *InstCombiner::foldICmpAndConstant(ICmpInst &Cmp,
+ BinaryOperator *And,
+ const APInt *C) {
+ if (Instruction *I = foldICmpAndConstConst(Cmp, And, C))
+ return I;
+
+ // TODO: These all require that Y is constant too, so refactor with the above.
+
+ // Try to optimize things like "A[i] & 42 == 0" to index computations.
+ Value *X = And->getOperand(0);
+ Value *Y = And->getOperand(1);
+ if (auto *LI = dyn_cast<LoadInst>(X))
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0)))
+ if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
+ if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
+ !LI->isVolatile() && isa<ConstantInt>(Y)) {
+ ConstantInt *C2 = cast<ConstantInt>(Y);
+ if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, Cmp, C2))
+ return Res;
+ }
+
+ if (!Cmp.isEquality())
+ return nullptr;
+
+ // X & -C == -C -> X > u ~C
+ // X & -C != -C -> X <= u ~C
+ // iff C is a power of 2
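+ // E.g. on i8 with C = 4: (X & 0xFC) == 0xFC --> X >u 0xFB.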
+ if (Cmp.getOperand(1) == Y && (-(*C)).isPowerOf2()) {
+ auto NewPred = Cmp.getPredicate() == CmpInst::ICMP_EQ ? CmpInst::ICMP_UGT
+ : CmpInst::ICMP_ULE;
+ return new ICmpInst(NewPred, X, SubOne(cast<Constant>(Cmp.getOperand(1))));
+ }
+
+ // (X & C2) == 0 -> (trunc X) >= 0
+ // (X & C2) != 0 -> (trunc X) < 0
+ // iff C2 is a power of 2 and it masks the sign bit of a legal integer type.
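+ // E.g. (X & 128) == 0 --> (icmp sge (trunc X to i8), 0) when i8 is legal.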
+ const APInt *C2;
+ if (And->hasOneUse() && *C == 0 && match(Y, m_APInt(C2))) {
+ int32_t ExactLogBase2 = C2->exactLogBase2();
+ if (ExactLogBase2 != -1 && DL.isLegalInteger(ExactLogBase2 + 1)) {
+ Type *NTy = IntegerType::get(Cmp.getContext(), ExactLogBase2 + 1);
+ if (And->getType()->isVectorTy())
+ NTy = VectorType::get(NTy, And->getType()->getVectorNumElements());
+ Value *Trunc = Builder->CreateTrunc(X, NTy);
+ auto NewPred = Cmp.getPredicate() == CmpInst::ICMP_EQ ? CmpInst::ICMP_SGE
+ : CmpInst::ICMP_SLT;
+ return new ICmpInst(NewPred, Trunc, Constant::getNullValue(NTy));
+ }
+ }
+
+ return nullptr;
+}
+
+/// Fold icmp (or X, Y), C.
+Instruction *InstCombiner::foldICmpOrConstant(ICmpInst &Cmp, BinaryOperator *Or,
+ const APInt *C) {
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ if (*C == 1) {
+ // icmp slt signum(V) 1 --> icmp slt V, 1
+ Value *V = nullptr;
+ if (Pred == ICmpInst::ICMP_SLT && match(Or, m_Signum(m_Value(V))))
+ return new ICmpInst(ICmpInst::ICMP_SLT, V,
+ ConstantInt::get(V->getType(), 1));
+ }
+
+ if (!Cmp.isEquality() || *C != 0 || !Or->hasOneUse())
+ return nullptr;
+
+ Value *P, *Q;
+ if (match(Or, m_Or(m_PtrToInt(m_Value(P)), m_PtrToInt(m_Value(Q))))) {
+ // Simplify icmp eq (or (ptrtoint P), (ptrtoint Q)), 0
+ // -> and (icmp eq P, null), (icmp eq Q, null).
+ Value *CmpP =
+ Builder->CreateICmp(Pred, P, ConstantInt::getNullValue(P->getType()));
+ Value *CmpQ =
+ Builder->CreateICmp(Pred, Q, ConstantInt::getNullValue(Q->getType()));
+ auto LogicOpc = Pred == ICmpInst::Predicate::ICMP_EQ ? Instruction::And
+ : Instruction::Or;
+ return BinaryOperator::Create(LogicOpc, CmpP, CmpQ);
+ }
+
+ return nullptr;
+}
+
+/// Fold icmp (mul X, Y), C.
+Instruction *InstCombiner::foldICmpMulConstant(ICmpInst &Cmp,
+ BinaryOperator *Mul,
+ const APInt *C) {
+ const APInt *MulC;
+ if (!match(Mul->getOperand(1), m_APInt(MulC)))
+ return nullptr;
+
+ // If this is a test of the sign bit and the multiply is sign-preserving with
+ // a constant operand, use the multiply LHS operand instead.
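+ // E.g. (icmp slt (mul nsw X, -3), 0) --> (icmp sgt X, 0).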
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ if (isSignTest(Pred, *C) && Mul->hasNoSignedWrap()) {
+ if (MulC->isNegative())
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ return new ICmpInst(Pred, Mul->getOperand(0),
+ Constant::getNullValue(Mul->getType()));
+ }
+
+ return nullptr;
+}
+
+/// Fold icmp (shl 1, Y), C.
+static Instruction *foldICmpShlOne(ICmpInst &Cmp, Instruction *Shl,
+ const APInt *C) {
+ Value *Y;
+ if (!match(Shl, m_Shl(m_One(), m_Value(Y))))
+ return nullptr;
+
+ Type *ShiftType = Shl->getType();
+ uint32_t TypeBits = C->getBitWidth();
+ bool CIsPowerOf2 = C->isPowerOf2();
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ if (Cmp.isUnsigned()) {
+ // (1 << Y) pred C -> Y pred Log2(C)
+ if (!CIsPowerOf2) {
+ // (1 << Y) < 30 -> Y <= 4
+ // (1 << Y) <= 30 -> Y <= 4
+ // (1 << Y) >= 30 -> Y > 4
+ // (1 << Y) > 30 -> Y > 4
+ if (Pred == ICmpInst::ICMP_ULT)
+ Pred = ICmpInst::ICMP_ULE;
+ else if (Pred == ICmpInst::ICMP_UGE)
+ Pred = ICmpInst::ICMP_UGT;
+ }
+
+ // (1 << Y) >= 2147483648 -> Y >= 31 -> Y == 31
+ // (1 << Y) < 2147483648 -> Y < 31 -> Y != 31
+ unsigned CLog2 = C->logBase2();
+ if (CLog2 == TypeBits - 1) {
+ if (Pred == ICmpInst::ICMP_UGE)
+ Pred = ICmpInst::ICMP_EQ;
+ else if (Pred == ICmpInst::ICMP_ULT)
+ Pred = ICmpInst::ICMP_NE;
+ }
+ return new ICmpInst(Pred, Y, ConstantInt::get(ShiftType, CLog2));
+ } else if (Cmp.isSigned()) {
+ Constant *BitWidthMinusOne = ConstantInt::get(ShiftType, TypeBits - 1);
+ if (C->isAllOnesValue()) {
+ // (1 << Y) <= -1 -> Y == 31
+ if (Pred == ICmpInst::ICMP_SLE)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Y, BitWidthMinusOne);
+
+ // (1 << Y) > -1 -> Y != 31
+ if (Pred == ICmpInst::ICMP_SGT)
+ return new ICmpInst(ICmpInst::ICMP_NE, Y, BitWidthMinusOne);
+ } else if (!(*C)) {
+ // (1 << Y) < 0 -> Y == 31
+ // (1 << Y) <= 0 -> Y == 31
+ if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Y, BitWidthMinusOne);
+
+ // (1 << Y) >= 0 -> Y != 31
+ // (1 << Y) > 0 -> Y != 31
+ if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE)
+ return new ICmpInst(ICmpInst::ICMP_NE, Y, BitWidthMinusOne);
+ }
+ } else if (Cmp.isEquality() && CIsPowerOf2) {
+ return new ICmpInst(Pred, Y, ConstantInt::get(ShiftType, C->logBase2()));
+ }
+
+ return nullptr;
+}
+
+/// Fold icmp (shl X, Y), C.
+Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp,
+ BinaryOperator *Shl,
+ const APInt *C) {
+ const APInt *ShiftVal;
+ if (Cmp.isEquality() && match(Shl->getOperand(0), m_APInt(ShiftVal)))
+ return foldICmpShlConstConst(Cmp, Shl->getOperand(1), *C, *ShiftVal);
+
+ const APInt *ShiftAmt;
+ if (!match(Shl->getOperand(1), m_APInt(ShiftAmt)))
+ return foldICmpShlOne(Cmp, Shl, C);
+
+ // Check that the shift amount is in range. If not, don't perform undefined
+ // shifts. When the shift is visited, it will be simplified.
+ unsigned TypeBits = C->getBitWidth();
+ if (ShiftAmt->uge(TypeBits))
+ return nullptr;
+
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ Value *X = Shl->getOperand(0);
+ if (Cmp.isEquality()) {
+ // If the shift is NUW, then it is just shifting out zeros, no need for an
+ // AND.
+ Constant *LShrC = ConstantInt::get(Shl->getType(), C->lshr(*ShiftAmt));
+ if (Shl->hasNoUnsignedWrap())
+ return new ICmpInst(Pred, X, LShrC);
+
+ // If the shift is NSW and we compare to 0, then it is just shifting out
+ // sign bits, no need for an AND either.
+ if (Shl->hasNoSignedWrap() && *C == 0)
+ return new ICmpInst(Pred, X, LShrC);
+
+ if (Shl->hasOneUse()) {
+ // Otherwise, strength reduce the shift into an and.
+ Constant *Mask = ConstantInt::get(Shl->getType(),
+ APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt->getZExtValue()));
+
+ Value *And = Builder->CreateAnd(X, Mask, Shl->getName() + ".mask");
+ return new ICmpInst(Pred, And, LShrC);
+ }
+ }
+
+ // If this is a signed comparison to 0 and the shift is sign preserving,
+ // use the shift LHS operand instead; isSignTest may change 'Pred', so only
+ // do that if we're sure to not continue on in this function.
+ if (Shl->hasNoSignedWrap() && isSignTest(Pred, *C))
+ return new ICmpInst(Pred, X, Constant::getNullValue(X->getType()));
+
+ // Otherwise, if this is a comparison of the sign bit, simplify to and/test.
+ bool TrueIfSigned = false;
+ if (Shl->hasOneUse() && isSignBitCheck(Pred, *C, TrueIfSigned)) {
+ // (X << 31) <s 0 --> (X & 1) != 0
+ Constant *Mask = ConstantInt::get(
+ X->getType(),
+ APInt::getOneBitSet(TypeBits, TypeBits - ShiftAmt->getZExtValue() - 1));
+ Value *And = Builder->CreateAnd(X, Mask, Shl->getName() + ".mask");
+ return new ICmpInst(TrueIfSigned ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ,
+ And, Constant::getNullValue(And->getType()));
+ }
+
+ // When the shift is nuw and pred is >u or <=u, comparison only really happens
+ // in the pre-shifted bits. Since InstSimplify canonicalizes <=u into <u, the
+ // <=u case can be further converted to match <u (see below).
+ if (Shl->hasNoUnsignedWrap() &&
+ (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULT)) {
+ // Derivation for the ult case:
+ // (X << S) <=u C is equiv to X <=u (C >> S) for all C
+ // (X << S) <u (C + 1) is equiv to X <u (C >> S) + 1 if C <u ~0u
+ // (X << S) <u C is equiv to X <u ((C - 1) >> S) + 1 if C >u 0
+ assert((Pred != ICmpInst::ICMP_ULT || C->ugt(0)) &&
+ "Encountered `ult 0` that should have been eliminated by "
+ "InstSimplify.");
+ APInt ShiftedC = Pred == ICmpInst::ICMP_ULT ? (*C - 1).lshr(*ShiftAmt) + 1
+ : C->lshr(*ShiftAmt);
+ return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), ShiftedC));
+ }
+
+ // Transform (icmp pred iM (shl iM %v, N), C)
+ // -> (icmp pred i(M-N) (trunc iM %v to i(M-N)), (trunc (C>>N)))
+ // Transform the shl to a trunc if (trunc (C>>N)) loses no bits and
+ // i(M-N) is a legal integer type.
+ // This enables us to get rid of the shift in favor of a trunc that may be
+ // free on the target. It has the additional benefit of comparing to a
+ // smaller constant that may be more target-friendly.
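+ // E.g. a one-use (icmp ult (shl i32 %v, 16), 0x50000) becomes
+ // (icmp ult (trunc %v to i16), 5) when i16 is a legal integer type.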
+ unsigned Amt = ShiftAmt->getLimitedValue(TypeBits - 1);
+ if (Shl->hasOneUse() && Amt != 0 && C->countTrailingZeros() >= Amt &&
+ DL.isLegalInteger(TypeBits - Amt)) {
+ Type *TruncTy = IntegerType::get(Cmp.getContext(), TypeBits - Amt);
+ if (X->getType()->isVectorTy())
+ TruncTy = VectorType::get(TruncTy, X->getType()->getVectorNumElements());
+ Constant *NewC =
+ ConstantInt::get(TruncTy, C->ashr(*ShiftAmt).trunc(TypeBits - Amt));
+ return new ICmpInst(Pred, Builder->CreateTrunc(X, TruncTy), NewC);
+ }
+
+ return nullptr;
+}
+
+/// Fold icmp ({al}shr X, Y), C.
+Instruction *InstCombiner::foldICmpShrConstant(ICmpInst &Cmp,
+ BinaryOperator *Shr,
+ const APInt *C) {
+ // An exact shr only shifts out zero bits, so:
+ // icmp eq/ne (shr X, Y), 0 --> icmp eq/ne X, 0
+ Value *X = Shr->getOperand(0);
+ CmpInst::Predicate Pred = Cmp.getPredicate();
+ if (Cmp.isEquality() && Shr->isExact() && Shr->hasOneUse() && *C == 0)
+ return new ICmpInst(Pred, X, Cmp.getOperand(1));
+
+ const APInt *ShiftVal;
+ if (Cmp.isEquality() && match(Shr->getOperand(0), m_APInt(ShiftVal)))
+ return foldICmpShrConstConst(Cmp, Shr->getOperand(1), *C, *ShiftVal);
+
+ const APInt *ShiftAmt;
+ if (!match(Shr->getOperand(1), m_APInt(ShiftAmt)))
+ return nullptr;
+
+ // Check that the shift amount is in range. If not, don't perform undefined
+ // shifts. When the shift is visited it will be simplified.
+ unsigned TypeBits = C->getBitWidth();
+ unsigned ShAmtVal = ShiftAmt->getLimitedValue(TypeBits);
+ if (ShAmtVal >= TypeBits || ShAmtVal == 0)
+ return nullptr;
+
+ bool IsAShr = Shr->getOpcode() == Instruction::AShr;
+ if (!Cmp.isEquality()) {
+ // If we have an unsigned comparison and an ashr, we can't simplify this.
+ // Similarly for signed comparisons with lshr.
+ if (Cmp.isSigned() != IsAShr)
+ return nullptr;
+
+ // Otherwise, all lshr and most exact ashr's are equivalent to a udiv/sdiv
+ // by a power of 2. Since we already have logic to simplify these,
+ // transform to div and then simplify the resultant comparison.
+ if (IsAShr && (!Shr->isExact() || ShAmtVal == TypeBits - 1))
+ return nullptr;
+
+ // Revisit the shift (to delete it).
+ Worklist.Add(Shr);
+
+ Constant *DivCst = ConstantInt::get(
+ Shr->getType(), APInt::getOneBitSet(TypeBits, ShAmtVal));
+
+ Value *Tmp = IsAShr ? Builder->CreateSDiv(X, DivCst, "", Shr->isExact())
+ : Builder->CreateUDiv(X, DivCst, "", Shr->isExact());
+
+ Cmp.setOperand(0, Tmp);
+
+ // If the builder folded the binop, just return it.
+ BinaryOperator *TheDiv = dyn_cast<BinaryOperator>(Tmp);
+ if (!TheDiv)
+ return &Cmp;
+
+ // Otherwise, fold this div/compare.
+ assert(TheDiv->getOpcode() == Instruction::SDiv ||
+ TheDiv->getOpcode() == Instruction::UDiv);
+
+ Instruction *Res = foldICmpDivConstant(Cmp, TheDiv, C);
+ assert(Res && "This div/cst should have folded!");
+ return Res;
+ }
+
+ // Handle equality comparisons of shift-by-constant.
+
+ // If the comparison constant changes with the shift, the comparison cannot
+ // succeed (bits of the comparison constant cannot match the shifted value).
+ // This should be known by InstSimplify and already be folded to true/false.
+ assert(((IsAShr && C->shl(ShAmtVal).ashr(ShAmtVal) == *C) ||
+ (!IsAShr && C->shl(ShAmtVal).lshr(ShAmtVal) == *C)) &&
+ "Expected icmp+shr simplify did not occur.");
+
+ // Check if the bits shifted out are known to be zero. If so, we can compare
+ // against the unshifted value:
+ // (X & 4) >> 1 == 2 --> (X & 4) == 4.
+ Constant *ShiftedCmpRHS = ConstantInt::get(Shr->getType(), *C << ShAmtVal);
+ if (Shr->hasOneUse()) {
+ if (Shr->isExact())
+ return new ICmpInst(Pred, X, ShiftedCmpRHS);
+
+ // Otherwise strength reduce the shift into an 'and'.
+ APInt Val(APInt::getHighBitsSet(TypeBits, TypeBits - ShAmtVal));
+ Constant *Mask = ConstantInt::get(Shr->getType(), Val);
+ Value *And = Builder->CreateAnd(X, Mask, Shr->getName() + ".mask");
+ return new ICmpInst(Pred, And, ShiftedCmpRHS);
+ }
+
+ return nullptr;
+}
+
+/// Fold icmp (udiv X, Y), C.
+Instruction *InstCombiner::foldICmpUDivConstant(ICmpInst &Cmp,
+ BinaryOperator *UDiv,
+ const APInt *C) {
+ const APInt *C2;
+ if (!match(UDiv->getOperand(0), m_APInt(C2)))
+ return nullptr;
+
+ assert(*C2 != 0 && "udiv 0, X should have been simplified already.");
+
+ // (icmp ugt (udiv C2, Y), C) -> (icmp ule Y, C2/(C+1))
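+ // E.g. (icmp ugt (udiv 64, Y), 3) --> (icmp ule Y, 16).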
+ Value *Y = UDiv->getOperand(1);
+ if (Cmp.getPredicate() == ICmpInst::ICMP_UGT) {
+ assert(!C->isMaxValue() &&
+ "icmp ugt X, UINT_MAX should have been simplified already.");
+ return new ICmpInst(ICmpInst::ICMP_ULE, Y,
+ ConstantInt::get(Y->getType(), C2->udiv(*C + 1)));
+ }
+
+ // (icmp ult (udiv C2, Y), C) -> (icmp ugt Y, C2/C)
+ if (Cmp.getPredicate() == ICmpInst::ICMP_ULT) {
+ assert(*C != 0 && "icmp ult X, 0 should have been simplified already.");
+ return new ICmpInst(ICmpInst::ICMP_UGT, Y,
+ ConstantInt::get(Y->getType(), C2->udiv(*C)));
+ }
+
+ return nullptr;
+}
+
+/// Fold icmp ({su}div X, Y), C.
+Instruction *InstCombiner::foldICmpDivConstant(ICmpInst &Cmp,
+ BinaryOperator *Div,
+ const APInt *C) {
+ // Fold: icmp pred ([us]div X, C2), C -> range test
+ // Fold this div into the comparison, producing a range check.
+ // Determine, based on the divide type, what the range is being
+ // checked. If there is an overflow on the low or high side, remember
+ // it, otherwise compute the range [low, hi) bounding the new value.
+ // See: InsertRangeTest above for the kinds of replacements possible.
+ const APInt *C2;
+ if (!match(Div->getOperand(1), m_APInt(C2)))
+ return nullptr;
// FIXME: If the operand types don't match the type of the divide
// then don't attempt this transform. The code below doesn't have the
// logic to deal with a signed divide and an unsigned compare (and
- // vice versa). This is because (x /s C1) <s C2 produces different
- // results than (x /s C1) <u C2 or (x /u C1) <s C2 or even
- // (x /u C1) <u C2. Simply casting the operands and result won't
+ // vice versa). This is because (x /s C2) <s C produces different
+ // results than (x /s C2) <u C or (x /u C2) <s C or even
+ // (x /u C2) <u C. Simply casting the operands and result won't
// work. :( The if statement below tests that condition and bails
// if it finds it.
- bool DivIsSigned = DivI->getOpcode() == Instruction::SDiv;
- if (!ICI.isEquality() && DivIsSigned != ICI.isSigned())
+ bool DivIsSigned = Div->getOpcode() == Instruction::SDiv;
+ if (!Cmp.isEquality() && DivIsSigned != Cmp.isSigned())
return nullptr;
- if (DivRHS->isZero())
- return nullptr; // The ProdOV computation fails on divide by zero.
- if (DivIsSigned && DivRHS->isAllOnesValue())
- return nullptr; // The overflow computation also screws up here
- if (DivRHS->isOne()) {
- // This eliminates some funny cases with INT_MIN.
- ICI.setOperand(0, DivI->getOperand(0)); // X/1 == X.
- return &ICI;
- }
-
- // Compute Prod = CI * DivRHS. We are essentially solving an equation
- // of form X/C1=C2. We solve for X by multiplying C1 (DivRHS) and
- // C2 (CI). By solving for X we can turn this into a range check
- // instead of computing a divide.
+
+ // The ProdOV computation fails on divide by 0 and divide by -1. Cases with
+ // INT_MIN will also fail if the divisor is 1. Although folds of all these
+ // division-by-constant cases should be present, we cannot assert that they
+ // have happened before we reach this icmp instruction.
+ if (*C2 == 0 || *C2 == 1 || (DivIsSigned && C2->isAllOnesValue()))
+ return nullptr;
+
+ // TODO: We could do all of the computations below using APInt.
+ Constant *CmpRHS = cast<Constant>(Cmp.getOperand(1));
+ Constant *DivRHS = cast<Constant>(Div->getOperand(1));
+
+ // Compute Prod = CmpRHS * DivRHS. We are essentially solving an equation of
+ // form X / C2 = C. We solve for X by multiplying C2 (DivRHS) and C (CmpRHS).
+ // By solving for X, we can turn this into a range check instead of computing
+ // a divide.
Constant *Prod = ConstantExpr::getMul(CmpRHS, DivRHS);
- // Determine if the product overflows by seeing if the product is
- // not equal to the divide. Make sure we do the same kind of divide
- // as in the LHS instruction that we're folding.
- bool ProdOV = (DivIsSigned ? ConstantExpr::getSDiv(Prod, DivRHS) :
- ConstantExpr::getUDiv(Prod, DivRHS)) != CmpRHS;
+ // Determine if the product overflows: divide the product back out and check
+ // that the result does not equal CmpRHS. Make sure we do the same kind of
+ // divide as in the LHS instruction that we're folding.
+ bool ProdOV = (DivIsSigned ? ConstantExpr::getSDiv(Prod, DivRHS)
+ : ConstantExpr::getUDiv(Prod, DivRHS)) != CmpRHS;
- // Get the ICmp opcode
- ICmpInst::Predicate Pred = ICI.getPredicate();
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
// If the division is known to be exact, then there is no remainder from the
// divide, so the covered range size is unit, otherwise it is the divisor.
- ConstantInt *RangeSize = DivI->isExact() ? getOne(Prod) : DivRHS;
+ Constant *RangeSize =
+ Div->isExact() ? ConstantInt::get(Div->getType(), 1) : DivRHS;
// Figure out the interval that is being checked. For example, a comparison
// like "X /u 5 == 0" is really checking that X is in the interval [0, 5).
@@ -1245,1134 +2184,1094 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,
if (!HiOverflow) {
// If this is not an exact divide, then many values in the range collapse
// to the same result value.
- HiOverflow = AddWithOverflow(HiBound, LoBound, RangeSize, false);
+ HiOverflow = addWithOverflow(HiBound, LoBound, RangeSize, false);
}
- } else if (DivRHS->getValue().isStrictlyPositive()) { // Divisor is > 0.
- if (CmpRHSV == 0) { // (X / pos) op 0
+ } else if (C2->isStrictlyPositive()) { // Divisor is > 0.
+ if (*C == 0) { // (X / pos) op 0
// Can't overflow. e.g. X/2 op 0 --> [-1, 2)
LoBound = ConstantExpr::getNeg(SubOne(RangeSize));
HiBound = RangeSize;
- } else if (CmpRHSV.isStrictlyPositive()) { // (X / pos) op pos
+ } else if (C->isStrictlyPositive()) { // (X / pos) op pos
LoBound = Prod; // e.g. X/5 op 3 --> [15, 20)
HiOverflow = LoOverflow = ProdOV;
if (!HiOverflow)
- HiOverflow = AddWithOverflow(HiBound, Prod, RangeSize, true);
+ HiOverflow = addWithOverflow(HiBound, Prod, RangeSize, true);
} else { // (X / pos) op neg
// e.g. X/5 op -3 --> [-15-4, -15+1) --> [-19, -14)
HiBound = AddOne(Prod);
LoOverflow = HiOverflow = ProdOV ? -1 : 0;
if (!LoOverflow) {
- ConstantInt *DivNeg =cast<ConstantInt>(ConstantExpr::getNeg(RangeSize));
- LoOverflow = AddWithOverflow(LoBound, HiBound, DivNeg, true) ? -1 : 0;
+ Constant *DivNeg = ConstantExpr::getNeg(RangeSize);
+ LoOverflow = addWithOverflow(LoBound, HiBound, DivNeg, true) ? -1 : 0;
}
}
- } else if (DivRHS->isNegative()) { // Divisor is < 0.
- if (DivI->isExact())
- RangeSize = cast<ConstantInt>(ConstantExpr::getNeg(RangeSize));
- if (CmpRHSV == 0) { // (X / neg) op 0
+ } else if (C2->isNegative()) { // Divisor is < 0.
+ if (Div->isExact())
+ RangeSize = ConstantExpr::getNeg(RangeSize);
+ if (*C == 0) { // (X / neg) op 0
// e.g. X/-5 op 0 --> [-4, 5)
LoBound = AddOne(RangeSize);
- HiBound = cast<ConstantInt>(ConstantExpr::getNeg(RangeSize));
+ HiBound = ConstantExpr::getNeg(RangeSize);
if (HiBound == DivRHS) { // -INTMIN = INTMIN
HiOverflow = 1; // [INTMIN+1, overflow)
HiBound = nullptr; // e.g. X/INTMIN = 0 --> X > INTMIN
}
- } else if (CmpRHSV.isStrictlyPositive()) { // (X / neg) op pos
+ } else if (C->isStrictlyPositive()) { // (X / neg) op pos
// e.g. X/-5 op 3 --> [-19, -14)
HiBound = AddOne(Prod);
HiOverflow = LoOverflow = ProdOV ? -1 : 0;
if (!LoOverflow)
- LoOverflow = AddWithOverflow(LoBound, HiBound, RangeSize, true) ? -1:0;
+ LoOverflow = addWithOverflow(LoBound, HiBound, RangeSize, true) ? -1:0;
} else { // (X / neg) op neg
LoBound = Prod; // e.g. X/-5 op -3 --> [15, 20)
LoOverflow = HiOverflow = ProdOV;
if (!HiOverflow)
- HiOverflow = SubWithOverflow(HiBound, Prod, RangeSize, true);
+ HiOverflow = subWithOverflow(HiBound, Prod, RangeSize, true);
}
// Dividing by a negative swaps the condition. LT <-> GT
Pred = ICmpInst::getSwappedPredicate(Pred);
}
- Value *X = DivI->getOperand(0);
+ Value *X = Div->getOperand(0);
switch (Pred) {
- default: llvm_unreachable("Unhandled icmp opcode!");
- case ICmpInst::ICMP_EQ:
- if (LoOverflow && HiOverflow)
- return replaceInstUsesWith(ICI, Builder->getFalse());
- if (HiOverflow)
- return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE :
- ICmpInst::ICMP_UGE, X, LoBound);
- if (LoOverflow)
- return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT :
- ICmpInst::ICMP_ULT, X, HiBound);
- return replaceInstUsesWith(ICI, InsertRangeTest(X, LoBound, HiBound,
- DivIsSigned, true));
- case ICmpInst::ICMP_NE:
- if (LoOverflow && HiOverflow)
- return replaceInstUsesWith(ICI, Builder->getTrue());
- if (HiOverflow)
- return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT :
- ICmpInst::ICMP_ULT, X, LoBound);
- if (LoOverflow)
- return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE :
- ICmpInst::ICMP_UGE, X, HiBound);
- return replaceInstUsesWith(ICI, InsertRangeTest(X, LoBound, HiBound,
- DivIsSigned, false));
- case ICmpInst::ICMP_ULT:
- case ICmpInst::ICMP_SLT:
- if (LoOverflow == +1) // Low bound is greater than input range.
- return replaceInstUsesWith(ICI, Builder->getTrue());
- if (LoOverflow == -1) // Low bound is less than input range.
- return replaceInstUsesWith(ICI, Builder->getFalse());
- return new ICmpInst(Pred, X, LoBound);
- case ICmpInst::ICMP_UGT:
- case ICmpInst::ICMP_SGT:
- if (HiOverflow == +1) // High bound greater than input range.
- return replaceInstUsesWith(ICI, Builder->getFalse());
- if (HiOverflow == -1) // High bound less than input range.
- return replaceInstUsesWith(ICI, Builder->getTrue());
- if (Pred == ICmpInst::ICMP_UGT)
- return new ICmpInst(ICmpInst::ICMP_UGE, X, HiBound);
- return new ICmpInst(ICmpInst::ICMP_SGE, X, HiBound);
+ default: llvm_unreachable("Unhandled icmp opcode!");
+ case ICmpInst::ICMP_EQ:
+ if (LoOverflow && HiOverflow)
+ return replaceInstUsesWith(Cmp, Builder->getFalse());
+ if (HiOverflow)
+ return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE :
+ ICmpInst::ICMP_UGE, X, LoBound);
+ if (LoOverflow)
+ return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT :
+ ICmpInst::ICMP_ULT, X, HiBound);
+ return replaceInstUsesWith(
+ Cmp, insertRangeTest(X, LoBound->getUniqueInteger(),
+ HiBound->getUniqueInteger(), DivIsSigned, true));
+ case ICmpInst::ICMP_NE:
+ if (LoOverflow && HiOverflow)
+ return replaceInstUsesWith(Cmp, Builder->getTrue());
+ if (HiOverflow)
+ return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT :
+ ICmpInst::ICMP_ULT, X, LoBound);
+ if (LoOverflow)
+ return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE :
+ ICmpInst::ICMP_UGE, X, HiBound);
+ return replaceInstUsesWith(Cmp,
+ insertRangeTest(X, LoBound->getUniqueInteger(),
+ HiBound->getUniqueInteger(),
+ DivIsSigned, false));
+ case ICmpInst::ICMP_ULT:
+ case ICmpInst::ICMP_SLT:
+ if (LoOverflow == +1) // Low bound is greater than input range.
+ return replaceInstUsesWith(Cmp, Builder->getTrue());
+ if (LoOverflow == -1) // Low bound is less than input range.
+ return replaceInstUsesWith(Cmp, Builder->getFalse());
+ return new ICmpInst(Pred, X, LoBound);
+ case ICmpInst::ICMP_UGT:
+ case ICmpInst::ICMP_SGT:
+ if (HiOverflow == +1) // High bound greater than input range.
+ return replaceInstUsesWith(Cmp, Builder->getFalse());
+ if (HiOverflow == -1) // High bound less than input range.
+ return replaceInstUsesWith(Cmp, Builder->getTrue());
+ if (Pred == ICmpInst::ICMP_UGT)
+ return new ICmpInst(ICmpInst::ICMP_UGE, X, HiBound);
+ return new ICmpInst(ICmpInst::ICMP_SGE, X, HiBound);
}
+
+ return nullptr;
}
-/// Handle "icmp(([al]shr X, cst1), cst2)".
-Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr,
- ConstantInt *ShAmt) {
- const APInt &CmpRHSV = cast<ConstantInt>(ICI.getOperand(1))->getValue();
+/// Fold icmp (sub X, Y), C.
+Instruction *InstCombiner::foldICmpSubConstant(ICmpInst &Cmp,
+ BinaryOperator *Sub,
+ const APInt *C) {
+ Value *X = Sub->getOperand(0), *Y = Sub->getOperand(1);
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
- // Check that the shift amount is in range. If not, don't perform
- // undefined shifts. When the shift is visited it will be
- // simplified.
- uint32_t TypeBits = CmpRHSV.getBitWidth();
- uint32_t ShAmtVal = (uint32_t)ShAmt->getLimitedValue(TypeBits);
- if (ShAmtVal >= TypeBits || ShAmtVal == 0)
+ // The following transforms are only worth it if the only user of the subtract
+ // is the icmp.
+ if (!Sub->hasOneUse())
return nullptr;
- if (!ICI.isEquality()) {
- // If we have an unsigned comparison and an ashr, we can't simplify this.
- // Similarly for signed comparisons with lshr.
- if (ICI.isSigned() != (Shr->getOpcode() == Instruction::AShr))
- return nullptr;
-
- // Otherwise, all lshr and most exact ashr's are equivalent to a udiv/sdiv
- // by a power of 2. Since we already have logic to simplify these,
- // transform to div and then simplify the resultant comparison.
- if (Shr->getOpcode() == Instruction::AShr &&
- (!Shr->isExact() || ShAmtVal == TypeBits - 1))
- return nullptr;
-
- // Revisit the shift (to delete it).
- Worklist.Add(Shr);
-
- Constant *DivCst =
- ConstantInt::get(Shr->getType(), APInt::getOneBitSet(TypeBits, ShAmtVal));
+ if (Sub->hasNoSignedWrap()) {
+ // (icmp sgt (sub nsw X, Y), -1) -> (icmp sge X, Y)
+ if (Pred == ICmpInst::ICMP_SGT && C->isAllOnesValue())
+ return new ICmpInst(ICmpInst::ICMP_SGE, X, Y);
- Value *Tmp =
- Shr->getOpcode() == Instruction::AShr ?
- Builder->CreateSDiv(Shr->getOperand(0), DivCst, "", Shr->isExact()) :
- Builder->CreateUDiv(Shr->getOperand(0), DivCst, "", Shr->isExact());
+ // (icmp sgt (sub nsw X, Y), 0) -> (icmp sgt X, Y)
+ if (Pred == ICmpInst::ICMP_SGT && *C == 0)
+ return new ICmpInst(ICmpInst::ICMP_SGT, X, Y);
- ICI.setOperand(0, Tmp);
+ // (icmp slt (sub nsw X, Y), 0) -> (icmp slt X, Y)
+ if (Pred == ICmpInst::ICMP_SLT && *C == 0)
+ return new ICmpInst(ICmpInst::ICMP_SLT, X, Y);
- // If the builder folded the binop, just return it.
- BinaryOperator *TheDiv = dyn_cast<BinaryOperator>(Tmp);
- if (!TheDiv)
- return &ICI;
-
- // Otherwise, fold this div/compare.
- assert(TheDiv->getOpcode() == Instruction::SDiv ||
- TheDiv->getOpcode() == Instruction::UDiv);
-
- Instruction *Res = FoldICmpDivCst(ICI, TheDiv, cast<ConstantInt>(DivCst));
- assert(Res && "This div/cst should have folded!");
- return Res;
+ // (icmp slt (sub nsw X, Y), 1) -> (icmp sle X, Y)
+ if (Pred == ICmpInst::ICMP_SLT && *C == 1)
+ return new ICmpInst(ICmpInst::ICMP_SLE, X, Y);
}
- // If we are comparing against bits always shifted out, the
- // comparison cannot succeed.
- APInt Comp = CmpRHSV << ShAmtVal;
- ConstantInt *ShiftedCmpRHS = Builder->getInt(Comp);
- if (Shr->getOpcode() == Instruction::LShr)
- Comp = Comp.lshr(ShAmtVal);
- else
- Comp = Comp.ashr(ShAmtVal);
+ const APInt *C2;
+ if (!match(X, m_APInt(C2)))
+ return nullptr;
- if (Comp != CmpRHSV) { // Comparing against a bit that we know is zero.
- bool IsICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE;
- Constant *Cst = Builder->getInt1(IsICMP_NE);
- return replaceInstUsesWith(ICI, Cst);
- }
+ // C2 - Y <u C -> (Y | (C - 1)) == C2
+ // iff (C2 & (C - 1)) == C - 1 and C is a power of 2
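+ // E.g. (icmp ult (sub 15, Y), 4) --> (icmp eq (or Y, 3), 15).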
+ if (Pred == ICmpInst::ICMP_ULT && C->isPowerOf2() &&
+ (*C2 & (*C - 1)) == (*C - 1))
+ return new ICmpInst(ICmpInst::ICMP_EQ, Builder->CreateOr(Y, *C - 1), X);
- // Otherwise, check to see if the bits shifted out are known to be zero.
- // If so, we can compare against the unshifted value:
- // (X & 4) >> 1 == 2 --> (X & 4) == 4.
- if (Shr->hasOneUse() && Shr->isExact())
- return new ICmpInst(ICI.getPredicate(), Shr->getOperand(0), ShiftedCmpRHS);
+ // C2 - Y >u C -> (Y | C) != C2
+ // iff C2 & C == C and C + 1 is a power of 2
+ if (Pred == ICmpInst::ICMP_UGT && (*C + 1).isPowerOf2() && (*C2 & *C) == *C)
+ return new ICmpInst(ICmpInst::ICMP_NE, Builder->CreateOr(Y, *C), X);
- if (Shr->hasOneUse()) {
- // Otherwise strength reduce the shift into an and.
- APInt Val(APInt::getHighBitsSet(TypeBits, TypeBits - ShAmtVal));
- Constant *Mask = Builder->getInt(Val);
-
- Value *And = Builder->CreateAnd(Shr->getOperand(0),
- Mask, Shr->getName()+".mask");
- return new ICmpInst(ICI.getPredicate(), And, ShiftedCmpRHS);
- }
return nullptr;
}
-/// Handle "(icmp eq/ne (ashr/lshr const2, A), const1)" ->
-/// (icmp eq/ne A, Log2(const2/const1)) ->
-/// (icmp eq/ne A, Log2(const2) - Log2(const1)).
-Instruction *InstCombiner::FoldICmpCstShrCst(ICmpInst &I, Value *Op, Value *A,
- ConstantInt *CI1,
- ConstantInt *CI2) {
- assert(I.isEquality() && "Cannot fold icmp gt/lt");
-
- auto getConstant = [&I, this](bool IsTrue) {
- if (I.getPredicate() == I.ICMP_NE)
- IsTrue = !IsTrue;
- return replaceInstUsesWith(I, ConstantInt::get(I.getType(), IsTrue));
- };
-
- auto getICmp = [&I](CmpInst::Predicate Pred, Value *LHS, Value *RHS) {
- if (I.getPredicate() == I.ICMP_NE)
- Pred = CmpInst::getInversePredicate(Pred);
- return new ICmpInst(Pred, LHS, RHS);
- };
+/// Fold icmp (add X, Y), C.
+Instruction *InstCombiner::foldICmpAddConstant(ICmpInst &Cmp,
+ BinaryOperator *Add,
+ const APInt *C) {
+ Value *Y = Add->getOperand(1);
+ const APInt *C2;
+ if (Cmp.isEquality() || !match(Y, m_APInt(C2)))
+ return nullptr;
- const APInt &AP1 = CI1->getValue();
- const APInt &AP2 = CI2->getValue();
+ // Fold icmp pred (add X, C2), C.
+ Value *X = Add->getOperand(0);
+ Type *Ty = Add->getType();
+ auto CR =
+ ConstantRange::makeExactICmpRegion(Cmp.getPredicate(), *C).subtract(*C2);
+ const APInt &Upper = CR.getUpper();
+ const APInt &Lower = CR.getLower();
+ if (Cmp.isSigned()) {
+ if (Lower.isSignBit())
+ return new ICmpInst(ICmpInst::ICMP_SLT, X, ConstantInt::get(Ty, Upper));
+ if (Upper.isSignBit())
+ return new ICmpInst(ICmpInst::ICMP_SGE, X, ConstantInt::get(Ty, Lower));
+ } else {
+ if (Lower.isMinValue())
+ return new ICmpInst(ICmpInst::ICMP_ULT, X, ConstantInt::get(Ty, Upper));
+ if (Upper.isMinValue())
+ return new ICmpInst(ICmpInst::ICMP_UGE, X, ConstantInt::get(Ty, Lower));
+ }
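+ // E.g. on i8, (icmp ult (add X, 100), 100) yields the range [-100, 0) for X;
+ // its upper bound is the unsigned minimum, so this becomes (icmp uge X, -100).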
- // Don't bother doing any work for cases which InstSimplify handles.
- if (AP2 == 0)
+ if (!Add->hasOneUse())
return nullptr;
- bool IsAShr = isa<AShrOperator>(Op);
- if (IsAShr) {
- if (AP2.isAllOnesValue())
- return nullptr;
- if (AP2.isNegative() != AP1.isNegative())
- return nullptr;
- if (AP2.sgt(AP1))
- return nullptr;
- }
- if (!AP1)
- // 'A' must be large enough to shift out the highest set bit.
- return getICmp(I.ICMP_UGT, A,
- ConstantInt::get(A->getType(), AP2.logBase2()));
+ // (X + C2) <u C -> (X & -C) == -C2
+ //   iff C2 & (C - 1) == 0
+ //       C is a power of 2
+ if (Cmp.getPredicate() == ICmpInst::ICMP_ULT && C->isPowerOf2() &&
+ (*C2 & (*C - 1)) == 0)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Builder->CreateAnd(X, -(*C)),
+ ConstantExpr::getNeg(cast<Constant>(Y)));
+
+ // (X + C2) >u C -> (X & ~C) != -C2
+ //   iff C2 & C == 0
+ //       C + 1 is a power of 2
+ if (Cmp.getPredicate() == ICmpInst::ICMP_UGT && (*C + 1).isPowerOf2() &&
+ (*C2 & *C) == 0)
+ return new ICmpInst(ICmpInst::ICMP_NE, Builder->CreateAnd(X, ~(*C)),
+ ConstantExpr::getNeg(cast<Constant>(Y)));
- if (AP1 == AP2)
- return getICmp(I.ICMP_EQ, A, ConstantInt::getNullValue(A->getType()));
+ return nullptr;
+}
- int Shift;
- if (IsAShr && AP1.isNegative())
- Shift = AP1.countLeadingOnes() - AP2.countLeadingOnes();
- else
- Shift = AP1.countLeadingZeros() - AP2.countLeadingZeros();
+/// Try to fold integer comparisons with a constant operand: icmp Pred X, C
+/// where X is some kind of instruction.
+Instruction *InstCombiner::foldICmpInstWithConstant(ICmpInst &Cmp) {
+ const APInt *C;
+ if (!match(Cmp.getOperand(1), m_APInt(C)))
+ return nullptr;
- if (Shift > 0) {
- if (IsAShr && AP1 == AP2.ashr(Shift)) {
- // There are multiple solutions if we are comparing against -1 and the LHS
- // of the ashr is not a power of two.
- if (AP1.isAllOnesValue() && !AP2.isPowerOf2())
- return getICmp(I.ICMP_UGE, A, ConstantInt::get(A->getType(), Shift));
- return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift));
- } else if (AP1 == AP2.lshr(Shift)) {
- return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift));
+ BinaryOperator *BO;
+ if (match(Cmp.getOperand(0), m_BinOp(BO))) {
+ switch (BO->getOpcode()) {
+ case Instruction::Xor:
+ if (Instruction *I = foldICmpXorConstant(Cmp, BO, C))
+ return I;
+ break;
+ case Instruction::And:
+ if (Instruction *I = foldICmpAndConstant(Cmp, BO, C))
+ return I;
+ break;
+ case Instruction::Or:
+ if (Instruction *I = foldICmpOrConstant(Cmp, BO, C))
+ return I;
+ break;
+ case Instruction::Mul:
+ if (Instruction *I = foldICmpMulConstant(Cmp, BO, C))
+ return I;
+ break;
+ case Instruction::Shl:
+ if (Instruction *I = foldICmpShlConstant(Cmp, BO, C))
+ return I;
+ break;
+ case Instruction::LShr:
+ case Instruction::AShr:
+ if (Instruction *I = foldICmpShrConstant(Cmp, BO, C))
+ return I;
+ break;
+ case Instruction::UDiv:
+ if (Instruction *I = foldICmpUDivConstant(Cmp, BO, C))
+ return I;
+ LLVM_FALLTHROUGH;
+ case Instruction::SDiv:
+ if (Instruction *I = foldICmpDivConstant(Cmp, BO, C))
+ return I;
+ break;
+ case Instruction::Sub:
+ if (Instruction *I = foldICmpSubConstant(Cmp, BO, C))
+ return I;
+ break;
+ case Instruction::Add:
+ if (Instruction *I = foldICmpAddConstant(Cmp, BO, C))
+ return I;
+ break;
+ default:
+ break;
}
+ // TODO: These folds could be refactored to be part of the above calls.
+ if (Instruction *I = foldICmpBinOpEqualityWithConstant(Cmp, BO, C))
+ return I;
}
- // Shifting const2 will never be equal to const1.
- return getConstant(false);
-}
-/// Handle "(icmp eq/ne (shl const2, A), const1)" ->
-/// (icmp eq/ne A, TrailingZeros(const1) - TrailingZeros(const2)).
-Instruction *InstCombiner::FoldICmpCstShlCst(ICmpInst &I, Value *Op, Value *A,
- ConstantInt *CI1,
- ConstantInt *CI2) {
- assert(I.isEquality() && "Cannot fold icmp gt/lt");
+ Instruction *LHSI;
+ if (match(Cmp.getOperand(0), m_Instruction(LHSI)) &&
+ LHSI->getOpcode() == Instruction::Trunc)
+ if (Instruction *I = foldICmpTruncConstant(Cmp, LHSI, C))
+ return I;
- auto getConstant = [&I, this](bool IsTrue) {
- if (I.getPredicate() == I.ICMP_NE)
- IsTrue = !IsTrue;
- return replaceInstUsesWith(I, ConstantInt::get(I.getType(), IsTrue));
- };
+ if (Instruction *I = foldICmpIntrinsicWithConstant(Cmp, C))
+ return I;
- auto getICmp = [&I](CmpInst::Predicate Pred, Value *LHS, Value *RHS) {
- if (I.getPredicate() == I.ICMP_NE)
- Pred = CmpInst::getInversePredicate(Pred);
- return new ICmpInst(Pred, LHS, RHS);
- };
-
- const APInt &AP1 = CI1->getValue();
- const APInt &AP2 = CI2->getValue();
+ return nullptr;
+}
- // Don't bother doing any work for cases which InstSimplify handles.
- if (AP2 == 0)
+/// Fold an icmp equality instruction with binary operator LHS and constant RHS:
+/// icmp eq/ne BO, C.
+Instruction *InstCombiner::foldICmpBinOpEqualityWithConstant(ICmpInst &Cmp,
+ BinaryOperator *BO,
+ const APInt *C) {
+ // TODO: Some of these folds could work with arbitrary constants, but this
+ // function is limited to scalar and vector splat constants.
+ if (!Cmp.isEquality())
return nullptr;
- unsigned AP2TrailingZeros = AP2.countTrailingZeros();
-
- if (!AP1 && AP2TrailingZeros != 0)
- return getICmp(I.ICMP_UGE, A,
- ConstantInt::get(A->getType(), AP2.getBitWidth() - AP2TrailingZeros));
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ bool isICMP_NE = Pred == ICmpInst::ICMP_NE;
+ Constant *RHS = cast<Constant>(Cmp.getOperand(1));
+ Value *BOp0 = BO->getOperand(0), *BOp1 = BO->getOperand(1);
+
+ switch (BO->getOpcode()) {
+ case Instruction::SRem:
+ // If we have a signed (X % (2^c)) == 0, turn it into an unsigned one.
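+ // (X srem 8) == 0 and (X urem 8) == 0 both just test the low three bits of
+ // X, so the unsigned remainder is equivalent for this comparison.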
+ if (*C == 0 && BO->hasOneUse()) {
+ const APInt *BOC;
+ if (match(BOp1, m_APInt(BOC)) && BOC->sgt(1) && BOC->isPowerOf2()) {
+ Value *NewRem = Builder->CreateURem(BOp0, BOp1, BO->getName());
+ return new ICmpInst(Pred, NewRem,
+ Constant::getNullValue(BO->getType()));
+ }
+ }
+ break;
+ case Instruction::Add: {
+ // Replace ((add A, B) != C) with (A != C-B) if B and C are constants.
+ const APInt *BOC;
+ if (match(BOp1, m_APInt(BOC))) {
+ if (BO->hasOneUse()) {
+ Constant *SubC = ConstantExpr::getSub(RHS, cast<Constant>(BOp1));
+ return new ICmpInst(Pred, BOp0, SubC);
+ }
+ } else if (*C == 0) {
+ // Replace ((add A, B) != 0) with (A != -B) if A or B is
+ // efficiently invertible, or if the add has just this one use.
+ if (Value *NegVal = dyn_castNegVal(BOp1))
+ return new ICmpInst(Pred, BOp0, NegVal);
+ if (Value *NegVal = dyn_castNegVal(BOp0))
+ return new ICmpInst(Pred, NegVal, BOp1);
+ if (BO->hasOneUse()) {
+ Value *Neg = Builder->CreateNeg(BOp1);
+ Neg->takeName(BO);
+ return new ICmpInst(Pred, BOp0, Neg);
+ }
+ }
+ break;
+ }
+ case Instruction::Xor:
+ if (BO->hasOneUse()) {
+ if (Constant *BOC = dyn_cast<Constant>(BOp1)) {
+ // For the xor case, we can xor two constants together, eliminating
+ // the explicit xor.
+ return new ICmpInst(Pred, BOp0, ConstantExpr::getXor(RHS, BOC));
+ } else if (*C == 0) {
+ // Replace ((xor A, B) != 0) with (A != B)
+ return new ICmpInst(Pred, BOp0, BOp1);
+ }
+ }
+ break;
+ case Instruction::Sub:
+ if (BO->hasOneUse()) {
+ const APInt *BOC;
+ if (match(BOp0, m_APInt(BOC))) {
+ // Replace ((sub BOC, B) != C) with (B != BOC-C).
+ Constant *SubC = ConstantExpr::getSub(cast<Constant>(BOp0), RHS);
+ return new ICmpInst(Pred, BOp1, SubC);
+ } else if (*C == 0) {
+ // Replace ((sub A, B) != 0) with (A != B).
+ return new ICmpInst(Pred, BOp0, BOp1);
+ }
+ }
+ break;
+ case Instruction::Or: {
+ const APInt *BOC;
+ if (match(BOp1, m_APInt(BOC)) && BO->hasOneUse() && RHS->isAllOnesValue()) {
+ // Comparing if all bits outside of a constant mask are set?
+ // Replace (X | C) == -1 with (X & ~C) == ~C.
+ // This removes the -1 constant.
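+ // E.g. on i8: (X | 0x0F) == -1 --> (X & 0xF0) == 0xF0.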
+ Constant *NotBOC = ConstantExpr::getNot(cast<Constant>(BOp1));
+ Value *And = Builder->CreateAnd(BOp0, NotBOC);
+ return new ICmpInst(Pred, And, NotBOC);
+ }
+ break;
+ }
+ case Instruction::And: {
+ const APInt *BOC;
+ if (match(BOp1, m_APInt(BOC))) {
+ // If we have ((X & C) == C), turn it into ((X & C) != 0).
+ if (*C == *BOC && C->isPowerOf2())
+ return new ICmpInst(isICMP_NE ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE,
+ BO, Constant::getNullValue(RHS->getType()));
+
+ // Don't perform the following transforms if the AND has multiple uses
+ if (!BO->hasOneUse())
+ break;
- if (AP1 == AP2)
- return getICmp(I.ICMP_EQ, A, ConstantInt::getNullValue(A->getType()));
+ // Replace ((and X, (1 << size(X)-1)) != 0) with (X s< 0).
+ if (BOC->isSignBit()) {
+ Constant *Zero = Constant::getNullValue(BOp0->getType());
+ auto NewPred = isICMP_NE ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_SGE;
+ return new ICmpInst(NewPred, BOp0, Zero);
+ }
- // Get the distance between the lowest bits that are set.
- int Shift = AP1.countTrailingZeros() - AP2TrailingZeros;
+ // ((X & ~7) == 0) --> X < 8
+ if (*C == 0 && (~(*BOC) + 1).isPowerOf2()) {
+ Constant *NegBOC = ConstantExpr::getNeg(cast<Constant>(BOp1));
+ auto NewPred = isICMP_NE ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT;
+ return new ICmpInst(NewPred, BOp0, NegBOC);
+ }
+ }
+ break;
+ }
+ case Instruction::Mul:
+ if (*C == 0 && BO->hasNoSignedWrap()) {
+ const APInt *BOC;
+ if (match(BOp1, m_APInt(BOC)) && *BOC != 0) {
+ // The trivial case (mul X, 0) is handled by InstSimplify.
+ // General case : (mul X, C) != 0 iff X != 0
+ // (mul X, C) == 0 iff X == 0
+ return new ICmpInst(Pred, BOp0, Constant::getNullValue(RHS->getType()));
+ }
+ }
+ break;
+ case Instruction::UDiv:
+ if (*C == 0) {
+ // (icmp eq/ne (udiv A, B), 0) -> (icmp ugt/ule i32 B, A)
+ auto NewPred = isICMP_NE ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_UGT;
+ return new ICmpInst(NewPred, BOp1, BOp0);
+ }
+ break;
+ default:
+ break;
+ }
+ return nullptr;
+}
- if (Shift > 0 && AP2.shl(Shift) == AP1)
- return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift));
+/// Fold an icmp with LLVM intrinsic and constant operand: icmp Pred II, C.
+Instruction *InstCombiner::foldICmpIntrinsicWithConstant(ICmpInst &Cmp,
+ const APInt *C) {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(Cmp.getOperand(0));
+ if (!II || !Cmp.isEquality())
+ return nullptr;
- // Shifting const2 will never be equal to const1.
- return getConstant(false);
+ // Handle icmp {eq|ne} <intrinsic>, intcst.
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::bswap:
+ Worklist.Add(II);
+ Cmp.setOperand(0, II->getArgOperand(0));
+ Cmp.setOperand(1, Builder->getInt(C->byteSwap()));
+ return &Cmp;
+ case Intrinsic::ctlz:
+ case Intrinsic::cttz:
+ // ctz(A) == bitwidth(A) -> A == 0 and likewise for !=
+ if (*C == C->getBitWidth()) {
+ Worklist.Add(II);
+ Cmp.setOperand(0, II->getArgOperand(0));
+ Cmp.setOperand(1, ConstantInt::getNullValue(II->getType()));
+ return &Cmp;
+ }
+ break;
+ case Intrinsic::ctpop: {
+ // popcount(A) == 0 -> A == 0 and likewise for !=
+ // popcount(A) == bitwidth(A) -> A == -1 and likewise for !=
+ bool IsZero = *C == 0;
+ if (IsZero || *C == C->getBitWidth()) {
+ Worklist.Add(II);
+ Cmp.setOperand(0, II->getArgOperand(0));
+ auto *NewOp = IsZero ? Constant::getNullValue(II->getType())
+ : Constant::getAllOnesValue(II->getType());
+ Cmp.setOperand(1, NewOp);
+ return &Cmp;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ return nullptr;
}
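As a sketch of the ctpop case above (hypothetical IR, for illustration only): a population count compared for equality against zero collapses to a direct compare of the intrinsic's operand, so the intrinsic call can later be erased if it has no other uses.

    declare i32 @llvm.ctpop.i32(i32)

    define i1 @popcount_is_zero(i32 %a) {
      %p = call i32 @llvm.ctpop.i32(i32 %a)
      %cmp = icmp eq i32 %p, 0
      ret i1 %cmp
    }
    ; popcount(A) == 0  -->  A == 0
    define i1 @popcount_is_zero_folded(i32 %a) {
      %cmp = icmp eq i32 %a, 0
      ret i1 %cmp
    }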
-/// Handle "icmp (instr, intcst)".
-Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
- Instruction *LHSI,
- ConstantInt *RHS) {
- const APInt &RHSV = RHS->getValue();
+/// Handle icmp with constant (but not simple integer constant) RHS.
+Instruction *InstCombiner::foldICmpInstWithConstantNotInt(ICmpInst &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Constant *RHSC = dyn_cast<Constant>(Op1);
+ Instruction *LHSI = dyn_cast<Instruction>(Op0);
+ if (!RHSC || !LHSI)
+ return nullptr;
switch (LHSI->getOpcode()) {
- case Instruction::Trunc:
- if (RHS->isOne() && RHSV.getBitWidth() > 1) {
- // icmp slt trunc(signum(V)) 1 --> icmp slt V, 1
- Value *V = nullptr;
- if (ICI.getPredicate() == ICmpInst::ICMP_SLT &&
- match(LHSI->getOperand(0), m_Signum(m_Value(V))))
- return new ICmpInst(ICmpInst::ICMP_SLT, V,
- ConstantInt::get(V->getType(), 1));
+ case Instruction::GetElementPtr:
+ // icmp pred GEP (P, int 0, int 0, int 0), null -> icmp pred P, null
+ if (RHSC->isNullValue() &&
+ cast<GetElementPtrInst>(LHSI)->hasAllZeroIndices())
+ return new ICmpInst(
+ I.getPredicate(), LHSI->getOperand(0),
+ Constant::getNullValue(LHSI->getOperand(0)->getType()));
+ break;
+ case Instruction::PHI:
+ // Only fold icmp into the PHI if the phi and icmp are in the same
+ // block. If in the same block, we're encouraging jump threading. If
+ // not, we are just pessimizing the code by making an i1 phi.
+ if (LHSI->getParent() == I.getParent())
+ if (Instruction *NV = FoldOpIntoPhi(I))
+ return NV;
+ break;
+ case Instruction::Select: {
+ // If either operand of the select is a constant, we can fold the
+ // comparison into the select arms, which will cause one to be
+ // constant folded and the select turned into a bitwise or.
+ Value *Op1 = nullptr, *Op2 = nullptr;
+ ConstantInt *CI = nullptr;
+ if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1))) {
+ Op1 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);
+ CI = dyn_cast<ConstantInt>(Op1);
}
- if (ICI.isEquality() && LHSI->hasOneUse()) {
- // Simplify icmp eq (trunc x to i8), 42 -> icmp eq x, 42|highbits if all
- // of the high bits truncated out of x are known.
- unsigned DstBits = LHSI->getType()->getPrimitiveSizeInBits(),
- SrcBits = LHSI->getOperand(0)->getType()->getPrimitiveSizeInBits();
- APInt KnownZero(SrcBits, 0), KnownOne(SrcBits, 0);
- computeKnownBits(LHSI->getOperand(0), KnownZero, KnownOne, 0, &ICI);
-
- // If all the high bits are known, we can do this xform.
- if ((KnownZero|KnownOne).countLeadingOnes() >= SrcBits-DstBits) {
- // Pull in the high bits from known-ones set.
- APInt NewRHS = RHS->getValue().zext(SrcBits);
- NewRHS |= KnownOne & APInt::getHighBitsSet(SrcBits, SrcBits-DstBits);
- return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0),
- Builder->getInt(NewRHS));
- }
+ if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(2))) {
+ Op2 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);
+ CI = dyn_cast<ConstantInt>(Op2);
+ }
+
+ // We only want to perform this transformation if it will not lead to
+ // additional code. This is true if either both sides of the select
+ // fold to a constant (in which case the icmp is replaced with a select
+ // which will usually simplify) or this is the only user of the
+ // select (in which case we are trading a select+icmp for a simpler
+ // select+icmp) or all uses of the select can be replaced based on
+ // dominance information ("Global cases").
+ bool Transform = false;
+ if (Op1 && Op2)
+ Transform = true;
+ else if (Op1 || Op2) {
+ // Local case
+ if (LHSI->hasOneUse())
+ Transform = true;
+ // Global cases
+ else if (CI && !CI->isZero())
+ // When Op1 is constant try replacing select with second operand.
+ // Otherwise Op2 is constant and try replacing select with first
+ // operand.
+ Transform =
+ replacedSelectWithOperand(cast<SelectInst>(LHSI), &I, Op1 ? 2 : 1);
+ }
+ if (Transform) {
+ if (!Op1)
+ Op1 = Builder->CreateICmp(I.getPredicate(), LHSI->getOperand(1), RHSC,
+ I.getName());
+ if (!Op2)
+ Op2 = Builder->CreateICmp(I.getPredicate(), LHSI->getOperand(2), RHSC,
+ I.getName());
+ return SelectInst::Create(LHSI->getOperand(0), Op1, Op2);
}
break;
+ }
+ case Instruction::IntToPtr:
+ // icmp pred inttoptr(X), null -> icmp pred X, 0
+ if (RHSC->isNullValue() &&
+ DL.getIntPtrType(RHSC->getType()) == LHSI->getOperand(0)->getType())
+ return new ICmpInst(
+ I.getPredicate(), LHSI->getOperand(0),
+ Constant::getNullValue(LHSI->getOperand(0)->getType()));
+ break;
- case Instruction::Xor: // (icmp pred (xor X, XorCst), CI)
- if (ConstantInt *XorCst = dyn_cast<ConstantInt>(LHSI->getOperand(1))) {
- // If this is a comparison that tests the signbit (X < 0) or (x > -1),
- // fold the xor.
- if ((ICI.getPredicate() == ICmpInst::ICMP_SLT && RHSV == 0) ||
- (ICI.getPredicate() == ICmpInst::ICMP_SGT && RHSV.isAllOnesValue())) {
- Value *CompareVal = LHSI->getOperand(0);
-
- // If the sign bit of the XorCst is not set, there is no change to
- // the operation, just stop using the Xor.
- if (!XorCst->isNegative()) {
- ICI.setOperand(0, CompareVal);
- Worklist.Add(LHSI);
- return &ICI;
- }
+ case Instruction::Load:
+ // Try to optimize things like "A[i] > 4" to index computations.
+ if (GetElementPtrInst *GEP =
+ dyn_cast<GetElementPtrInst>(LHSI->getOperand(0))) {
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
+ if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
+ !cast<LoadInst>(LHSI)->isVolatile())
+ if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, I))
+ return Res;
+ }
+ break;
+ }
- // Was the old condition true if the operand is positive?
- bool isTrueIfPositive = ICI.getPredicate() == ICmpInst::ICMP_SGT;
+ return nullptr;
+}
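A minimal sketch of the GetElementPtr case above, assuming an all-zero-index GEP compared against null (hypothetical IR): the compare is simply redirected to the GEP's pointer operand.

    define i1 @gep_vs_null([4 x i32]* %p) {
      %g = getelementptr [4 x i32], [4 x i32]* %p, i64 0, i64 0
      %cmp = icmp eq i32* %g, null
      ret i1 %cmp
    }
    ; icmp pred GEP(P, 0, 0), null  -->  icmp pred P, null
    define i1 @gep_vs_null_folded([4 x i32]* %p) {
      %cmp = icmp eq [4 x i32]* %p, null
      ret i1 %cmp
    }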
- // If so, the new one isn't.
- isTrueIfPositive ^= true;
+/// Try to fold icmp (binop), X or icmp X, (binop).
+Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- if (isTrueIfPositive)
- return new ICmpInst(ICmpInst::ICMP_SGT, CompareVal,
- SubOne(RHS));
- else
- return new ICmpInst(ICmpInst::ICMP_SLT, CompareVal,
- AddOne(RHS));
- }
+ // Special logic for binary operators.
+ BinaryOperator *BO0 = dyn_cast<BinaryOperator>(Op0);
+ BinaryOperator *BO1 = dyn_cast<BinaryOperator>(Op1);
+ if (!BO0 && !BO1)
+ return nullptr;
- if (LHSI->hasOneUse()) {
- // (icmp u/s (xor A SignBit), C) -> (icmp s/u A, (xor C SignBit))
- if (!ICI.isEquality() && XorCst->getValue().isSignBit()) {
- const APInt &SignBit = XorCst->getValue();
- ICmpInst::Predicate Pred = ICI.isSigned()
- ? ICI.getUnsignedPredicate()
- : ICI.getSignedPredicate();
- return new ICmpInst(Pred, LHSI->getOperand(0),
- Builder->getInt(RHSV ^ SignBit));
- }
+ CmpInst::Predicate Pred = I.getPredicate();
+ bool NoOp0WrapProblem = false, NoOp1WrapProblem = false;
+ if (BO0 && isa<OverflowingBinaryOperator>(BO0))
+ NoOp0WrapProblem =
+ ICmpInst::isEquality(Pred) ||
+ (CmpInst::isUnsigned(Pred) && BO0->hasNoUnsignedWrap()) ||
+ (CmpInst::isSigned(Pred) && BO0->hasNoSignedWrap());
+ if (BO1 && isa<OverflowingBinaryOperator>(BO1))
+ NoOp1WrapProblem =
+ ICmpInst::isEquality(Pred) ||
+ (CmpInst::isUnsigned(Pred) && BO1->hasNoUnsignedWrap()) ||
+ (CmpInst::isSigned(Pred) && BO1->hasNoSignedWrap());
- // (icmp u/s (xor A ~SignBit), C) -> (icmp s/u (xor C ~SignBit), A)
- if (!ICI.isEquality() && XorCst->isMaxValue(true)) {
- const APInt &NotSignBit = XorCst->getValue();
- ICmpInst::Predicate Pred = ICI.isSigned()
- ? ICI.getUnsignedPredicate()
- : ICI.getSignedPredicate();
- Pred = ICI.getSwappedPredicate(Pred);
- return new ICmpInst(Pred, LHSI->getOperand(0),
- Builder->getInt(RHSV ^ NotSignBit));
- }
- }
+ // Analyze the case when either Op0 or Op1 is an add instruction.
+ // Op0 = A + B (or A and B are null); Op1 = C + D (or C and D are null).
+ Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr;
+ if (BO0 && BO0->getOpcode() == Instruction::Add) {
+ A = BO0->getOperand(0);
+ B = BO0->getOperand(1);
+ }
+ if (BO1 && BO1->getOpcode() == Instruction::Add) {
+ C = BO1->getOperand(0);
+ D = BO1->getOperand(1);
+ }
- // (icmp ugt (xor X, C), ~C) -> (icmp ult X, C)
- // iff -C is a power of 2
- if (ICI.getPredicate() == ICmpInst::ICMP_UGT &&
- XorCst->getValue() == ~RHSV && (RHSV + 1).isPowerOf2())
- return new ICmpInst(ICmpInst::ICMP_ULT, LHSI->getOperand(0), XorCst);
-
- // (icmp ult (xor X, C), -C) -> (icmp uge X, C)
- // iff -C is a power of 2
- if (ICI.getPredicate() == ICmpInst::ICMP_ULT &&
- XorCst->getValue() == -RHSV && RHSV.isPowerOf2())
- return new ICmpInst(ICmpInst::ICMP_UGE, LHSI->getOperand(0), XorCst);
+ // icmp (X+cst) < 0 --> X < -cst
+ if (NoOp0WrapProblem && ICmpInst::isSigned(Pred) && match(Op1, m_Zero()))
+ if (ConstantInt *RHSC = dyn_cast_or_null<ConstantInt>(B))
+ if (!RHSC->isMinValue(/*isSigned=*/true))
+ return new ICmpInst(Pred, A, ConstantExpr::getNeg(RHSC));
+
+ // icmp (X+Y), X -> icmp Y, 0 for equalities or if there is no overflow.
+ if ((A == Op1 || B == Op1) && NoOp0WrapProblem)
+ return new ICmpInst(Pred, A == Op1 ? B : A,
+ Constant::getNullValue(Op1->getType()));
+
+ // icmp X, (X+Y) -> icmp 0, Y for equalities or if there is no overflow.
+ if ((C == Op0 || D == Op0) && NoOp1WrapProblem)
+ return new ICmpInst(Pred, Constant::getNullValue(Op0->getType()),
+ C == Op0 ? D : C);
+
+ // icmp (X+Y), (X+Z) -> icmp Y, Z for equalities or if there is no overflow.
+ if (A && C && (A == C || A == D || B == C || B == D) && NoOp0WrapProblem &&
+ NoOp1WrapProblem &&
+ // Try not to increase register pressure.
+ BO0->hasOneUse() && BO1->hasOneUse()) {
+ // Determine Y and Z in the form icmp (X+Y), (X+Z).
+ Value *Y, *Z;
+ if (A == C) {
+ // C + B == C + D -> B == D
+ Y = B;
+ Z = D;
+ } else if (A == D) {
+ // D + B == C + D -> B == C
+ Y = B;
+ Z = C;
+ } else if (B == C) {
+ // A + C == C + D -> A == D
+ Y = A;
+ Z = D;
+ } else {
+ assert(B == D);
+ // A + D == C + D -> A == C
+ Y = A;
+ Z = C;
}
- break;
- case Instruction::And: // (icmp pred (and X, AndCst), RHS)
- if (LHSI->hasOneUse() && isa<ConstantInt>(LHSI->getOperand(1)) &&
- LHSI->getOperand(0)->hasOneUse()) {
- ConstantInt *AndCst = cast<ConstantInt>(LHSI->getOperand(1));
-
- // If the LHS is an AND of a truncating cast, we can widen the
- // and/compare to be the input width without changing the value
- // produced, eliminating a cast.
- if (TruncInst *Cast = dyn_cast<TruncInst>(LHSI->getOperand(0))) {
- // We can do this transformation if either the AND constant does not
- // have its sign bit set or if it is an equality comparison.
- // Extending a relational comparison when we're checking the sign
- // bit would not work.
- if (ICI.isEquality() ||
- (!AndCst->isNegative() && RHSV.isNonNegative())) {
- Value *NewAnd =
- Builder->CreateAnd(Cast->getOperand(0),
- ConstantExpr::getZExt(AndCst, Cast->getSrcTy()));
- NewAnd->takeName(LHSI);
- return new ICmpInst(ICI.getPredicate(), NewAnd,
- ConstantExpr::getZExt(RHS, Cast->getSrcTy()));
- }
- }
-
- // If the LHS is an AND of a zext, and we have an equality compare, we can
- // shrink the and/compare to the smaller type, eliminating the cast.
- if (ZExtInst *Cast = dyn_cast<ZExtInst>(LHSI->getOperand(0))) {
- IntegerType *Ty = cast<IntegerType>(Cast->getSrcTy());
- // Make sure we don't compare the upper bits, SimplifyDemandedBits
- // should fold the icmp to true/false in that case.
- if (ICI.isEquality() && RHSV.getActiveBits() <= Ty->getBitWidth()) {
- Value *NewAnd =
- Builder->CreateAnd(Cast->getOperand(0),
- ConstantExpr::getTrunc(AndCst, Ty));
- NewAnd->takeName(LHSI);
- return new ICmpInst(ICI.getPredicate(), NewAnd,
- ConstantExpr::getTrunc(RHS, Ty));
- }
- }
-
- // If this is: (X >> C1) & C2 != C3 (where any shift and any compare
- // could exist), turn it into (X & (C2 << C1)) != (C3 << C1). This
- // happens a LOT in code produced by the C front-end, for bitfield
- // access.
- BinaryOperator *Shift = dyn_cast<BinaryOperator>(LHSI->getOperand(0));
- if (Shift && !Shift->isShift())
- Shift = nullptr;
-
- ConstantInt *ShAmt;
- ShAmt = Shift ? dyn_cast<ConstantInt>(Shift->getOperand(1)) : nullptr;
-
- // This seemingly simple opportunity to fold away a shift turns out to
- // be rather complicated. See PR17827
- // ( http://llvm.org/bugs/show_bug.cgi?id=17827 ) for details.
- if (ShAmt) {
- bool CanFold = false;
- unsigned ShiftOpcode = Shift->getOpcode();
- if (ShiftOpcode == Instruction::AShr) {
- // There may be some constraints that make this possible,
- // but nothing simple has been discovered yet.
- CanFold = false;
- } else if (ShiftOpcode == Instruction::Shl) {
- // For a left shift, we can fold if the comparison is not signed.
- // We can also fold a signed comparison if the mask value and
- // comparison value are not negative. These constraints may not be
- // obvious, but we can prove that they are correct using an SMT
- // solver.
- if (!ICI.isSigned() || (!AndCst->isNegative() && !RHS->isNegative()))
- CanFold = true;
- } else if (ShiftOpcode == Instruction::LShr) {
- // For a logical right shift, we can fold if the comparison is not
- // signed. We can also fold a signed comparison if the shifted mask
- // value and the shifted comparison value are not negative.
- // These constraints may not be obvious, but we can prove that they
- // are correct using an SMT solver.
- if (!ICI.isSigned())
- CanFold = true;
- else {
- ConstantInt *ShiftedAndCst =
- cast<ConstantInt>(ConstantExpr::getShl(AndCst, ShAmt));
- ConstantInt *ShiftedRHSCst =
- cast<ConstantInt>(ConstantExpr::getShl(RHS, ShAmt));
-
- if (!ShiftedAndCst->isNegative() && !ShiftedRHSCst->isNegative())
- CanFold = true;
- }
- }
+ return new ICmpInst(Pred, Y, Z);
+ }
- if (CanFold) {
- Constant *NewCst;
- if (ShiftOpcode == Instruction::Shl)
- NewCst = ConstantExpr::getLShr(RHS, ShAmt);
- else
- NewCst = ConstantExpr::getShl(RHS, ShAmt);
-
- // Check to see if we are shifting out any of the bits being
- // compared.
- if (ConstantExpr::get(ShiftOpcode, NewCst, ShAmt) != RHS) {
- // If we shifted bits out, the fold is not going to work out.
- // As a special case, check to see if this means that the
- // result is always true or false now.
- if (ICI.getPredicate() == ICmpInst::ICMP_EQ)
- return replaceInstUsesWith(ICI, Builder->getFalse());
- if (ICI.getPredicate() == ICmpInst::ICMP_NE)
- return replaceInstUsesWith(ICI, Builder->getTrue());
+ // icmp slt (X + -1), Y -> icmp sle X, Y
+ if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SLT &&
+ match(B, m_AllOnes()))
+ return new ICmpInst(CmpInst::ICMP_SLE, A, Op1);
+
+ // icmp sge (X + -1), Y -> icmp sgt X, Y
+ if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SGE &&
+ match(B, m_AllOnes()))
+ return new ICmpInst(CmpInst::ICMP_SGT, A, Op1);
+
+ // icmp sle (X + 1), Y -> icmp slt X, Y
+ if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SLE && match(B, m_One()))
+ return new ICmpInst(CmpInst::ICMP_SLT, A, Op1);
+
+ // icmp sgt (X + 1), Y -> icmp sge X, Y
+ if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SGT && match(B, m_One()))
+ return new ICmpInst(CmpInst::ICMP_SGE, A, Op1);
+
+ // icmp sgt X, (Y + -1) -> icmp sge X, Y
+ if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGT &&
+ match(D, m_AllOnes()))
+ return new ICmpInst(CmpInst::ICMP_SGE, Op0, C);
+
+ // icmp sle X, (Y + -1) -> icmp slt X, Y
+ if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLE &&
+ match(D, m_AllOnes()))
+ return new ICmpInst(CmpInst::ICMP_SLT, Op0, C);
+
+ // icmp sge X, (Y + 1) -> icmp sgt X, Y
+ if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGE && match(D, m_One()))
+ return new ICmpInst(CmpInst::ICMP_SGT, Op0, C);
+
+ // icmp slt X, (Y + 1) -> icmp sle X, Y
+ if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLT && match(D, m_One()))
+ return new ICmpInst(CmpInst::ICMP_SLE, Op0, C);
+
+ // if C1 has greater magnitude than C2:
+ // icmp (X + C1), (Y + C2) -> icmp (X + C3), Y
+ // s.t. C3 = C1 - C2
+ //
+ // if C2 has greater magnitude than C1:
+ // icmp (X + C1), (Y + C2) -> icmp X, (Y + C3)
+ // s.t. C3 = C2 - C1
+ if (A && C && NoOp0WrapProblem && NoOp1WrapProblem &&
+ (BO0->hasOneUse() || BO1->hasOneUse()) && !I.isUnsigned())
+ if (ConstantInt *C1 = dyn_cast<ConstantInt>(B))
+ if (ConstantInt *C2 = dyn_cast<ConstantInt>(D)) {
+ const APInt &AP1 = C1->getValue();
+ const APInt &AP2 = C2->getValue();
+ if (AP1.isNegative() == AP2.isNegative()) {
+ APInt AP1Abs = C1->getValue().abs();
+ APInt AP2Abs = C2->getValue().abs();
+ if (AP1Abs.uge(AP2Abs)) {
+ ConstantInt *C3 = Builder->getInt(AP1 - AP2);
+ Value *NewAdd = Builder->CreateNSWAdd(A, C3);
+ return new ICmpInst(Pred, NewAdd, C);
} else {
- ICI.setOperand(1, NewCst);
- Constant *NewAndCst;
- if (ShiftOpcode == Instruction::Shl)
- NewAndCst = ConstantExpr::getLShr(AndCst, ShAmt);
- else
- NewAndCst = ConstantExpr::getShl(AndCst, ShAmt);
- LHSI->setOperand(1, NewAndCst);
- LHSI->setOperand(0, Shift->getOperand(0));
- Worklist.Add(Shift); // Shift is dead.
- return &ICI;
+ ConstantInt *C3 = Builder->getInt(AP2 - AP1);
+ Value *NewAdd = Builder->CreateNSWAdd(C, C3);
+ return new ICmpInst(Pred, A, NewAdd);
}
}
}
- // Turn ((X >> Y) & C) == 0 into (X & (C << Y)) == 0. The later is
- // preferable because it allows the C<<Y expression to be hoisted out
- // of a loop if Y is invariant and X is not.
- if (Shift && Shift->hasOneUse() && RHSV == 0 &&
- ICI.isEquality() && !Shift->isArithmeticShift() &&
- !isa<Constant>(Shift->getOperand(0))) {
- // Compute C << Y.
- Value *NS;
- if (Shift->getOpcode() == Instruction::LShr) {
- NS = Builder->CreateShl(AndCst, Shift->getOperand(1));
- } else {
- // Insert a logical shift.
- NS = Builder->CreateLShr(AndCst, Shift->getOperand(1));
- }
+ // Analyze the case when either Op0 or Op1 is a sub instruction.
+ // Op0 = A - B (or A and B are null); Op1 = C - D (or C and D are null).
+ A = nullptr;
+ B = nullptr;
+ C = nullptr;
+ D = nullptr;
+ if (BO0 && BO0->getOpcode() == Instruction::Sub) {
+ A = BO0->getOperand(0);
+ B = BO0->getOperand(1);
+ }
+ if (BO1 && BO1->getOpcode() == Instruction::Sub) {
+ C = BO1->getOperand(0);
+ D = BO1->getOperand(1);
+ }
- // Compute X & (C << Y).
- Value *NewAnd =
- Builder->CreateAnd(Shift->getOperand(0), NS, LHSI->getName());
+ // icmp (X-Y), X -> icmp 0, Y for equalities or if there is no overflow.
+ if (A == Op1 && NoOp0WrapProblem)
+ return new ICmpInst(Pred, Constant::getNullValue(Op1->getType()), B);
+
+ // icmp X, (X-Y) -> icmp Y, 0 for equalities or if there is no overflow.
+ if (C == Op0 && NoOp1WrapProblem)
+ return new ICmpInst(Pred, D, Constant::getNullValue(Op0->getType()));
+
+ // icmp (Y-X), (Z-X) -> icmp Y, Z for equalities or if there is no overflow.
+ if (B && D && B == D && NoOp0WrapProblem && NoOp1WrapProblem &&
+ // Try not to increase register pressure.
+ BO0->hasOneUse() && BO1->hasOneUse())
+ return new ICmpInst(Pred, A, C);
+
+ // icmp (X-Y), (X-Z) -> icmp Z, Y for equalities or if there is no overflow.
+ if (A && C && A == C && NoOp0WrapProblem && NoOp1WrapProblem &&
+ // Try not to increase register pressure.
+ BO0->hasOneUse() && BO1->hasOneUse())
+ return new ICmpInst(Pred, D, B);
+
+  // icmp (0-X) < cst --> X > -cst
+ if (NoOp0WrapProblem && ICmpInst::isSigned(Pred)) {
+ Value *X;
+ if (match(BO0, m_Neg(m_Value(X))))
+ if (ConstantInt *RHSC = dyn_cast<ConstantInt>(Op1))
+ if (!RHSC->isMinValue(/*isSigned=*/true))
+ return new ICmpInst(I.getSwappedPredicate(), X,
+ ConstantExpr::getNeg(RHSC));
+ }
- ICI.setOperand(0, NewAnd);
- return &ICI;
- }
+ BinaryOperator *SRem = nullptr;
+ // icmp (srem X, Y), Y
+ if (BO0 && BO0->getOpcode() == Instruction::SRem && Op1 == BO0->getOperand(1))
+ SRem = BO0;
+ // icmp Y, (srem X, Y)
+ else if (BO1 && BO1->getOpcode() == Instruction::SRem &&
+ Op0 == BO1->getOperand(1))
+ SRem = BO1;
+ if (SRem) {
+ // We don't check hasOneUse to avoid increasing register pressure because
+ // the value we use is the same value this instruction was already using.
+ switch (SRem == BO0 ? ICmpInst::getSwappedPredicate(Pred) : Pred) {
+ default:
+ break;
+ case ICmpInst::ICMP_EQ:
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ case ICmpInst::ICMP_NE:
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ case ICmpInst::ICMP_SGT:
+ case ICmpInst::ICMP_SGE:
+ return new ICmpInst(ICmpInst::ICMP_SGT, SRem->getOperand(1),
+ Constant::getAllOnesValue(SRem->getType()));
+ case ICmpInst::ICMP_SLT:
+ case ICmpInst::ICMP_SLE:
+ return new ICmpInst(ICmpInst::ICMP_SLT, SRem->getOperand(1),
+ Constant::getNullValue(SRem->getType()));
+ }
+ }
- // (icmp pred (and (or (lshr X, Y), X), 1), 0) -->
- // (icmp pred (and X, (or (shl 1, Y), 1), 0))
- //
- // iff pred isn't signed
- {
- Value *X, *Y, *LShr;
- if (!ICI.isSigned() && RHSV == 0) {
- if (match(LHSI->getOperand(1), m_One())) {
- Constant *One = cast<Constant>(LHSI->getOperand(1));
- Value *Or = LHSI->getOperand(0);
- if (match(Or, m_Or(m_Value(LShr), m_Value(X))) &&
- match(LShr, m_LShr(m_Specific(X), m_Value(Y)))) {
- unsigned UsesRemoved = 0;
- if (LHSI->hasOneUse())
- ++UsesRemoved;
- if (Or->hasOneUse())
- ++UsesRemoved;
- if (LShr->hasOneUse())
- ++UsesRemoved;
- Value *NewOr = nullptr;
- // Compute X & ((1 << Y) | 1)
- if (auto *C = dyn_cast<Constant>(Y)) {
- if (UsesRemoved >= 1)
- NewOr =
- ConstantExpr::getOr(ConstantExpr::getNUWShl(One, C), One);
- } else {
- if (UsesRemoved >= 3)
- NewOr = Builder->CreateOr(Builder->CreateShl(One, Y,
- LShr->getName(),
- /*HasNUW=*/true),
- One, Or->getName());
- }
- if (NewOr) {
- Value *NewAnd = Builder->CreateAnd(X, NewOr, LHSI->getName());
- ICI.setOperand(0, NewAnd);
- return &ICI;
- }
- }
- }
+ if (BO0 && BO1 && BO0->getOpcode() == BO1->getOpcode() && BO0->hasOneUse() &&
+ BO1->hasOneUse() && BO0->getOperand(1) == BO1->getOperand(1)) {
+ switch (BO0->getOpcode()) {
+ default:
+ break;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Xor:
+ if (I.isEquality()) // a+x icmp eq/ne b+x --> a icmp b
+ return new ICmpInst(I.getPredicate(), BO0->getOperand(0),
+ BO1->getOperand(0));
+ // icmp u/s (a ^ signbit), (b ^ signbit) --> icmp s/u a, b
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(BO0->getOperand(1))) {
+ if (CI->getValue().isSignBit()) {
+ ICmpInst::Predicate Pred =
+ I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate();
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
}
- }
- // Replace ((X & AndCst) > RHSV) with ((X & AndCst) != 0), if any
- // bit set in (X & AndCst) will produce a result greater than RHSV.
- if (ICI.getPredicate() == ICmpInst::ICMP_UGT) {
- unsigned NTZ = AndCst->getValue().countTrailingZeros();
- if ((NTZ < AndCst->getBitWidth()) &&
- APInt::getOneBitSet(AndCst->getBitWidth(), NTZ).ugt(RHSV))
- return new ICmpInst(ICmpInst::ICMP_NE, LHSI,
- Constant::getNullValue(RHS->getType()));
+ if (BO0->getOpcode() == Instruction::Xor && CI->isMaxValue(true)) {
+ ICmpInst::Predicate Pred =
+ I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate();
+ Pred = I.getSwappedPredicate(Pred);
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+ }
}
- }
-
- // Try to optimize things like "A[i]&42 == 0" to index computations.
- if (LoadInst *LI = dyn_cast<LoadInst>(LHSI->getOperand(0))) {
- if (GetElementPtrInst *GEP =
- dyn_cast<GetElementPtrInst>(LI->getOperand(0)))
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
- if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
- !LI->isVolatile() && isa<ConstantInt>(LHSI->getOperand(1))) {
- ConstantInt *C = cast<ConstantInt>(LHSI->getOperand(1));
- if (Instruction *Res = FoldCmpLoadFromIndexedGlobal(GEP, GV,ICI, C))
- return Res;
- }
- }
+ break;
+ case Instruction::Mul:
+ if (!I.isEquality())
+ break;
- // X & -C == -C -> X > u ~C
- // X & -C != -C -> X <= u ~C
- // iff C is a power of 2
- if (ICI.isEquality() && RHS == LHSI->getOperand(1) && (-RHSV).isPowerOf2())
- return new ICmpInst(
- ICI.getPredicate() == ICmpInst::ICMP_EQ ? ICmpInst::ICMP_UGT
- : ICmpInst::ICMP_ULE,
- LHSI->getOperand(0), SubOne(RHS));
-
- // (icmp eq (and %A, C), 0) -> (icmp sgt (trunc %A), -1)
- // iff C is a power of 2
- if (ICI.isEquality() && LHSI->hasOneUse() && match(RHS, m_Zero())) {
- if (auto *CI = dyn_cast<ConstantInt>(LHSI->getOperand(1))) {
- const APInt &AI = CI->getValue();
- int32_t ExactLogBase2 = AI.exactLogBase2();
- if (ExactLogBase2 != -1 && DL.isLegalInteger(ExactLogBase2 + 1)) {
- Type *NTy = IntegerType::get(ICI.getContext(), ExactLogBase2 + 1);
- Value *Trunc = Builder->CreateTrunc(LHSI->getOperand(0), NTy);
- return new ICmpInst(ICI.getPredicate() == ICmpInst::ICMP_EQ
- ? ICmpInst::ICMP_SGE
- : ICmpInst::ICMP_SLT,
- Trunc, Constant::getNullValue(NTy));
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(BO0->getOperand(1))) {
+ // a * Cst icmp eq/ne b * Cst --> a & Mask icmp b & Mask
+ // Mask = -1 >> count-trailing-zeros(Cst).
+ if (!CI->isZero() && !CI->isOne()) {
+ const APInt &AP = CI->getValue();
+ ConstantInt *Mask = ConstantInt::get(
+ I.getContext(),
+ APInt::getLowBitsSet(AP.getBitWidth(),
+ AP.getBitWidth() - AP.countTrailingZeros()));
+ Value *And1 = Builder->CreateAnd(BO0->getOperand(0), Mask);
+ Value *And2 = Builder->CreateAnd(BO1->getOperand(0), Mask);
+ return new ICmpInst(I.getPredicate(), And1, And2);
}
}
+ break;
+ case Instruction::UDiv:
+ case Instruction::LShr:
+ if (I.isSigned())
+ break;
+ LLVM_FALLTHROUGH;
+ case Instruction::SDiv:
+ case Instruction::AShr:
+ if (!BO0->isExact() || !BO1->isExact())
+ break;
+ return new ICmpInst(I.getPredicate(), BO0->getOperand(0),
+ BO1->getOperand(0));
+ case Instruction::Shl: {
+ bool NUW = BO0->hasNoUnsignedWrap() && BO1->hasNoUnsignedWrap();
+ bool NSW = BO0->hasNoSignedWrap() && BO1->hasNoSignedWrap();
+ if (!NUW && !NSW)
+ break;
+ if (!NSW && I.isSigned())
+ break;
+ return new ICmpInst(I.getPredicate(), BO0->getOperand(0),
+ BO1->getOperand(0));
}
- break;
-
- case Instruction::Or: {
- if (RHS->isOne()) {
- // icmp slt signum(V) 1 --> icmp slt V, 1
- Value *V = nullptr;
- if (ICI.getPredicate() == ICmpInst::ICMP_SLT &&
- match(LHSI, m_Signum(m_Value(V))))
- return new ICmpInst(ICmpInst::ICMP_SLT, V,
- ConstantInt::get(V->getType(), 1));
}
+ }
- if (!ICI.isEquality() || !RHS->isNullValue() || !LHSI->hasOneUse())
- break;
- Value *P, *Q;
- if (match(LHSI, m_Or(m_PtrToInt(m_Value(P)), m_PtrToInt(m_Value(Q))))) {
- // Simplify icmp eq (or (ptrtoint P), (ptrtoint Q)), 0
- // -> and (icmp eq P, null), (icmp eq Q, null).
- Value *ICIP = Builder->CreateICmp(ICI.getPredicate(), P,
- Constant::getNullValue(P->getType()));
- Value *ICIQ = Builder->CreateICmp(ICI.getPredicate(), Q,
- Constant::getNullValue(Q->getType()));
- Instruction *Op;
- if (ICI.getPredicate() == ICmpInst::ICMP_EQ)
- Op = BinaryOperator::CreateAnd(ICIP, ICIQ);
- else
- Op = BinaryOperator::CreateOr(ICIP, ICIQ);
- return Op;
+ if (BO0) {
+ // Transform A & (L - 1) `ult` L --> L != 0
+ auto LSubOne = m_Add(m_Specific(Op1), m_AllOnes());
+ auto BitwiseAnd =
+ m_CombineOr(m_And(m_Value(), LSubOne), m_And(LSubOne, m_Value()));
+
+ if (match(BO0, BitwiseAnd) && I.getPredicate() == ICmpInst::ICMP_ULT) {
+ auto *Zero = Constant::getNullValue(BO0->getType());
+ return new ICmpInst(ICmpInst::ICMP_NE, Op1, Zero);
}
- break;
}
- case Instruction::Mul: { // (icmp pred (mul X, Val), CI)
- ConstantInt *Val = dyn_cast<ConstantInt>(LHSI->getOperand(1));
- if (!Val) break;
+ return nullptr;
+}
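One of the folds near the end of foldICmpBinOp, sketched on hypothetical IR: masking a value with L - 1 and then testing u< L only asks whether L itself is non-zero, so the add and the and drop out.

    define i1 @masked_below_limit(i32 %a, i32 %l) {
      %lm1 = add i32 %l, -1
      %masked = and i32 %a, %lm1
      %cmp = icmp ult i32 %masked, %l   ; A & (L - 1) u< L
      ret i1 %cmp
    }
    ; folds to a non-zero test of L:
    define i1 @masked_below_limit_folded(i32 %a, i32 %l) {
      %cmp = icmp ne i32 %l, 0
      ret i1 %cmp
    }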
- // If this is a signed comparison to 0 and the mul is sign preserving,
- // use the mul LHS operand instead.
- ICmpInst::Predicate pred = ICI.getPredicate();
- if (isSignTest(pred, RHS) && !Val->isZero() &&
- cast<BinaryOperator>(LHSI)->hasNoSignedWrap())
- return new ICmpInst(Val->isNegative() ?
- ICmpInst::getSwappedPredicate(pred) : pred,
- LHSI->getOperand(0),
- Constant::getNullValue(RHS->getType()));
+/// Fold icmp Pred min|max(X, Y), X.
+static Instruction *foldICmpWithMinMax(ICmpInst &Cmp) {
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ Value *Op0 = Cmp.getOperand(0);
+ Value *X = Cmp.getOperand(1);
+
+ // Canonicalize minimum or maximum operand to LHS of the icmp.
+ if (match(X, m_c_SMin(m_Specific(Op0), m_Value())) ||
+ match(X, m_c_SMax(m_Specific(Op0), m_Value())) ||
+ match(X, m_c_UMin(m_Specific(Op0), m_Value())) ||
+ match(X, m_c_UMax(m_Specific(Op0), m_Value()))) {
+ std::swap(Op0, X);
+ Pred = Cmp.getSwappedPredicate();
+ }
- break;
+ Value *Y;
+ if (match(Op0, m_c_SMin(m_Specific(X), m_Value(Y)))) {
+ // smin(X, Y) == X --> X s<= Y
+ // smin(X, Y) s>= X --> X s<= Y
+ if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_SGE)
+ return new ICmpInst(ICmpInst::ICMP_SLE, X, Y);
+
+ // smin(X, Y) != X --> X s> Y
+ // smin(X, Y) s< X --> X s> Y
+ if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_SLT)
+ return new ICmpInst(ICmpInst::ICMP_SGT, X, Y);
+
+ // These cases should be handled in InstSimplify:
+ // smin(X, Y) s<= X --> true
+ // smin(X, Y) s> X --> false
+ return nullptr;
}
- case Instruction::Shl: { // (icmp pred (shl X, ShAmt), CI)
- uint32_t TypeBits = RHSV.getBitWidth();
- ConstantInt *ShAmt = dyn_cast<ConstantInt>(LHSI->getOperand(1));
- if (!ShAmt) {
- Value *X;
- // (1 << X) pred P2 -> X pred Log2(P2)
- if (match(LHSI, m_Shl(m_One(), m_Value(X)))) {
- bool RHSVIsPowerOf2 = RHSV.isPowerOf2();
- ICmpInst::Predicate Pred = ICI.getPredicate();
- if (ICI.isUnsigned()) {
- if (!RHSVIsPowerOf2) {
- // (1 << X) < 30 -> X <= 4
- // (1 << X) <= 30 -> X <= 4
- // (1 << X) >= 30 -> X > 4
- // (1 << X) > 30 -> X > 4
- if (Pred == ICmpInst::ICMP_ULT)
- Pred = ICmpInst::ICMP_ULE;
- else if (Pred == ICmpInst::ICMP_UGE)
- Pred = ICmpInst::ICMP_UGT;
- }
- unsigned RHSLog2 = RHSV.logBase2();
-
- // (1 << X) >= 2147483648 -> X >= 31 -> X == 31
- // (1 << X) < 2147483648 -> X < 31 -> X != 31
- if (RHSLog2 == TypeBits-1) {
- if (Pred == ICmpInst::ICMP_UGE)
- Pred = ICmpInst::ICMP_EQ;
- else if (Pred == ICmpInst::ICMP_ULT)
- Pred = ICmpInst::ICMP_NE;
- }
+ if (match(Op0, m_c_SMax(m_Specific(X), m_Value(Y)))) {
+ // smax(X, Y) == X --> X s>= Y
+ // smax(X, Y) s<= X --> X s>= Y
+ if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_SLE)
+ return new ICmpInst(ICmpInst::ICMP_SGE, X, Y);
- return new ICmpInst(Pred, X,
- ConstantInt::get(RHS->getType(), RHSLog2));
- } else if (ICI.isSigned()) {
- if (RHSV.isAllOnesValue()) {
- // (1 << X) <= -1 -> X == 31
- if (Pred == ICmpInst::ICMP_SLE)
- return new ICmpInst(ICmpInst::ICMP_EQ, X,
- ConstantInt::get(RHS->getType(), TypeBits-1));
-
- // (1 << X) > -1 -> X != 31
- if (Pred == ICmpInst::ICMP_SGT)
- return new ICmpInst(ICmpInst::ICMP_NE, X,
- ConstantInt::get(RHS->getType(), TypeBits-1));
- } else if (!RHSV) {
- // (1 << X) < 0 -> X == 31
- // (1 << X) <= 0 -> X == 31
- if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE)
- return new ICmpInst(ICmpInst::ICMP_EQ, X,
- ConstantInt::get(RHS->getType(), TypeBits-1));
-
- // (1 << X) >= 0 -> X != 31
- // (1 << X) > 0 -> X != 31
- if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE)
- return new ICmpInst(ICmpInst::ICMP_NE, X,
- ConstantInt::get(RHS->getType(), TypeBits-1));
- }
- } else if (ICI.isEquality()) {
- if (RHSVIsPowerOf2)
- return new ICmpInst(
- Pred, X, ConstantInt::get(RHS->getType(), RHSV.logBase2()));
- }
- }
- break;
- }
+ // smax(X, Y) != X --> X s< Y
+ // smax(X, Y) s> X --> X s< Y
+ if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_SGT)
+ return new ICmpInst(ICmpInst::ICMP_SLT, X, Y);
- // Check that the shift amount is in range. If not, don't perform
- // undefined shifts. When the shift is visited it will be
- // simplified.
- if (ShAmt->uge(TypeBits))
- break;
+ // These cases should be handled in InstSimplify:
+ // smax(X, Y) s>= X --> true
+ // smax(X, Y) s< X --> false
+ return nullptr;
+ }
- if (ICI.isEquality()) {
- // If we are comparing against bits always shifted out, the
- // comparison cannot succeed.
- Constant *Comp =
- ConstantExpr::getShl(ConstantExpr::getLShr(RHS, ShAmt),
- ShAmt);
- if (Comp != RHS) {// Comparing against a bit that we know is zero.
- bool IsICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE;
- Constant *Cst = Builder->getInt1(IsICMP_NE);
- return replaceInstUsesWith(ICI, Cst);
- }
+ if (match(Op0, m_c_UMin(m_Specific(X), m_Value(Y)))) {
+ // umin(X, Y) == X --> X u<= Y
+ // umin(X, Y) u>= X --> X u<= Y
+ if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_UGE)
+ return new ICmpInst(ICmpInst::ICMP_ULE, X, Y);
- // If the shift is NUW, then it is just shifting out zeros, no need for an
- // AND.
- if (cast<BinaryOperator>(LHSI)->hasNoUnsignedWrap())
- return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0),
- ConstantExpr::getLShr(RHS, ShAmt));
-
- // If the shift is NSW and we compare to 0, then it is just shifting out
- // sign bits, no need for an AND either.
- if (cast<BinaryOperator>(LHSI)->hasNoSignedWrap() && RHSV == 0)
- return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0),
- ConstantExpr::getLShr(RHS, ShAmt));
-
- if (LHSI->hasOneUse()) {
- // Otherwise strength reduce the shift into an and.
- uint32_t ShAmtVal = (uint32_t)ShAmt->getLimitedValue(TypeBits);
- Constant *Mask = Builder->getInt(APInt::getLowBitsSet(TypeBits,
- TypeBits - ShAmtVal));
-
- Value *And =
- Builder->CreateAnd(LHSI->getOperand(0),Mask, LHSI->getName()+".mask");
- return new ICmpInst(ICI.getPredicate(), And,
- ConstantExpr::getLShr(RHS, ShAmt));
- }
- }
+ // umin(X, Y) != X --> X u> Y
+ // umin(X, Y) u< X --> X u> Y
+ if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_ULT)
+ return new ICmpInst(ICmpInst::ICMP_UGT, X, Y);
- // If this is a signed comparison to 0 and the shift is sign preserving,
- // use the shift LHS operand instead.
- ICmpInst::Predicate pred = ICI.getPredicate();
- if (isSignTest(pred, RHS) &&
- cast<BinaryOperator>(LHSI)->hasNoSignedWrap())
- return new ICmpInst(pred,
- LHSI->getOperand(0),
- Constant::getNullValue(RHS->getType()));
-
- // Otherwise, if this is a comparison of the sign bit, simplify to and/test.
- bool TrueIfSigned = false;
- if (LHSI->hasOneUse() &&
- isSignBitCheck(ICI.getPredicate(), RHS, TrueIfSigned)) {
- // (X << 31) <s 0 --> (X&1) != 0
- Constant *Mask = ConstantInt::get(LHSI->getOperand(0)->getType(),
- APInt::getOneBitSet(TypeBits,
- TypeBits-ShAmt->getZExtValue()-1));
- Value *And =
- Builder->CreateAnd(LHSI->getOperand(0), Mask, LHSI->getName()+".mask");
- return new ICmpInst(TrueIfSigned ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ,
- And, Constant::getNullValue(And->getType()));
- }
+ // These cases should be handled in InstSimplify:
+ // umin(X, Y) u<= X --> true
+ // umin(X, Y) u> X --> false
+ return nullptr;
+ }
- // Transform (icmp pred iM (shl iM %v, N), CI)
- // -> (icmp pred i(M-N) (trunc %v iM to i(M-N)), (trunc (CI>>N))
- // Transform the shl to a trunc if (trunc (CI>>N)) has no loss and M-N.
- // This enables to get rid of the shift in favor of a trunc which can be
- // free on the target. It has the additional benefit of comparing to a
- // smaller constant, which will be target friendly.
- unsigned Amt = ShAmt->getLimitedValue(TypeBits-1);
- if (LHSI->hasOneUse() &&
- Amt != 0 && RHSV.countTrailingZeros() >= Amt) {
- Type *NTy = IntegerType::get(ICI.getContext(), TypeBits - Amt);
- Constant *NCI = ConstantExpr::getTrunc(
- ConstantExpr::getAShr(RHS,
- ConstantInt::get(RHS->getType(), Amt)),
- NTy);
- return new ICmpInst(ICI.getPredicate(),
- Builder->CreateTrunc(LHSI->getOperand(0), NTy),
- NCI);
- }
+ if (match(Op0, m_c_UMax(m_Specific(X), m_Value(Y)))) {
+ // umax(X, Y) == X --> X u>= Y
+ // umax(X, Y) u<= X --> X u>= Y
+ if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_ULE)
+ return new ICmpInst(ICmpInst::ICMP_UGE, X, Y);
- break;
+ // umax(X, Y) != X --> X u< Y
+ // umax(X, Y) u> X --> X u< Y
+ if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_UGT)
+ return new ICmpInst(ICmpInst::ICMP_ULT, X, Y);
+
+ // These cases should be handled in InstSimplify:
+ // umax(X, Y) u>= X --> true
+ // umax(X, Y) u< X --> false
+ return nullptr;
}
- case Instruction::LShr: // (icmp pred (shr X, ShAmt), CI)
- case Instruction::AShr: {
- // Handle equality comparisons of shift-by-constant.
- BinaryOperator *BO = cast<BinaryOperator>(LHSI);
- if (ConstantInt *ShAmt = dyn_cast<ConstantInt>(LHSI->getOperand(1))) {
- if (Instruction *Res = FoldICmpShrCst(ICI, BO, ShAmt))
- return Res;
- }
+ return nullptr;
+}
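A sketch of the smin case in foldICmpWithMinMax, written in the select-of-icmp form that the m_c_SMin matcher recognizes (hypothetical IR): comparing smin(X, Y) s>= X reduces to comparing X directly against Y.

    define i1 @smin_sge_x(i32 %x, i32 %y) {
      %lt  = icmp slt i32 %x, %y
      %min = select i1 %lt, i32 %x, i32 %y   ; smin(X, Y)
      %cmp = icmp sge i32 %min, %x
      ret i1 %cmp
    }
    ; smin(X, Y) s>= X  -->  X s<= Y
    define i1 @smin_sge_x_folded(i32 %x, i32 %y) {
      %cmp = icmp sle i32 %x, %y
      ret i1 %cmp
    }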
+
+Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) {
+ if (!I.isEquality())
+ return nullptr;
- // Handle exact shr's.
- if (ICI.isEquality() && BO->isExact() && BO->hasOneUse()) {
- if (RHSV.isMinValue())
- return new ICmpInst(ICI.getPredicate(), BO->getOperand(0), RHS);
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Value *A, *B, *C, *D;
+ if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) {
+ if (A == Op1 || B == Op1) { // (A^B) == A -> B == 0
+ Value *OtherVal = A == Op1 ? B : A;
+ return new ICmpInst(I.getPredicate(), OtherVal,
+ Constant::getNullValue(A->getType()));
}
- break;
- }
- case Instruction::UDiv:
- if (ConstantInt *DivLHS = dyn_cast<ConstantInt>(LHSI->getOperand(0))) {
- Value *X = LHSI->getOperand(1);
- const APInt &C1 = RHS->getValue();
- const APInt &C2 = DivLHS->getValue();
- assert(C2 != 0 && "udiv 0, X should have been simplified already.");
- // (icmp ugt (udiv C2, X), C1) -> (icmp ule X, C2/(C1+1))
- if (ICI.getPredicate() == ICmpInst::ICMP_UGT) {
- assert(!C1.isMaxValue() &&
- "icmp ugt X, UINT_MAX should have been simplified already.");
- return new ICmpInst(ICmpInst::ICMP_ULE, X,
- ConstantInt::get(X->getType(), C2.udiv(C1 + 1)));
- }
- // (icmp ult (udiv C2, X), C1) -> (icmp ugt X, C2/C1)
- if (ICI.getPredicate() == ICmpInst::ICMP_ULT) {
- assert(C1 != 0 && "icmp ult X, 0 should have been simplified already.");
- return new ICmpInst(ICmpInst::ICMP_UGT, X,
- ConstantInt::get(X->getType(), C2.udiv(C1)));
+ if (match(Op1, m_Xor(m_Value(C), m_Value(D)))) {
+ // A^c1 == C^c2 --> A == C^(c1^c2)
+ ConstantInt *C1, *C2;
+ if (match(B, m_ConstantInt(C1)) && match(D, m_ConstantInt(C2)) &&
+ Op1->hasOneUse()) {
+ Constant *NC = Builder->getInt(C1->getValue() ^ C2->getValue());
+ Value *Xor = Builder->CreateXor(C, NC);
+ return new ICmpInst(I.getPredicate(), A, Xor);
}
+
+ // A^B == A^D -> B == D
+ if (A == C)
+ return new ICmpInst(I.getPredicate(), B, D);
+ if (A == D)
+ return new ICmpInst(I.getPredicate(), B, C);
+ if (B == C)
+ return new ICmpInst(I.getPredicate(), A, D);
+ if (B == D)
+ return new ICmpInst(I.getPredicate(), A, C);
}
- // fall-through
- case Instruction::SDiv:
- // Fold: icmp pred ([us]div X, C1), C2 -> range test
- // Fold this div into the comparison, producing a range check.
- // Determine, based on the divide type, what the range is being
- // checked. If there is an overflow on the low or high side, remember
- // it, otherwise compute the range [low, hi) bounding the new value.
- // See: InsertRangeTest above for the kinds of replacements possible.
- if (ConstantInt *DivRHS = dyn_cast<ConstantInt>(LHSI->getOperand(1)))
- if (Instruction *R = FoldICmpDivCst(ICI, cast<BinaryOperator>(LHSI),
- DivRHS))
- return R;
- break;
+ }
- case Instruction::Sub: {
- ConstantInt *LHSC = dyn_cast<ConstantInt>(LHSI->getOperand(0));
- if (!LHSC) break;
- const APInt &LHSV = LHSC->getValue();
-
- // C1-X <u C2 -> (X|(C2-1)) == C1
- // iff C1 & (C2-1) == C2-1
- // C2 is a power of 2
- if (ICI.getPredicate() == ICmpInst::ICMP_ULT && LHSI->hasOneUse() &&
- RHSV.isPowerOf2() && (LHSV & (RHSV - 1)) == (RHSV - 1))
- return new ICmpInst(ICmpInst::ICMP_EQ,
- Builder->CreateOr(LHSI->getOperand(1), RHSV - 1),
- LHSC);
-
- // C1-X >u C2 -> (X|C2) != C1
- // iff C1 & C2 == C2
- // C2+1 is a power of 2
- if (ICI.getPredicate() == ICmpInst::ICMP_UGT && LHSI->hasOneUse() &&
- (RHSV + 1).isPowerOf2() && (LHSV & RHSV) == RHSV)
- return new ICmpInst(ICmpInst::ICMP_NE,
- Builder->CreateOr(LHSI->getOperand(1), RHSV), LHSC);
- break;
+ if (match(Op1, m_Xor(m_Value(A), m_Value(B))) && (A == Op0 || B == Op0)) {
+ // A == (A^B) -> B == 0
+ Value *OtherVal = A == Op0 ? B : A;
+ return new ICmpInst(I.getPredicate(), OtherVal,
+ Constant::getNullValue(A->getType()));
}
- case Instruction::Add:
- // Fold: icmp pred (add X, C1), C2
- if (!ICI.isEquality()) {
- ConstantInt *LHSC = dyn_cast<ConstantInt>(LHSI->getOperand(1));
- if (!LHSC) break;
- const APInt &LHSV = LHSC->getValue();
+ // (X&Z) == (Y&Z) -> (X^Y) & Z == 0
+ if (match(Op0, m_OneUse(m_And(m_Value(A), m_Value(B)))) &&
+ match(Op1, m_OneUse(m_And(m_Value(C), m_Value(D))))) {
+ Value *X = nullptr, *Y = nullptr, *Z = nullptr;
+
+ if (A == C) {
+ X = B;
+ Y = D;
+ Z = A;
+ } else if (A == D) {
+ X = B;
+ Y = C;
+ Z = A;
+ } else if (B == C) {
+ X = A;
+ Y = D;
+ Z = B;
+ } else if (B == D) {
+ X = A;
+ Y = C;
+ Z = B;
+ }
- ConstantRange CR = ICI.makeConstantRange(ICI.getPredicate(), RHSV)
- .subtract(LHSV);
+ if (X) { // Build (X^Y) & Z
+ Op1 = Builder->CreateXor(X, Y);
+ Op1 = Builder->CreateAnd(Op1, Z);
+ I.setOperand(0, Op1);
+ I.setOperand(1, Constant::getNullValue(Op1->getType()));
+ return &I;
+ }
+ }
- if (ICI.isSigned()) {
- if (CR.getLower().isSignBit()) {
- return new ICmpInst(ICmpInst::ICMP_SLT, LHSI->getOperand(0),
- Builder->getInt(CR.getUpper()));
- } else if (CR.getUpper().isSignBit()) {
- return new ICmpInst(ICmpInst::ICMP_SGE, LHSI->getOperand(0),
- Builder->getInt(CR.getLower()));
- }
- } else {
- if (CR.getLower().isMinValue()) {
- return new ICmpInst(ICmpInst::ICMP_ULT, LHSI->getOperand(0),
- Builder->getInt(CR.getUpper()));
- } else if (CR.getUpper().isMinValue()) {
- return new ICmpInst(ICmpInst::ICMP_UGE, LHSI->getOperand(0),
- Builder->getInt(CR.getLower()));
- }
- }
+ // Transform (zext A) == (B & (1<<X)-1) --> A == (trunc B)
+ // and (B & (1<<X)-1) == (zext A) --> A == (trunc B)
+ ConstantInt *Cst1;
+ if ((Op0->hasOneUse() && match(Op0, m_ZExt(m_Value(A))) &&
+ match(Op1, m_And(m_Value(B), m_ConstantInt(Cst1)))) ||
+ (Op1->hasOneUse() && match(Op0, m_And(m_Value(B), m_ConstantInt(Cst1))) &&
+ match(Op1, m_ZExt(m_Value(A))))) {
+ APInt Pow2 = Cst1->getValue() + 1;
+ if (Pow2.isPowerOf2() && isa<IntegerType>(A->getType()) &&
+ Pow2.logBase2() == cast<IntegerType>(A->getType())->getBitWidth())
+ return new ICmpInst(I.getPredicate(), A,
+ Builder->CreateTrunc(B, A->getType()));
+ }
- // X-C1 <u C2 -> (X & -C2) == C1
- // iff C1 & (C2-1) == 0
- // C2 is a power of 2
- if (ICI.getPredicate() == ICmpInst::ICMP_ULT && LHSI->hasOneUse() &&
- RHSV.isPowerOf2() && (LHSV & (RHSV - 1)) == 0)
- return new ICmpInst(ICmpInst::ICMP_EQ,
- Builder->CreateAnd(LHSI->getOperand(0), -RHSV),
- ConstantExpr::getNeg(LHSC));
-
- // X-C1 >u C2 -> (X & ~C2) != C1
- // iff C1 & C2 == 0
- // C2+1 is a power of 2
- if (ICI.getPredicate() == ICmpInst::ICMP_UGT && LHSI->hasOneUse() &&
- (RHSV + 1).isPowerOf2() && (LHSV & RHSV) == 0)
- return new ICmpInst(ICmpInst::ICMP_NE,
- Builder->CreateAnd(LHSI->getOperand(0), ~RHSV),
- ConstantExpr::getNeg(LHSC));
+ // (A >> C) == (B >> C) --> (A^B) u< (1 << C)
+ // For lshr and ashr pairs.
+ if ((match(Op0, m_OneUse(m_LShr(m_Value(A), m_ConstantInt(Cst1)))) &&
+ match(Op1, m_OneUse(m_LShr(m_Value(B), m_Specific(Cst1))))) ||
+ (match(Op0, m_OneUse(m_AShr(m_Value(A), m_ConstantInt(Cst1)))) &&
+ match(Op1, m_OneUse(m_AShr(m_Value(B), m_Specific(Cst1)))))) {
+ unsigned TypeBits = Cst1->getBitWidth();
+ unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits);
+ if (ShAmt < TypeBits && ShAmt != 0) {
+ ICmpInst::Predicate Pred = I.getPredicate() == ICmpInst::ICMP_NE
+ ? ICmpInst::ICMP_UGE
+ : ICmpInst::ICMP_ULT;
+ Value *Xor = Builder->CreateXor(A, B, I.getName() + ".unshifted");
+ APInt CmpVal = APInt::getOneBitSet(TypeBits, ShAmt);
+ return new ICmpInst(Pred, Xor, Builder->getInt(CmpVal));
}
- break;
}
- // Simplify icmp_eq and icmp_ne instructions with integer constant RHS.
- if (ICI.isEquality()) {
- bool isICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE;
-
- // If the first operand is (add|sub|and|or|xor|rem) with a constant, and
- // the second operand is a constant, simplify a bit.
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(LHSI)) {
- switch (BO->getOpcode()) {
- case Instruction::SRem:
- // If we have a signed (X % (2^c)) == 0, turn it into an unsigned one.
- if (RHSV == 0 && isa<ConstantInt>(BO->getOperand(1)) &&BO->hasOneUse()){
- const APInt &V = cast<ConstantInt>(BO->getOperand(1))->getValue();
- if (V.sgt(1) && V.isPowerOf2()) {
- Value *NewRem =
- Builder->CreateURem(BO->getOperand(0), BO->getOperand(1),
- BO->getName());
- return new ICmpInst(ICI.getPredicate(), NewRem,
- Constant::getNullValue(BO->getType()));
- }
- }
- break;
- case Instruction::Add:
- // Replace ((add A, B) != C) with (A != C-B) if B & C are constants.
- if (ConstantInt *BOp1C = dyn_cast<ConstantInt>(BO->getOperand(1))) {
- if (BO->hasOneUse())
- return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
- ConstantExpr::getSub(RHS, BOp1C));
- } else if (RHSV == 0) {
- // Replace ((add A, B) != 0) with (A != -B) if A or B is
- // efficiently invertible, or if the add has just this one use.
- Value *BOp0 = BO->getOperand(0), *BOp1 = BO->getOperand(1);
-
- if (Value *NegVal = dyn_castNegVal(BOp1))
- return new ICmpInst(ICI.getPredicate(), BOp0, NegVal);
- if (Value *NegVal = dyn_castNegVal(BOp0))
- return new ICmpInst(ICI.getPredicate(), NegVal, BOp1);
- if (BO->hasOneUse()) {
- Value *Neg = Builder->CreateNeg(BOp1);
- Neg->takeName(BO);
- return new ICmpInst(ICI.getPredicate(), BOp0, Neg);
- }
- }
- break;
- case Instruction::Xor:
- if (BO->hasOneUse()) {
- if (Constant *BOC = dyn_cast<Constant>(BO->getOperand(1))) {
- // For the xor case, we can xor two constants together, eliminating
- // the explicit xor.
- return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
- ConstantExpr::getXor(RHS, BOC));
- } else if (RHSV == 0) {
- // Replace ((xor A, B) != 0) with (A != B)
- return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
- BO->getOperand(1));
- }
- }
- break;
- case Instruction::Sub:
- if (BO->hasOneUse()) {
- if (ConstantInt *BOp0C = dyn_cast<ConstantInt>(BO->getOperand(0))) {
- // Replace ((sub A, B) != C) with (B != A-C) if A & C are constants.
- return new ICmpInst(ICI.getPredicate(), BO->getOperand(1),
- ConstantExpr::getSub(BOp0C, RHS));
- } else if (RHSV == 0) {
- // Replace ((sub A, B) != 0) with (A != B)
- return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
- BO->getOperand(1));
- }
- }
- break;
- case Instruction::Or:
- // If bits are being or'd in that are not present in the constant we
- // are comparing against, then the comparison could never succeed!
- if (ConstantInt *BOC = dyn_cast<ConstantInt>(BO->getOperand(1))) {
- Constant *NotCI = ConstantExpr::getNot(RHS);
- if (!ConstantExpr::getAnd(BOC, NotCI)->isNullValue())
- return replaceInstUsesWith(ICI, Builder->getInt1(isICMP_NE));
-
- // Comparing if all bits outside of a constant mask are set?
- // Replace (X | C) == -1 with (X & ~C) == ~C.
- // This removes the -1 constant.
- if (BO->hasOneUse() && RHS->isAllOnesValue()) {
- Constant *NotBOC = ConstantExpr::getNot(BOC);
- Value *And = Builder->CreateAnd(BO->getOperand(0), NotBOC);
- return new ICmpInst(ICI.getPredicate(), And, NotBOC);
- }
- }
- break;
+ // (A << C) == (B << C) --> ((A^B) & (~0U >> C)) == 0
+ if (match(Op0, m_OneUse(m_Shl(m_Value(A), m_ConstantInt(Cst1)))) &&
+ match(Op1, m_OneUse(m_Shl(m_Value(B), m_Specific(Cst1))))) {
+ unsigned TypeBits = Cst1->getBitWidth();
+ unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits);
+ if (ShAmt < TypeBits && ShAmt != 0) {
+ Value *Xor = Builder->CreateXor(A, B, I.getName() + ".unshifted");
+ APInt AndVal = APInt::getLowBitsSet(TypeBits, TypeBits - ShAmt);
+ Value *And = Builder->CreateAnd(Xor, Builder->getInt(AndVal),
+ I.getName() + ".mask");
+ return new ICmpInst(I.getPredicate(), And,
+ Constant::getNullValue(Cst1->getType()));
+ }
+ }
- case Instruction::And:
- if (ConstantInt *BOC = dyn_cast<ConstantInt>(BO->getOperand(1))) {
- // If bits are being compared against that are and'd out, then the
- // comparison can never succeed!
- if ((RHSV & ~BOC->getValue()) != 0)
- return replaceInstUsesWith(ICI, Builder->getInt1(isICMP_NE));
-
- // If we have ((X & C) == C), turn it into ((X & C) != 0).
- if (RHS == BOC && RHSV.isPowerOf2())
- return new ICmpInst(isICMP_NE ? ICmpInst::ICMP_EQ :
- ICmpInst::ICMP_NE, LHSI,
- Constant::getNullValue(RHS->getType()));
-
- // Don't perform the following transforms if the AND has multiple uses
- if (!BO->hasOneUse())
- break;
+ // Transform "icmp eq (trunc (lshr(X, cst1)), cst" to
+ // "icmp (and X, mask), cst"
+ uint64_t ShAmt = 0;
+ if (Op0->hasOneUse() &&
+ match(Op0, m_Trunc(m_OneUse(m_LShr(m_Value(A), m_ConstantInt(ShAmt))))) &&
+ match(Op1, m_ConstantInt(Cst1)) &&
+ // Only do this when A has multiple uses. This is most important to do
+ // when it exposes other optimizations.
+ !A->hasOneUse()) {
+ unsigned ASize = cast<IntegerType>(A->getType())->getPrimitiveSizeInBits();
+
+ if (ShAmt < ASize) {
+ APInt MaskV =
+ APInt::getLowBitsSet(ASize, Op0->getType()->getPrimitiveSizeInBits());
+ MaskV <<= ShAmt;
- // Replace (and X, (1 << size(X)-1) != 0) with x s< 0
- if (BOC->getValue().isSignBit()) {
- Value *X = BO->getOperand(0);
- Constant *Zero = Constant::getNullValue(X->getType());
- ICmpInst::Predicate pred = isICMP_NE ?
- ICmpInst::ICMP_SLT : ICmpInst::ICMP_SGE;
- return new ICmpInst(pred, X, Zero);
- }
+ APInt CmpV = Cst1->getValue().zext(ASize);
+ CmpV <<= ShAmt;
- // ((X & ~7) == 0) --> X < 8
- if (RHSV == 0 && isHighOnes(BOC)) {
- Value *X = BO->getOperand(0);
- Constant *NegX = ConstantExpr::getNeg(BOC);
- ICmpInst::Predicate pred = isICMP_NE ?
- ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT;
- return new ICmpInst(pred, X, NegX);
- }
- }
- break;
- case Instruction::Mul:
- if (RHSV == 0 && BO->hasNoSignedWrap()) {
- if (ConstantInt *BOC = dyn_cast<ConstantInt>(BO->getOperand(1))) {
- // The trivial case (mul X, 0) is handled by InstSimplify
- // General case : (mul X, C) != 0 iff X != 0
- // (mul X, C) == 0 iff X == 0
- if (!BOC->isZero())
- return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
- Constant::getNullValue(RHS->getType()));
- }
- }
- break;
- default: break;
- }
- } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(LHSI)) {
- // Handle icmp {eq|ne} <intrinsic>, intcst.
- switch (II->getIntrinsicID()) {
- case Intrinsic::bswap:
- Worklist.Add(II);
- ICI.setOperand(0, II->getArgOperand(0));
- ICI.setOperand(1, Builder->getInt(RHSV.byteSwap()));
- return &ICI;
- case Intrinsic::ctlz:
- case Intrinsic::cttz:
- // ctz(A) == bitwidth(a) -> A == 0 and likewise for !=
- if (RHSV == RHS->getType()->getBitWidth()) {
- Worklist.Add(II);
- ICI.setOperand(0, II->getArgOperand(0));
- ICI.setOperand(1, ConstantInt::get(RHS->getType(), 0));
- return &ICI;
- }
- break;
- case Intrinsic::ctpop:
- // popcount(A) == 0 -> A == 0 and likewise for !=
- if (RHS->isZero()) {
- Worklist.Add(II);
- ICI.setOperand(0, II->getArgOperand(0));
- ICI.setOperand(1, RHS);
- return &ICI;
- }
- break;
- default:
- break;
- }
+ Value *Mask = Builder->CreateAnd(A, Builder->getInt(MaskV));
+ return new ICmpInst(I.getPredicate(), Mask, Builder->getInt(CmpV));
}
}
+
return nullptr;
}
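A sketch of the shifted-equality fold above, taking C = 3 (hypothetical IR): two single-use right shifts by the same amount compared for equality become one xor tested against 1 << C.

    define i1 @shr_eq(i32 %a, i32 %b) {
      %sa  = lshr i32 %a, 3
      %sb  = lshr i32 %b, 3
      %cmp = icmp eq i32 %sa, %sb
      ret i1 %cmp
    }
    ; (A >> 3) == (B >> 3)  -->  (A ^ B) u< 8
    define i1 @shr_eq_folded(i32 %a, i32 %b) {
      %x   = xor i32 %a, %b
      %cmp = icmp ult i32 %x, 8
      ret i1 %cmp
    }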
/// Handle icmp (cast x to y), (cast/cst). We only handle extending casts so
/// far.
-Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICmp) {
+Instruction *InstCombiner::foldICmpWithCastAndCast(ICmpInst &ICmp) {
const CastInst *LHSCI = cast<CastInst>(ICmp.getOperand(0));
Value *LHSCIOp = LHSCI->getOperand(0);
Type *SrcTy = LHSCIOp->getType();
@@ -2485,92 +3384,6 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICmp) {
return BinaryOperator::CreateNot(Result);
}
-/// The caller has matched a pattern of the form:
-/// I = icmp ugt (add (add A, B), CI2), CI1
-/// If this is of the form:
-/// sum = a + b
-/// if (sum+128 >u 255)
-/// Then replace it with llvm.sadd.with.overflow.i8.
-///
-static Instruction *ProcessUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B,
- ConstantInt *CI2, ConstantInt *CI1,
- InstCombiner &IC) {
- // The transformation we're trying to do here is to transform this into an
- // llvm.sadd.with.overflow. To do this, we have to replace the original add
- // with a narrower add, and discard the add-with-constant that is part of the
- // range check (if we can't eliminate it, this isn't profitable).
-
- // In order to eliminate the add-with-constant, the compare can be its only
- // use.
- Instruction *AddWithCst = cast<Instruction>(I.getOperand(0));
- if (!AddWithCst->hasOneUse()) return nullptr;
-
- // If CI2 is 2^7, 2^15, 2^31, then it might be an sadd.with.overflow.
- if (!CI2->getValue().isPowerOf2()) return nullptr;
- unsigned NewWidth = CI2->getValue().countTrailingZeros();
- if (NewWidth != 7 && NewWidth != 15 && NewWidth != 31) return nullptr;
-
- // The width of the new add formed is 1 more than the bias.
- ++NewWidth;
-
- // Check to see that CI1 is an all-ones value with NewWidth bits.
- if (CI1->getBitWidth() == NewWidth ||
- CI1->getValue() != APInt::getLowBitsSet(CI1->getBitWidth(), NewWidth))
- return nullptr;
-
- // This is only really a signed overflow check if the inputs have been
- // sign-extended; check for that condition. For example, if CI2 is 2^31 and
- // the operands of the add are 64 bits wide, we need at least 33 sign bits.
- unsigned NeededSignBits = CI1->getBitWidth() - NewWidth + 1;
- if (IC.ComputeNumSignBits(A, 0, &I) < NeededSignBits ||
- IC.ComputeNumSignBits(B, 0, &I) < NeededSignBits)
- return nullptr;
-
- // In order to replace the original add with a narrower
- // llvm.sadd.with.overflow, the only uses allowed are the add-with-constant
- // and truncates that discard the high bits of the add. Verify that this is
- // the case.
- Instruction *OrigAdd = cast<Instruction>(AddWithCst->getOperand(0));
- for (User *U : OrigAdd->users()) {
- if (U == AddWithCst) continue;
-
- // Only accept truncates for now. We would really like a nice recursive
- // predicate like SimplifyDemandedBits, but which goes downwards the use-def
- // chain to see which bits of a value are actually demanded. If the
- // original add had another add which was then immediately truncated, we
- // could still do the transformation.
- TruncInst *TI = dyn_cast<TruncInst>(U);
- if (!TI || TI->getType()->getPrimitiveSizeInBits() > NewWidth)
- return nullptr;
- }
-
- // If the pattern matches, truncate the inputs to the narrower type and
- // use the sadd_with_overflow intrinsic to efficiently compute both the
- // result and the overflow bit.
- Type *NewType = IntegerType::get(OrigAdd->getContext(), NewWidth);
- Value *F = Intrinsic::getDeclaration(I.getModule(),
- Intrinsic::sadd_with_overflow, NewType);
-
- InstCombiner::BuilderTy *Builder = IC.Builder;
-
- // Put the new code above the original add, in case there are any uses of the
- // add between the add and the compare.
- Builder->SetInsertPoint(OrigAdd);
-
- Value *TruncA = Builder->CreateTrunc(A, NewType, A->getName()+".trunc");
- Value *TruncB = Builder->CreateTrunc(B, NewType, B->getName()+".trunc");
- CallInst *Call = Builder->CreateCall(F, {TruncA, TruncB}, "sadd");
- Value *Add = Builder->CreateExtractValue(Call, 0, "sadd.result");
- Value *ZExt = Builder->CreateZExt(Add, OrigAdd->getType());
-
- // The inner add was the result of the narrow add, zero extended to the
- // wider type. Replace it with the result computed by the intrinsic.
- IC.replaceInstUsesWith(*OrigAdd, ZExt);
-
- // The original icmp gets replaced with the overflow value.
- return ExtractValueInst::Create(Call, 1, "sadd.overflow");
-}
-
bool InstCombiner::OptimizeOverflowCheck(OverflowCheckFlavor OCF, Value *LHS,
Value *RHS, Instruction &OrigI,
Value *&Result, Constant *&Overflow) {
@@ -2603,8 +3416,10 @@ bool InstCombiner::OptimizeOverflowCheck(OverflowCheckFlavor OCF, Value *LHS,
if (OR == OverflowResult::AlwaysOverflows)
return SetResult(Builder->CreateAdd(LHS, RHS), Builder->getTrue(), true);
+
+ // Fall through uadd into sadd
+ LLVM_FALLTHROUGH;
}
- // FALL THROUGH uadd into sadd
case OCF_SIGNED_ADD: {
// X + 0 -> {X, false}
if (match(RHS, m_Zero()))
@@ -2644,7 +3459,8 @@ bool InstCombiner::OptimizeOverflowCheck(OverflowCheckFlavor OCF, Value *LHS,
true);
if (OR == OverflowResult::AlwaysOverflows)
return SetResult(Builder->CreateMul(LHS, RHS), Builder->getTrue(), true);
- } // FALL THROUGH
+ LLVM_FALLTHROUGH;
+ }
case OCF_SIGNED_MUL:
// X * undef -> undef
if (isa<UndefValue>(RHS))
@@ -2682,7 +3498,7 @@ bool InstCombiner::OptimizeOverflowCheck(OverflowCheckFlavor OCF, Value *LHS,
/// \param OtherVal The other argument of compare instruction.
/// \returns Instruction which must replace the compare instruction, NULL if no
/// replacement required.
-static Instruction *ProcessUMulZExtIdiom(ICmpInst &I, Value *MulVal,
+static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal,
Value *OtherVal, InstCombiner &IC) {
// Don't bother doing this transformation for pointers, don't do it for
// vectors.
@@ -2906,8 +3722,8 @@ static Instruction *ProcessUMulZExtIdiom(ICmpInst &I, Value *MulVal,
/// When performing a comparison against a constant, it is possible that not all
/// the bits in the LHS are demanded. This helper method computes the mask that
/// IS demanded.
-static APInt DemandedBitsLHSMask(ICmpInst &I,
- unsigned BitWidth, bool isSignCheck) {
+static APInt getDemandedBitsLHSMask(ICmpInst &I, unsigned BitWidth,
+ bool isSignCheck) {
if (isSignCheck)
return APInt::getSignBit(BitWidth);
@@ -2981,7 +3797,7 @@ static bool swapMayExposeCSEOpportunities(const Value * Op0,
}
/// \brief Check that one use is in the same block as the definition and all
-/// other uses are in blocks dominated by a given block
+/// other uses are in blocks dominated by a given block.
///
/// \param DI Definition
/// \param UI Use
@@ -2994,21 +3810,18 @@ bool InstCombiner::dominatesAllUses(const Instruction *DI,
const Instruction *UI,
const BasicBlock *DB) const {
assert(DI && UI && "Instruction not defined\n");
- // ignore incomplete definitions
+ // Ignore incomplete definitions.
if (!DI->getParent())
return false;
- // DI and UI must be in the same block
+ // DI and UI must be in the same block.
if (DI->getParent() != UI->getParent())
return false;
- // Protect from self-referencing blocks
+ // Protect from self-referencing blocks.
if (DI->getParent() == DB)
return false;
- // DominatorTree available?
- if (!DT)
- return false;
for (const User *U : DI->users()) {
auto *Usr = cast<Instruction>(U);
- if (Usr != UI && !DT->dominates(DB, Usr->getParent()))
+ if (Usr != UI && !DT.dominates(DB, Usr->getParent()))
return false;
}
return true;
@@ -3067,8 +3880,7 @@ static bool isChainSelectCmpBranch(const SelectInst *SI) {
/// are equal, the optimization can work only for EQ predicates. This is not a
/// major restriction since a NE compare should be 'normalized' to an equal
/// compare, which usually happens in the combiner and test case
-/// select-cmp-br.ll
-/// checks for it.
+/// select-cmp-br.ll checks for it.
bool InstCombiner::replacedSelectWithOperand(SelectInst *SI,
const ICmpInst *Icmp,
const unsigned SIOpd) {
@@ -3076,7 +3888,7 @@ bool InstCombiner::replacedSelectWithOperand(SelectInst *SI,
if (isChainSelectCmpBranch(SI) && Icmp->getPredicate() == ICmpInst::ICMP_EQ) {
BasicBlock *Succ = SI->getParent()->getTerminator()->getSuccessor(1);
// The check for the unique predecessor is not the best that can be
- // done. But it protects efficiently against cases like when SI's
+ // done. But it protects efficiently against cases like when SI's
    // home block has two successors, Succ and Succ1, and Succ1 is a predecessor

// of Succ. Then SI can't be replaced by SIOpd because the use that gets
// replaced can be reached on either path. So the uniqueness check
@@ -3093,6 +3905,229 @@ bool InstCombiner::replacedSelectWithOperand(SelectInst *SI,
return false;
}
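For orientation, a hypothetical C-level shape of the select/compare/branch chain this helper operates on (not taken from the test file it mentions):

    // IR shape: %sel = select i1 %c, ...; %cmp = icmp eq %sel, %w; br i1 %cmp, ...
    int chain(bool c, int a, int b, int w) {
      int sel = c ? a : b;  // the select
      if (sel == w)         // the equality compare feeding the branch
        return 1;           // on this path, sel is known to equal w
      return 0;
    }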
+/// Try to fold the comparison based on range information we can get by checking
+/// whether bits are known to be zero or one in the inputs.
+Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Type *Ty = Op0->getType();
+ ICmpInst::Predicate Pred = I.getPredicate();
+
+ // Get scalar or pointer size.
+ unsigned BitWidth = Ty->isIntOrIntVectorTy()
+ ? Ty->getScalarSizeInBits()
+ : DL.getTypeSizeInBits(Ty->getScalarType());
+
+ if (!BitWidth)
+ return nullptr;
+
+ // If this is a normal comparison, it demands all bits. If it is a sign bit
+ // comparison, it only demands the sign bit.
+ bool IsSignBit = false;
+ const APInt *CmpC;
+ if (match(Op1, m_APInt(CmpC))) {
+ bool UnusedBit;
+ IsSignBit = isSignBitCheck(Pred, *CmpC, UnusedBit);
+ }
+
+ APInt Op0KnownZero(BitWidth, 0), Op0KnownOne(BitWidth, 0);
+ APInt Op1KnownZero(BitWidth, 0), Op1KnownOne(BitWidth, 0);
+
+ if (SimplifyDemandedBits(I.getOperandUse(0),
+ getDemandedBitsLHSMask(I, BitWidth, IsSignBit),
+ Op0KnownZero, Op0KnownOne, 0))
+ return &I;
+
+ if (SimplifyDemandedBits(I.getOperandUse(1), APInt::getAllOnesValue(BitWidth),
+ Op1KnownZero, Op1KnownOne, 0))
+ return &I;
+
+ // Given the known and unknown bits, compute a range that the LHS could be
+ // in. Compute the Min, Max and RHS values based on the known bits. For the
+ // EQ and NE we use unsigned values.
+ APInt Op0Min(BitWidth, 0), Op0Max(BitWidth, 0);
+ APInt Op1Min(BitWidth, 0), Op1Max(BitWidth, 0);
+ if (I.isSigned()) {
+ computeSignedMinMaxValuesFromKnownBits(Op0KnownZero, Op0KnownOne, Op0Min,
+ Op0Max);
+ computeSignedMinMaxValuesFromKnownBits(Op1KnownZero, Op1KnownOne, Op1Min,
+ Op1Max);
+ } else {
+ computeUnsignedMinMaxValuesFromKnownBits(Op0KnownZero, Op0KnownOne, Op0Min,
+ Op0Max);
+ computeUnsignedMinMaxValuesFromKnownBits(Op1KnownZero, Op1KnownOne, Op1Min,
+ Op1Max);
+ }
+
+ // If Min and Max are known to be the same, then SimplifyDemandedBits
+ // figured out that the LHS is a constant. Constant fold this now, so that
+ // code below can assume that Min != Max.
+ if (!isa<Constant>(Op0) && Op0Min == Op0Max)
+ return new ICmpInst(Pred, ConstantInt::get(Op0->getType(), Op0Min), Op1);
+ if (!isa<Constant>(Op1) && Op1Min == Op1Max)
+ return new ICmpInst(Pred, Op0, ConstantInt::get(Op1->getType(), Op1Min));
+
+ // Based on the range information we know about the LHS, see if we can
+ // simplify this comparison. For example, (x&4) < 8 is always true.
+ switch (Pred) {
+ default:
+ llvm_unreachable("Unknown icmp opcode!");
+ case ICmpInst::ICMP_EQ:
+ case ICmpInst::ICMP_NE: {
+ if (Op0Max.ult(Op1Min) || Op0Min.ugt(Op1Max)) {
+ return Pred == CmpInst::ICMP_EQ
+ ? replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()))
+ : replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ }
+
+ // If all bits are known zero except for one, then we know at most one bit
+ // is set. If the comparison is against zero, then this is a check to see if
+ // *that* bit is set.
+ APInt Op0KnownZeroInverted = ~Op0KnownZero;
+ if (~Op1KnownZero == 0) {
+ // If the LHS is an AND with the same constant, look through it.
+ Value *LHS = nullptr;
+ const APInt *LHSC;
+ if (!match(Op0, m_And(m_Value(LHS), m_APInt(LHSC))) ||
+ *LHSC != Op0KnownZeroInverted)
+ LHS = Op0;
+
+ Value *X;
+ if (match(LHS, m_Shl(m_One(), m_Value(X)))) {
+ APInt ValToCheck = Op0KnownZeroInverted;
+ Type *XTy = X->getType();
+ if (ValToCheck.isPowerOf2()) {
+ // ((1 << X) & 8) == 0 -> X != 3
+ // ((1 << X) & 8) != 0 -> X == 3
+ auto *CmpC = ConstantInt::get(XTy, ValToCheck.countTrailingZeros());
+ auto NewPred = ICmpInst::getInversePredicate(Pred);
+ return new ICmpInst(NewPred, X, CmpC);
+ } else if ((++ValToCheck).isPowerOf2()) {
+ // ((1 << X) & 7) == 0 -> X >= 3
+ // ((1 << X) & 7) != 0 -> X < 3
+ auto *CmpC = ConstantInt::get(XTy, ValToCheck.countTrailingZeros());
+ auto NewPred =
+ Pred == CmpInst::ICMP_EQ ? CmpInst::ICMP_UGE : CmpInst::ICMP_ULT;
+ return new ICmpInst(NewPred, X, CmpC);
+ }
+ }
+
+ // Check if the LHS is 8 >>u x and the result is a power of 2 like 1.
+ const APInt *CI;
+ if (Op0KnownZeroInverted == 1 &&
+ match(LHS, m_LShr(m_Power2(CI), m_Value(X)))) {
+ // ((8 >>u X) & 1) == 0 -> X != 3
+ // ((8 >>u X) & 1) != 0 -> X == 3
+ unsigned CmpVal = CI->countTrailingZeros();
+ auto NewPred = ICmpInst::getInversePredicate(Pred);
+ return new ICmpInst(NewPred, X, ConstantInt::get(X->getType(), CmpVal));
+ }
+ }
+ break;
+ }
+ case ICmpInst::ICMP_ULT: {
+ if (Op0Max.ult(Op1Min)) // A <u B -> true if max(A) < min(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ if (Op0Min.uge(Op1Max)) // A <u B -> false if min(A) >= max(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ if (Op1Min == Op0Max) // A <u B -> A != B if max(A) == min(B)
+ return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+
+ const APInt *CmpC;
+ if (match(Op1, m_APInt(CmpC))) {
+ // A <u C -> A == C-1 if min(A)+1 == C
+ if (Op1Max == Op0Min + 1) {
+ Constant *CMinus1 = ConstantInt::get(Op0->getType(), *CmpC - 1);
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0, CMinus1);
+ }
+ }
+ break;
+ }
+ case ICmpInst::ICMP_UGT: {
+ if (Op0Min.ugt(Op1Max)) // A >u B -> true if min(A) > max(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+
+    if (Op0Max.ule(Op1Min)) // A >u B -> false if max(A) <= min(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+
+ if (Op1Max == Op0Min) // A >u B -> A != B if min(A) == max(B)
+ return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+
+ const APInt *CmpC;
+ if (match(Op1, m_APInt(CmpC))) {
+      // A >u C -> A == C+1 if max(A)-1 == C
+ if (*CmpC == Op0Max - 1)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
+ ConstantInt::get(Op1->getType(), *CmpC + 1));
+ }
+ break;
+ }
+ case ICmpInst::ICMP_SLT:
+    if (Op0Max.slt(Op1Min)) // A <s B -> true if max(A) < min(B)
+      return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+    if (Op0Min.sge(Op1Max)) // A <s B -> false if min(A) >= max(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ if (Op1Min == Op0Max) // A <s B -> A != B if max(A) == min(B)
+ return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
+ if (Op1Max == Op0Min + 1) // A <s C -> A == C-1 if min(A)+1 == C
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
+ Builder->getInt(CI->getValue() - 1));
+ }
+ break;
+ case ICmpInst::ICMP_SGT:
+ if (Op0Min.sgt(Op1Max)) // A >s B -> true if min(A) > max(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ if (Op0Max.sle(Op1Min)) // A >s B -> false if max(A) <= min(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+
+ if (Op1Max == Op0Min) // A >s B -> A != B if min(A) == max(B)
+ return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
+ if (Op1Min == Op0Max - 1) // A >s C -> A == C+1 if max(A)-1 == C
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
+ Builder->getInt(CI->getValue() + 1));
+ }
+ break;
+ case ICmpInst::ICMP_SGE:
+ assert(!isa<ConstantInt>(Op1) && "ICMP_SGE with ConstantInt not folded!");
+ if (Op0Min.sge(Op1Max)) // A >=s B -> true if min(A) >= max(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ if (Op0Max.slt(Op1Min)) // A >=s B -> false if max(A) < min(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ break;
+ case ICmpInst::ICMP_SLE:
+ assert(!isa<ConstantInt>(Op1) && "ICMP_SLE with ConstantInt not folded!");
+ if (Op0Max.sle(Op1Min)) // A <=s B -> true if max(A) <= min(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ if (Op0Min.sgt(Op1Max)) // A <=s B -> false if min(A) > max(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ break;
+ case ICmpInst::ICMP_UGE:
+ assert(!isa<ConstantInt>(Op1) && "ICMP_UGE with ConstantInt not folded!");
+ if (Op0Min.uge(Op1Max)) // A >=u B -> true if min(A) >= max(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ if (Op0Max.ult(Op1Min)) // A >=u B -> false if max(A) < min(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ break;
+ case ICmpInst::ICMP_ULE:
+ assert(!isa<ConstantInt>(Op1) && "ICMP_ULE with ConstantInt not folded!");
+ if (Op0Max.ule(Op1Min)) // A <=u B -> true if max(A) <= min(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ if (Op0Min.ugt(Op1Max)) // A <=u B -> false if min(A) > max(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ break;
+ }
+
+ // Turn a signed comparison into an unsigned one if both operands are known to
+ // have the same sign.
+ if (I.isSigned() &&
+ ((Op0KnownZero.isNegative() && Op1KnownZero.isNegative()) ||
+ (Op0KnownOne.isNegative() && Op1KnownOne.isNegative())))
+ return new ICmpInst(I.getUnsignedPredicate(), Op0, Op1);
+
+ return nullptr;
+}
+
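As a rough illustration of how the range in the new foldICmpUsingKnownBits helper is derived (the struct name and the 8-bit width below are assumptions for the sketch, not part of the patch):

    // For an unsigned value, the known-one bits give a lower bound and the
    // complement of the known-zero bits gives an upper bound.
    #include <cstdint>

    struct KnownBits8 {
      uint8_t Zero; // bit set => that bit is known to be 0
      uint8_t One;  // bit set => that bit is known to be 1
    };

    void unsignedRange(KnownBits8 K, uint8_t &Min, uint8_t &Max) {
      Min = K.One;                          // unknown bits assumed 0
      Max = static_cast<uint8_t>(~K.Zero);  // unknown bits assumed 1
    }
    // Example: for (x & 4), Zero = 0xFB and One = 0x00, so Min = 0 and Max = 4.
    // Since Max < 8, the compare (x & 4) < 8 folds to true, as noted above.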
/// If we have an icmp le or icmp ge instruction with a constant operand, turn
/// it into the appropriate icmp lt or icmp gt instruction. This transform
/// allows them to be folded in visitICmpInst.
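A source-level view of this canonicalization (illustrative only, not taken from the patch):

    #include <cstdint>

    // Non-strict compares with a constant become strict compares with an
    // adjusted constant, so later folds only need to handle lt/gt forms.
    bool le5(int32_t x)  { return x <= 5; }  // canonicalized to x < 6
    bool ge8(uint32_t x) { return x >= 8u; } // canonicalized to x > 7
    // The rewrite is skipped when the constant is already the signed or
    // unsigned min/max, which is what the bail-out shown below checks.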
@@ -3131,6 +4166,7 @@ static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) {
if (isa<UndefValue>(Elt))
continue;
+
// Bail out if we can't determine if this constant is min/max or if we
// know that this constant is min/max.
auto *CI = dyn_cast<ConstantInt>(Elt);
@@ -3167,7 +4203,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
}
if (Value *V =
- SimplifyICmpInst(I.getPredicate(), Op0, Op1, DL, TLI, DT, AC, &I))
+ SimplifyICmpInst(I.getPredicate(), Op0, Op1, DL, &TLI, &DT, &AC, &I))
return replaceInstUsesWith(I, V);
// comparing -val or val with non-zero is the same as just comparing val
@@ -3202,28 +4238,28 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
case ICmpInst::ICMP_UGT:
std::swap(Op0, Op1); // Change icmp ugt -> icmp ult
- // FALL THROUGH
+ LLVM_FALLTHROUGH;
case ICmpInst::ICMP_ULT:{ // icmp ult i1 A, B -> ~A & B
Value *Not = Builder->CreateNot(Op0, I.getName() + "tmp");
return BinaryOperator::CreateAnd(Not, Op1);
}
case ICmpInst::ICMP_SGT:
std::swap(Op0, Op1); // Change icmp sgt -> icmp slt
- // FALL THROUGH
+ LLVM_FALLTHROUGH;
case ICmpInst::ICMP_SLT: { // icmp slt i1 A, B -> A & ~B
Value *Not = Builder->CreateNot(Op1, I.getName() + "tmp");
return BinaryOperator::CreateAnd(Not, Op0);
}
case ICmpInst::ICMP_UGE:
std::swap(Op0, Op1); // Change icmp uge -> icmp ule
- // FALL THROUGH
+ LLVM_FALLTHROUGH;
case ICmpInst::ICMP_ULE: { // icmp ule i1 A, B -> ~A | B
Value *Not = Builder->CreateNot(Op0, I.getName() + "tmp");
return BinaryOperator::CreateOr(Not, Op1);
}
case ICmpInst::ICMP_SGE:
std::swap(Op0, Op1); // Change icmp sge -> icmp sle
- // FALL THROUGH
+ LLVM_FALLTHROUGH;
case ICmpInst::ICMP_SLE: { // icmp sle i1 A, B -> A | ~B
Value *Not = Builder->CreateNot(Op1, I.getName() + "tmp");
return BinaryOperator::CreateOr(Not, Op0);
@@ -3234,372 +4270,11 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
if (ICmpInst *NewICmp = canonicalizeCmpWithConstant(I))
return NewICmp;
- unsigned BitWidth = 0;
- if (Ty->isIntOrIntVectorTy())
- BitWidth = Ty->getScalarSizeInBits();
- else // Get pointer size.
- BitWidth = DL.getTypeSizeInBits(Ty->getScalarType());
-
- bool isSignBit = false;
-
- // See if we are doing a comparison with a constant.
- if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
- Value *A = nullptr, *B = nullptr;
-
- // Match the following pattern, which is a common idiom when writing
- // overflow-safe integer arithmetic function. The source performs an
- // addition in wider type, and explicitly checks for overflow using
- // comparisons against INT_MIN and INT_MAX. Simplify this by using the
- // sadd_with_overflow intrinsic.
- //
- // TODO: This could probably be generalized to handle other overflow-safe
- // operations if we worked out the formulas to compute the appropriate
- // magic constants.
- //
- // sum = a + b
- // if (sum+128 >u 255) ... -> llvm.sadd.with.overflow.i8
- {
- ConstantInt *CI2; // I = icmp ugt (add (add A, B), CI2), CI
- if (I.getPredicate() == ICmpInst::ICMP_UGT &&
- match(Op0, m_Add(m_Add(m_Value(A), m_Value(B)), m_ConstantInt(CI2))))
- if (Instruction *Res = ProcessUGT_ADDCST_ADD(I, A, B, CI2, CI, *this))
- return Res;
- }
-
- // (icmp sgt smin(PosA, B) 0) -> (icmp sgt B 0)
- if (CI->isZero() && I.getPredicate() == ICmpInst::ICMP_SGT)
- if (auto *SI = dyn_cast<SelectInst>(Op0)) {
- SelectPatternResult SPR = matchSelectPattern(SI, A, B);
- if (SPR.Flavor == SPF_SMIN) {
- if (isKnownPositive(A, DL))
- return new ICmpInst(I.getPredicate(), B, CI);
- if (isKnownPositive(B, DL))
- return new ICmpInst(I.getPredicate(), A, CI);
- }
- }
-
-
- // The following transforms are only 'worth it' if the only user of the
- // subtraction is the icmp.
- if (Op0->hasOneUse()) {
- // (icmp ne/eq (sub A B) 0) -> (icmp ne/eq A, B)
- if (I.isEquality() && CI->isZero() &&
- match(Op0, m_Sub(m_Value(A), m_Value(B))))
- return new ICmpInst(I.getPredicate(), A, B);
-
- // (icmp sgt (sub nsw A B), -1) -> (icmp sge A, B)
- if (I.getPredicate() == ICmpInst::ICMP_SGT && CI->isAllOnesValue() &&
- match(Op0, m_NSWSub(m_Value(A), m_Value(B))))
- return new ICmpInst(ICmpInst::ICMP_SGE, A, B);
-
- // (icmp sgt (sub nsw A B), 0) -> (icmp sgt A, B)
- if (I.getPredicate() == ICmpInst::ICMP_SGT && CI->isZero() &&
- match(Op0, m_NSWSub(m_Value(A), m_Value(B))))
- return new ICmpInst(ICmpInst::ICMP_SGT, A, B);
-
- // (icmp slt (sub nsw A B), 0) -> (icmp slt A, B)
- if (I.getPredicate() == ICmpInst::ICMP_SLT && CI->isZero() &&
- match(Op0, m_NSWSub(m_Value(A), m_Value(B))))
- return new ICmpInst(ICmpInst::ICMP_SLT, A, B);
-
- // (icmp slt (sub nsw A B), 1) -> (icmp sle A, B)
- if (I.getPredicate() == ICmpInst::ICMP_SLT && CI->isOne() &&
- match(Op0, m_NSWSub(m_Value(A), m_Value(B))))
- return new ICmpInst(ICmpInst::ICMP_SLE, A, B);
- }
-
- if (I.isEquality()) {
- ConstantInt *CI2;
- if (match(Op0, m_AShr(m_ConstantInt(CI2), m_Value(A))) ||
- match(Op0, m_LShr(m_ConstantInt(CI2), m_Value(A)))) {
- // (icmp eq/ne (ashr/lshr const2, A), const1)
- if (Instruction *Inst = FoldICmpCstShrCst(I, Op0, A, CI, CI2))
- return Inst;
- }
- if (match(Op0, m_Shl(m_ConstantInt(CI2), m_Value(A)))) {
- // (icmp eq/ne (shl const2, A), const1)
- if (Instruction *Inst = FoldICmpCstShlCst(I, Op0, A, CI, CI2))
- return Inst;
- }
- }
-
- // If this comparison is a normal comparison, it demands all
- // bits, if it is a sign bit comparison, it only demands the sign bit.
- bool UnusedBit;
- isSignBit = isSignBitCheck(I.getPredicate(), CI, UnusedBit);
-
- // Canonicalize icmp instructions based on dominating conditions.
- BasicBlock *Parent = I.getParent();
- BasicBlock *Dom = Parent->getSinglePredecessor();
- auto *BI = Dom ? dyn_cast<BranchInst>(Dom->getTerminator()) : nullptr;
- ICmpInst::Predicate Pred;
- BasicBlock *TrueBB, *FalseBB;
- ConstantInt *CI2;
- if (BI && match(BI, m_Br(m_ICmp(Pred, m_Specific(Op0), m_ConstantInt(CI2)),
- TrueBB, FalseBB)) &&
- TrueBB != FalseBB) {
- ConstantRange CR = ConstantRange::makeAllowedICmpRegion(I.getPredicate(),
- CI->getValue());
- ConstantRange DominatingCR =
- (Parent == TrueBB)
- ? ConstantRange::makeExactICmpRegion(Pred, CI2->getValue())
- : ConstantRange::makeExactICmpRegion(
- CmpInst::getInversePredicate(Pred), CI2->getValue());
- ConstantRange Intersection = DominatingCR.intersectWith(CR);
- ConstantRange Difference = DominatingCR.difference(CR);
- if (Intersection.isEmptySet())
- return replaceInstUsesWith(I, Builder->getFalse());
- if (Difference.isEmptySet())
- return replaceInstUsesWith(I, Builder->getTrue());
- // Canonicalizing a sign bit comparison that gets used in a branch,
- // pessimizes codegen by generating branch on zero instruction instead
- // of a test and branch. So we avoid canonicalizing in such situations
- // because test and branch instruction has better branch displacement
- // than compare and branch instruction.
- if (!isBranchOnSignBitCheck(I, isSignBit) && !I.isEquality()) {
- if (auto *AI = Intersection.getSingleElement())
- return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Builder->getInt(*AI));
- if (auto *AD = Difference.getSingleElement())
- return new ICmpInst(ICmpInst::ICMP_NE, Op0, Builder->getInt(*AD));
- }
- }
- }
-
- // See if we can fold the comparison based on range information we can get
- // by checking whether bits are known to be zero or one in the input.
- if (BitWidth != 0) {
- APInt Op0KnownZero(BitWidth, 0), Op0KnownOne(BitWidth, 0);
- APInt Op1KnownZero(BitWidth, 0), Op1KnownOne(BitWidth, 0);
-
- if (SimplifyDemandedBits(I.getOperandUse(0),
- DemandedBitsLHSMask(I, BitWidth, isSignBit),
- Op0KnownZero, Op0KnownOne, 0))
- return &I;
- if (SimplifyDemandedBits(I.getOperandUse(1),
- APInt::getAllOnesValue(BitWidth), Op1KnownZero,
- Op1KnownOne, 0))
- return &I;
-
- // Given the known and unknown bits, compute a range that the LHS could be
- // in. Compute the Min, Max and RHS values based on the known bits. For the
- // EQ and NE we use unsigned values.
- APInt Op0Min(BitWidth, 0), Op0Max(BitWidth, 0);
- APInt Op1Min(BitWidth, 0), Op1Max(BitWidth, 0);
- if (I.isSigned()) {
- ComputeSignedMinMaxValuesFromKnownBits(Op0KnownZero, Op0KnownOne,
- Op0Min, Op0Max);
- ComputeSignedMinMaxValuesFromKnownBits(Op1KnownZero, Op1KnownOne,
- Op1Min, Op1Max);
- } else {
- ComputeUnsignedMinMaxValuesFromKnownBits(Op0KnownZero, Op0KnownOne,
- Op0Min, Op0Max);
- ComputeUnsignedMinMaxValuesFromKnownBits(Op1KnownZero, Op1KnownOne,
- Op1Min, Op1Max);
- }
-
- // If Min and Max are known to be the same, then SimplifyDemandedBits
- // figured out that the LHS is a constant. Just constant fold this now so
- // that code below can assume that Min != Max.
- if (!isa<Constant>(Op0) && Op0Min == Op0Max)
- return new ICmpInst(I.getPredicate(),
- ConstantInt::get(Op0->getType(), Op0Min), Op1);
- if (!isa<Constant>(Op1) && Op1Min == Op1Max)
- return new ICmpInst(I.getPredicate(), Op0,
- ConstantInt::get(Op1->getType(), Op1Min));
-
- // Based on the range information we know about the LHS, see if we can
- // simplify this comparison. For example, (x&4) < 8 is always true.
- switch (I.getPredicate()) {
- default: llvm_unreachable("Unknown icmp opcode!");
- case ICmpInst::ICMP_EQ: {
- if (Op0Max.ult(Op1Min) || Op0Min.ugt(Op1Max))
- return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
-
- // If all bits are known zero except for one, then we know at most one
- // bit is set. If the comparison is against zero, then this is a check
- // to see if *that* bit is set.
- APInt Op0KnownZeroInverted = ~Op0KnownZero;
- if (~Op1KnownZero == 0) {
- // If the LHS is an AND with the same constant, look through it.
- Value *LHS = nullptr;
- ConstantInt *LHSC = nullptr;
- if (!match(Op0, m_And(m_Value(LHS), m_ConstantInt(LHSC))) ||
- LHSC->getValue() != Op0KnownZeroInverted)
- LHS = Op0;
-
- // If the LHS is 1 << x, and we know the result is a power of 2 like 8,
- // then turn "((1 << x)&8) == 0" into "x != 3".
- // or turn "((1 << x)&7) == 0" into "x > 2".
- Value *X = nullptr;
- if (match(LHS, m_Shl(m_One(), m_Value(X)))) {
- APInt ValToCheck = Op0KnownZeroInverted;
- if (ValToCheck.isPowerOf2()) {
- unsigned CmpVal = ValToCheck.countTrailingZeros();
- return new ICmpInst(ICmpInst::ICMP_NE, X,
- ConstantInt::get(X->getType(), CmpVal));
- } else if ((++ValToCheck).isPowerOf2()) {
- unsigned CmpVal = ValToCheck.countTrailingZeros() - 1;
- return new ICmpInst(ICmpInst::ICMP_UGT, X,
- ConstantInt::get(X->getType(), CmpVal));
- }
- }
-
- // If the LHS is 8 >>u x, and we know the result is a power of 2 like 1,
- // then turn "((8 >>u x)&1) == 0" into "x != 3".
- const APInt *CI;
- if (Op0KnownZeroInverted == 1 &&
- match(LHS, m_LShr(m_Power2(CI), m_Value(X))))
- return new ICmpInst(ICmpInst::ICMP_NE, X,
- ConstantInt::get(X->getType(),
- CI->countTrailingZeros()));
- }
- break;
- }
- case ICmpInst::ICMP_NE: {
- if (Op0Max.ult(Op1Min) || Op0Min.ugt(Op1Max))
- return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
-
- // If all bits are known zero except for one, then we know at most one
- // bit is set. If the comparison is against zero, then this is a check
- // to see if *that* bit is set.
- APInt Op0KnownZeroInverted = ~Op0KnownZero;
- if (~Op1KnownZero == 0) {
- // If the LHS is an AND with the same constant, look through it.
- Value *LHS = nullptr;
- ConstantInt *LHSC = nullptr;
- if (!match(Op0, m_And(m_Value(LHS), m_ConstantInt(LHSC))) ||
- LHSC->getValue() != Op0KnownZeroInverted)
- LHS = Op0;
-
- // If the LHS is 1 << x, and we know the result is a power of 2 like 8,
- // then turn "((1 << x)&8) != 0" into "x == 3".
- // or turn "((1 << x)&7) != 0" into "x < 3".
- Value *X = nullptr;
- if (match(LHS, m_Shl(m_One(), m_Value(X)))) {
- APInt ValToCheck = Op0KnownZeroInverted;
- if (ValToCheck.isPowerOf2()) {
- unsigned CmpVal = ValToCheck.countTrailingZeros();
- return new ICmpInst(ICmpInst::ICMP_EQ, X,
- ConstantInt::get(X->getType(), CmpVal));
- } else if ((++ValToCheck).isPowerOf2()) {
- unsigned CmpVal = ValToCheck.countTrailingZeros();
- return new ICmpInst(ICmpInst::ICMP_ULT, X,
- ConstantInt::get(X->getType(), CmpVal));
- }
- }
-
- // If the LHS is 8 >>u x, and we know the result is a power of 2 like 1,
- // then turn "((8 >>u x)&1) != 0" into "x == 3".
- const APInt *CI;
- if (Op0KnownZeroInverted == 1 &&
- match(LHS, m_LShr(m_Power2(CI), m_Value(X))))
- return new ICmpInst(ICmpInst::ICMP_EQ, X,
- ConstantInt::get(X->getType(),
- CI->countTrailingZeros()));
- }
- break;
- }
- case ICmpInst::ICMP_ULT:
- if (Op0Max.ult(Op1Min)) // A <u B -> true if max(A) < min(B)
- return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
- if (Op0Min.uge(Op1Max)) // A <u B -> false if min(A) >= max(B)
- return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
- if (Op1Min == Op0Max) // A <u B -> A != B if max(A) == min(B)
- return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
- if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
- if (Op1Max == Op0Min+1) // A <u C -> A == C-1 if min(A)+1 == C
- return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
- Builder->getInt(CI->getValue()-1));
-
- // (x <u 2147483648) -> (x >s -1) -> true if sign bit clear
- if (CI->isMinValue(true))
- return new ICmpInst(ICmpInst::ICMP_SGT, Op0,
- Constant::getAllOnesValue(Op0->getType()));
- }
- break;
- case ICmpInst::ICMP_UGT:
- if (Op0Min.ugt(Op1Max)) // A >u B -> true if min(A) > max(B)
- return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
- if (Op0Max.ule(Op1Min)) // A >u B -> false if max(A) <= max(B)
- return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
-
- if (Op1Max == Op0Min) // A >u B -> A != B if min(A) == max(B)
- return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
- if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
- if (Op1Min == Op0Max-1) // A >u C -> A == C+1 if max(a)-1 == C
- return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
- Builder->getInt(CI->getValue()+1));
-
- // (x >u 2147483647) -> (x <s 0) -> true if sign bit set
- if (CI->isMaxValue(true))
- return new ICmpInst(ICmpInst::ICMP_SLT, Op0,
- Constant::getNullValue(Op0->getType()));
- }
- break;
- case ICmpInst::ICMP_SLT:
- if (Op0Max.slt(Op1Min)) // A <s B -> true if max(A) < min(C)
- return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
- if (Op0Min.sge(Op1Max)) // A <s B -> false if min(A) >= max(C)
- return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
- if (Op1Min == Op0Max) // A <s B -> A != B if max(A) == min(B)
- return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
- if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
- if (Op1Max == Op0Min+1) // A <s C -> A == C-1 if min(A)+1 == C
- return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
- Builder->getInt(CI->getValue()-1));
- }
- break;
- case ICmpInst::ICMP_SGT:
- if (Op0Min.sgt(Op1Max)) // A >s B -> true if min(A) > max(B)
- return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
- if (Op0Max.sle(Op1Min)) // A >s B -> false if max(A) <= min(B)
- return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
-
- if (Op1Max == Op0Min) // A >s B -> A != B if min(A) == max(B)
- return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
- if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
- if (Op1Min == Op0Max-1) // A >s C -> A == C+1 if max(A)-1 == C
- return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
- Builder->getInt(CI->getValue()+1));
- }
- break;
- case ICmpInst::ICMP_SGE:
- assert(!isa<ConstantInt>(Op1) && "ICMP_SGE with ConstantInt not folded!");
- if (Op0Min.sge(Op1Max)) // A >=s B -> true if min(A) >= max(B)
- return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
- if (Op0Max.slt(Op1Min)) // A >=s B -> false if max(A) < min(B)
- return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
- break;
- case ICmpInst::ICMP_SLE:
- assert(!isa<ConstantInt>(Op1) && "ICMP_SLE with ConstantInt not folded!");
- if (Op0Max.sle(Op1Min)) // A <=s B -> true if max(A) <= min(B)
- return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
- if (Op0Min.sgt(Op1Max)) // A <=s B -> false if min(A) > max(B)
- return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
- break;
- case ICmpInst::ICMP_UGE:
- assert(!isa<ConstantInt>(Op1) && "ICMP_UGE with ConstantInt not folded!");
- if (Op0Min.uge(Op1Max)) // A >=u B -> true if min(A) >= max(B)
- return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
- if (Op0Max.ult(Op1Min)) // A >=u B -> false if max(A) < min(B)
- return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
- break;
- case ICmpInst::ICMP_ULE:
- assert(!isa<ConstantInt>(Op1) && "ICMP_ULE with ConstantInt not folded!");
- if (Op0Max.ule(Op1Min)) // A <=u B -> true if max(A) <= min(B)
- return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
- if (Op0Min.ugt(Op1Max)) // A <=u B -> false if min(A) > max(B)
- return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
- break;
- }
+ if (Instruction *Res = foldICmpWithConstant(I))
+ return Res;
- // Turn a signed comparison into an unsigned one if both operands
- // are known to have the same sign.
- if (I.isSigned() &&
- ((Op0KnownZero.isNegative() && Op1KnownZero.isNegative()) ||
- (Op0KnownOne.isNegative() && Op1KnownOne.isNegative())))
- return new ICmpInst(I.getUnsignedPredicate(), Op0, Op1);
- }
+ if (Instruction *Res = foldICmpUsingKnownBits(I))
+ return Res;
// Test if the ICmpInst instruction is used exclusively by a select as
// part of a minimum or maximum operation. If so, refrain from doing
@@ -3614,122 +4289,39 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
(SI->getOperand(2) == Op0 && SI->getOperand(1) == Op1))
return nullptr;
- // See if we are doing a comparison between a constant and an instruction that
- // can be folded into the comparison.
- if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
- Value *A = nullptr, *B = nullptr;
- // Since the RHS is a ConstantInt (CI), if the left hand side is an
- // instruction, see if that instruction also has constants so that the
- // instruction can be folded into the icmp
- if (Instruction *LHSI = dyn_cast<Instruction>(Op0))
- if (Instruction *Res = visitICmpInstWithInstAndIntCst(I, LHSI, CI))
- return Res;
+ // FIXME: We only do this after checking for min/max to prevent infinite
+ // looping caused by a reverse canonicalization of these patterns for min/max.
+ // FIXME: The organization of folds is a mess. These would naturally go into
+ // canonicalizeCmpWithConstant(), but we can't move all of the above folds
+ // down here after the min/max restriction.
+ ICmpInst::Predicate Pred = I.getPredicate();
+ const APInt *C;
+ if (match(Op1, m_APInt(C))) {
+ // For i32: x >u 2147483647 -> x <s 0 -> true if sign bit set
+ if (Pred == ICmpInst::ICMP_UGT && C->isMaxSignedValue()) {
+ Constant *Zero = Constant::getNullValue(Op0->getType());
+ return new ICmpInst(ICmpInst::ICMP_SLT, Op0, Zero);
+ }
- // (icmp eq/ne (udiv A, B), 0) -> (icmp ugt/ule i32 B, A)
- if (I.isEquality() && CI->isZero() &&
- match(Op0, m_UDiv(m_Value(A), m_Value(B)))) {
- ICmpInst::Predicate Pred = I.getPredicate() == ICmpInst::ICMP_EQ
- ? ICmpInst::ICMP_UGT
- : ICmpInst::ICMP_ULE;
- return new ICmpInst(Pred, B, A);
+ // For i32: x <u 2147483648 -> x >s -1 -> true if sign bit clear
+ if (Pred == ICmpInst::ICMP_ULT && C->isMinSignedValue()) {
+ Constant *AllOnes = Constant::getAllOnesValue(Op0->getType());
+ return new ICmpInst(ICmpInst::ICMP_SGT, Op0, AllOnes);
}
}
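The two folds added just above correspond, at the source level, to the following shapes (a hypothetical sketch; i32 is used only as an example width):

    #include <cstdint>

    bool topBitSet(uint32_t x)   { return x > 2147483647u; } // becomes (int32_t)x < 0
    bool topBitClear(uint32_t x) { return x < 2147483648u; } // becomes (int32_t)x > -1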
- // Handle icmp with constant (but not simple integer constant) RHS
- if (Constant *RHSC = dyn_cast<Constant>(Op1)) {
- if (Instruction *LHSI = dyn_cast<Instruction>(Op0))
- switch (LHSI->getOpcode()) {
- case Instruction::GetElementPtr:
- // icmp pred GEP (P, int 0, int 0, int 0), null -> icmp pred P, null
- if (RHSC->isNullValue() &&
- cast<GetElementPtrInst>(LHSI)->hasAllZeroIndices())
- return new ICmpInst(I.getPredicate(), LHSI->getOperand(0),
- Constant::getNullValue(LHSI->getOperand(0)->getType()));
- break;
- case Instruction::PHI:
- // Only fold icmp into the PHI if the phi and icmp are in the same
- // block. If in the same block, we're encouraging jump threading. If
- // not, we are just pessimizing the code by making an i1 phi.
- if (LHSI->getParent() == I.getParent())
- if (Instruction *NV = FoldOpIntoPhi(I))
- return NV;
- break;
- case Instruction::Select: {
- // If either operand of the select is a constant, we can fold the
- // comparison into the select arms, which will cause one to be
- // constant folded and the select turned into a bitwise or.
- Value *Op1 = nullptr, *Op2 = nullptr;
- ConstantInt *CI = nullptr;
- if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1))) {
- Op1 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);
- CI = dyn_cast<ConstantInt>(Op1);
- }
- if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(2))) {
- Op2 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);
- CI = dyn_cast<ConstantInt>(Op2);
- }
-
- // We only want to perform this transformation if it will not lead to
- // additional code. This is true if either both sides of the select
- // fold to a constant (in which case the icmp is replaced with a select
- // which will usually simplify) or this is the only user of the
- // select (in which case we are trading a select+icmp for a simpler
- // select+icmp) or all uses of the select can be replaced based on
- // dominance information ("Global cases").
- bool Transform = false;
- if (Op1 && Op2)
- Transform = true;
- else if (Op1 || Op2) {
- // Local case
- if (LHSI->hasOneUse())
- Transform = true;
- // Global cases
- else if (CI && !CI->isZero())
- // When Op1 is constant try replacing select with second operand.
- // Otherwise Op2 is constant and try replacing select with first
- // operand.
- Transform = replacedSelectWithOperand(cast<SelectInst>(LHSI), &I,
- Op1 ? 2 : 1);
- }
- if (Transform) {
- if (!Op1)
- Op1 = Builder->CreateICmp(I.getPredicate(), LHSI->getOperand(1),
- RHSC, I.getName());
- if (!Op2)
- Op2 = Builder->CreateICmp(I.getPredicate(), LHSI->getOperand(2),
- RHSC, I.getName());
- return SelectInst::Create(LHSI->getOperand(0), Op1, Op2);
- }
- break;
- }
- case Instruction::IntToPtr:
- // icmp pred inttoptr(X), null -> icmp pred X, 0
- if (RHSC->isNullValue() &&
- DL.getIntPtrType(RHSC->getType()) == LHSI->getOperand(0)->getType())
- return new ICmpInst(I.getPredicate(), LHSI->getOperand(0),
- Constant::getNullValue(LHSI->getOperand(0)->getType()));
- break;
+ if (Instruction *Res = foldICmpInstWithConstant(I))
+ return Res;
- case Instruction::Load:
- // Try to optimize things like "A[i] > 4" to index computations.
- if (GetElementPtrInst *GEP =
- dyn_cast<GetElementPtrInst>(LHSI->getOperand(0))) {
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
- if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
- !cast<LoadInst>(LHSI)->isVolatile())
- if (Instruction *Res = FoldCmpLoadFromIndexedGlobal(GEP, GV, I))
- return Res;
- }
- break;
- }
- }
+ if (Instruction *Res = foldICmpInstWithConstantNotInt(I))
+ return Res;
// If we can optimize a 'icmp GEP, P' or 'icmp P, GEP', do so now.
if (GEPOperator *GEP = dyn_cast<GEPOperator>(Op0))
- if (Instruction *NI = FoldGEPICmp(GEP, Op1, I.getPredicate(), I))
+ if (Instruction *NI = foldGEPICmp(GEP, Op1, I.getPredicate(), I))
return NI;
if (GEPOperator *GEP = dyn_cast<GEPOperator>(Op1))
- if (Instruction *NI = FoldGEPICmp(GEP, Op0,
+ if (Instruction *NI = foldGEPICmp(GEP, Op0,
ICmpInst::getSwappedPredicate(I.getPredicate()), I))
return NI;
@@ -3737,10 +4329,10 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
if (Op0->getType()->isPointerTy() && I.isEquality()) {
assert(Op1->getType()->isPointerTy() && "Comparing pointer with non-pointer?");
if (auto *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Op0, DL)))
- if (Instruction *New = FoldAllocaCmp(I, Alloca, Op1))
+ if (Instruction *New = foldAllocaCmp(I, Alloca, Op1))
return New;
if (auto *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Op1, DL)))
- if (Instruction *New = FoldAllocaCmp(I, Alloca, Op0))
+ if (Instruction *New = foldAllocaCmp(I, Alloca, Op0))
return New;
}
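For context, a hedged sketch of the source pattern behind foldAllocaCmp (whose body is not shown in this hunk); the example assumes the local's address does not escape:

    // Comparing the address of a non-escaping local against an unrelated
    // pointer can be folded to a constant, since the alloca cannot alias it.
    bool neverEqual(int *p) {
      int local;
      return &local == p; // folds to false when 'local' does not escape
    }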
@@ -3780,318 +4372,24 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
// For generality, we handle any zero-extension of any operand comparison
// with a constant or another cast from the same type.
if (isa<Constant>(Op1) || isa<CastInst>(Op1))
- if (Instruction *R = visitICmpInstWithCastAndCast(I))
+ if (Instruction *R = foldICmpWithCastAndCast(I))
return R;
}
- // Special logic for binary operators.
- BinaryOperator *BO0 = dyn_cast<BinaryOperator>(Op0);
- BinaryOperator *BO1 = dyn_cast<BinaryOperator>(Op1);
- if (BO0 || BO1) {
- CmpInst::Predicate Pred = I.getPredicate();
- bool NoOp0WrapProblem = false, NoOp1WrapProblem = false;
- if (BO0 && isa<OverflowingBinaryOperator>(BO0))
- NoOp0WrapProblem = ICmpInst::isEquality(Pred) ||
- (CmpInst::isUnsigned(Pred) && BO0->hasNoUnsignedWrap()) ||
- (CmpInst::isSigned(Pred) && BO0->hasNoSignedWrap());
- if (BO1 && isa<OverflowingBinaryOperator>(BO1))
- NoOp1WrapProblem = ICmpInst::isEquality(Pred) ||
- (CmpInst::isUnsigned(Pred) && BO1->hasNoUnsignedWrap()) ||
- (CmpInst::isSigned(Pred) && BO1->hasNoSignedWrap());
-
- // Analyze the case when either Op0 or Op1 is an add instruction.
- // Op0 = A + B (or A and B are null); Op1 = C + D (or C and D are null).
- Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr;
- if (BO0 && BO0->getOpcode() == Instruction::Add) {
- A = BO0->getOperand(0);
- B = BO0->getOperand(1);
- }
- if (BO1 && BO1->getOpcode() == Instruction::Add) {
- C = BO1->getOperand(0);
- D = BO1->getOperand(1);
- }
-
- // icmp (X+cst) < 0 --> X < -cst
- if (NoOp0WrapProblem && ICmpInst::isSigned(Pred) && match(Op1, m_Zero()))
- if (ConstantInt *RHSC = dyn_cast_or_null<ConstantInt>(B))
- if (!RHSC->isMinValue(/*isSigned=*/true))
- return new ICmpInst(Pred, A, ConstantExpr::getNeg(RHSC));
-
- // icmp (X+Y), X -> icmp Y, 0 for equalities or if there is no overflow.
- if ((A == Op1 || B == Op1) && NoOp0WrapProblem)
- return new ICmpInst(Pred, A == Op1 ? B : A,
- Constant::getNullValue(Op1->getType()));
-
- // icmp X, (X+Y) -> icmp 0, Y for equalities or if there is no overflow.
- if ((C == Op0 || D == Op0) && NoOp1WrapProblem)
- return new ICmpInst(Pred, Constant::getNullValue(Op0->getType()),
- C == Op0 ? D : C);
-
- // icmp (X+Y), (X+Z) -> icmp Y, Z for equalities or if there is no overflow.
- if (A && C && (A == C || A == D || B == C || B == D) &&
- NoOp0WrapProblem && NoOp1WrapProblem &&
- // Try not to increase register pressure.
- BO0->hasOneUse() && BO1->hasOneUse()) {
- // Determine Y and Z in the form icmp (X+Y), (X+Z).
- Value *Y, *Z;
- if (A == C) {
- // C + B == C + D -> B == D
- Y = B;
- Z = D;
- } else if (A == D) {
- // D + B == C + D -> B == C
- Y = B;
- Z = C;
- } else if (B == C) {
- // A + C == C + D -> A == D
- Y = A;
- Z = D;
- } else {
- assert(B == D);
- // A + D == C + D -> A == C
- Y = A;
- Z = C;
- }
- return new ICmpInst(Pred, Y, Z);
- }
-
- // icmp slt (X + -1), Y -> icmp sle X, Y
- if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SLT &&
- match(B, m_AllOnes()))
- return new ICmpInst(CmpInst::ICMP_SLE, A, Op1);
-
- // icmp sge (X + -1), Y -> icmp sgt X, Y
- if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SGE &&
- match(B, m_AllOnes()))
- return new ICmpInst(CmpInst::ICMP_SGT, A, Op1);
-
- // icmp sle (X + 1), Y -> icmp slt X, Y
- if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SLE &&
- match(B, m_One()))
- return new ICmpInst(CmpInst::ICMP_SLT, A, Op1);
-
- // icmp sgt (X + 1), Y -> icmp sge X, Y
- if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SGT &&
- match(B, m_One()))
- return new ICmpInst(CmpInst::ICMP_SGE, A, Op1);
-
- // icmp sgt X, (Y + -1) -> icmp sge X, Y
- if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGT &&
- match(D, m_AllOnes()))
- return new ICmpInst(CmpInst::ICMP_SGE, Op0, C);
-
- // icmp sle X, (Y + -1) -> icmp slt X, Y
- if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLE &&
- match(D, m_AllOnes()))
- return new ICmpInst(CmpInst::ICMP_SLT, Op0, C);
-
- // icmp sge X, (Y + 1) -> icmp sgt X, Y
- if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGE &&
- match(D, m_One()))
- return new ICmpInst(CmpInst::ICMP_SGT, Op0, C);
-
- // icmp slt X, (Y + 1) -> icmp sle X, Y
- if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLT &&
- match(D, m_One()))
- return new ICmpInst(CmpInst::ICMP_SLE, Op0, C);
-
- // if C1 has greater magnitude than C2:
- // icmp (X + C1), (Y + C2) -> icmp (X + C3), Y
- // s.t. C3 = C1 - C2
- //
- // if C2 has greater magnitude than C1:
- // icmp (X + C1), (Y + C2) -> icmp X, (Y + C3)
- // s.t. C3 = C2 - C1
- if (A && C && NoOp0WrapProblem && NoOp1WrapProblem &&
- (BO0->hasOneUse() || BO1->hasOneUse()) && !I.isUnsigned())
- if (ConstantInt *C1 = dyn_cast<ConstantInt>(B))
- if (ConstantInt *C2 = dyn_cast<ConstantInt>(D)) {
- const APInt &AP1 = C1->getValue();
- const APInt &AP2 = C2->getValue();
- if (AP1.isNegative() == AP2.isNegative()) {
- APInt AP1Abs = C1->getValue().abs();
- APInt AP2Abs = C2->getValue().abs();
- if (AP1Abs.uge(AP2Abs)) {
- ConstantInt *C3 = Builder->getInt(AP1 - AP2);
- Value *NewAdd = Builder->CreateNSWAdd(A, C3);
- return new ICmpInst(Pred, NewAdd, C);
- } else {
- ConstantInt *C3 = Builder->getInt(AP2 - AP1);
- Value *NewAdd = Builder->CreateNSWAdd(C, C3);
- return new ICmpInst(Pred, A, NewAdd);
- }
- }
- }
-
-
- // Analyze the case when either Op0 or Op1 is a sub instruction.
- // Op0 = A - B (or A and B are null); Op1 = C - D (or C and D are null).
- A = nullptr;
- B = nullptr;
- C = nullptr;
- D = nullptr;
- if (BO0 && BO0->getOpcode() == Instruction::Sub) {
- A = BO0->getOperand(0);
- B = BO0->getOperand(1);
- }
- if (BO1 && BO1->getOpcode() == Instruction::Sub) {
- C = BO1->getOperand(0);
- D = BO1->getOperand(1);
- }
-
- // icmp (X-Y), X -> icmp 0, Y for equalities or if there is no overflow.
- if (A == Op1 && NoOp0WrapProblem)
- return new ICmpInst(Pred, Constant::getNullValue(Op1->getType()), B);
-
- // icmp X, (X-Y) -> icmp Y, 0 for equalities or if there is no overflow.
- if (C == Op0 && NoOp1WrapProblem)
- return new ICmpInst(Pred, D, Constant::getNullValue(Op0->getType()));
-
- // icmp (Y-X), (Z-X) -> icmp Y, Z for equalities or if there is no overflow.
- if (B && D && B == D && NoOp0WrapProblem && NoOp1WrapProblem &&
- // Try not to increase register pressure.
- BO0->hasOneUse() && BO1->hasOneUse())
- return new ICmpInst(Pred, A, C);
-
- // icmp (X-Y), (X-Z) -> icmp Z, Y for equalities or if there is no overflow.
- if (A && C && A == C && NoOp0WrapProblem && NoOp1WrapProblem &&
- // Try not to increase register pressure.
- BO0->hasOneUse() && BO1->hasOneUse())
- return new ICmpInst(Pred, D, B);
-
- // icmp (0-X) < cst --> x > -cst
- if (NoOp0WrapProblem && ICmpInst::isSigned(Pred)) {
- Value *X;
- if (match(BO0, m_Neg(m_Value(X))))
- if (ConstantInt *RHSC = dyn_cast<ConstantInt>(Op1))
- if (!RHSC->isMinValue(/*isSigned=*/true))
- return new ICmpInst(I.getSwappedPredicate(), X,
- ConstantExpr::getNeg(RHSC));
- }
-
- BinaryOperator *SRem = nullptr;
- // icmp (srem X, Y), Y
- if (BO0 && BO0->getOpcode() == Instruction::SRem &&
- Op1 == BO0->getOperand(1))
- SRem = BO0;
- // icmp Y, (srem X, Y)
- else if (BO1 && BO1->getOpcode() == Instruction::SRem &&
- Op0 == BO1->getOperand(1))
- SRem = BO1;
- if (SRem) {
- // We don't check hasOneUse to avoid increasing register pressure because
- // the value we use is the same value this instruction was already using.
- switch (SRem == BO0 ? ICmpInst::getSwappedPredicate(Pred) : Pred) {
- default: break;
- case ICmpInst::ICMP_EQ:
- return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
- case ICmpInst::ICMP_NE:
- return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
- case ICmpInst::ICMP_SGT:
- case ICmpInst::ICMP_SGE:
- return new ICmpInst(ICmpInst::ICMP_SGT, SRem->getOperand(1),
- Constant::getAllOnesValue(SRem->getType()));
- case ICmpInst::ICMP_SLT:
- case ICmpInst::ICMP_SLE:
- return new ICmpInst(ICmpInst::ICMP_SLT, SRem->getOperand(1),
- Constant::getNullValue(SRem->getType()));
- }
- }
-
- if (BO0 && BO1 && BO0->getOpcode() == BO1->getOpcode() &&
- BO0->hasOneUse() && BO1->hasOneUse() &&
- BO0->getOperand(1) == BO1->getOperand(1)) {
- switch (BO0->getOpcode()) {
- default: break;
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Xor:
- if (I.isEquality()) // a+x icmp eq/ne b+x --> a icmp b
- return new ICmpInst(I.getPredicate(), BO0->getOperand(0),
- BO1->getOperand(0));
- // icmp u/s (a ^ signbit), (b ^ signbit) --> icmp s/u a, b
- if (ConstantInt *CI = dyn_cast<ConstantInt>(BO0->getOperand(1))) {
- if (CI->getValue().isSignBit()) {
- ICmpInst::Predicate Pred = I.isSigned()
- ? I.getUnsignedPredicate()
- : I.getSignedPredicate();
- return new ICmpInst(Pred, BO0->getOperand(0),
- BO1->getOperand(0));
- }
-
- if (BO0->getOpcode() == Instruction::Xor && CI->isMaxValue(true)) {
- ICmpInst::Predicate Pred = I.isSigned()
- ? I.getUnsignedPredicate()
- : I.getSignedPredicate();
- Pred = I.getSwappedPredicate(Pred);
- return new ICmpInst(Pred, BO0->getOperand(0),
- BO1->getOperand(0));
- }
- }
- break;
- case Instruction::Mul:
- if (!I.isEquality())
- break;
-
- if (ConstantInt *CI = dyn_cast<ConstantInt>(BO0->getOperand(1))) {
- // a * Cst icmp eq/ne b * Cst --> a & Mask icmp b & Mask
- // Mask = -1 >> count-trailing-zeros(Cst).
- if (!CI->isZero() && !CI->isOne()) {
- const APInt &AP = CI->getValue();
- ConstantInt *Mask = ConstantInt::get(I.getContext(),
- APInt::getLowBitsSet(AP.getBitWidth(),
- AP.getBitWidth() -
- AP.countTrailingZeros()));
- Value *And1 = Builder->CreateAnd(BO0->getOperand(0), Mask);
- Value *And2 = Builder->CreateAnd(BO1->getOperand(0), Mask);
- return new ICmpInst(I.getPredicate(), And1, And2);
- }
- }
- break;
- case Instruction::UDiv:
- case Instruction::LShr:
- if (I.isSigned())
- break;
- // fall-through
- case Instruction::SDiv:
- case Instruction::AShr:
- if (!BO0->isExact() || !BO1->isExact())
- break;
- return new ICmpInst(I.getPredicate(), BO0->getOperand(0),
- BO1->getOperand(0));
- case Instruction::Shl: {
- bool NUW = BO0->hasNoUnsignedWrap() && BO1->hasNoUnsignedWrap();
- bool NSW = BO0->hasNoSignedWrap() && BO1->hasNoSignedWrap();
- if (!NUW && !NSW)
- break;
- if (!NSW && I.isSigned())
- break;
- return new ICmpInst(I.getPredicate(), BO0->getOperand(0),
- BO1->getOperand(0));
- }
- }
- }
-
- if (BO0) {
- // Transform A & (L - 1) `ult` L --> L != 0
- auto LSubOne = m_Add(m_Specific(Op1), m_AllOnes());
- auto BitwiseAnd =
- m_CombineOr(m_And(m_Value(), LSubOne), m_And(LSubOne, m_Value()));
+ if (Instruction *Res = foldICmpBinOp(I))
+ return Res;
- if (match(BO0, BitwiseAnd) && I.getPredicate() == ICmpInst::ICMP_ULT) {
- auto *Zero = Constant::getNullValue(BO0->getType());
- return new ICmpInst(ICmpInst::ICMP_NE, Op1, Zero);
- }
- }
- }
+ if (Instruction *Res = foldICmpWithMinMax(I))
+ return Res;
- { Value *A, *B;
+ {
+ Value *A, *B;
// Transform (A & ~B) == 0 --> (A & B) != 0
// and (A & ~B) != 0 --> (A & B) == 0
// if A is a power of 2.
if (match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) &&
match(Op1, m_Zero()) &&
- isKnownToBeAPowerOfTwo(A, DL, false, 0, AC, &I, DT) && I.isEquality())
+ isKnownToBeAPowerOfTwo(A, DL, false, 0, &AC, &I, &DT) && I.isEquality())
return new ICmpInst(I.getInversePredicate(),
Builder->CreateAnd(A, B),
Op1);
@@ -4120,149 +4418,17 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
// (zext a) * (zext b) --> llvm.umul.with.overflow.
if (match(Op0, m_Mul(m_ZExt(m_Value(A)), m_ZExt(m_Value(B))))) {
- if (Instruction *R = ProcessUMulZExtIdiom(I, Op0, Op1, *this))
+ if (Instruction *R = processUMulZExtIdiom(I, Op0, Op1, *this))
return R;
}
if (match(Op1, m_Mul(m_ZExt(m_Value(A)), m_ZExt(m_Value(B))))) {
- if (Instruction *R = ProcessUMulZExtIdiom(I, Op1, Op0, *this))
+ if (Instruction *R = processUMulZExtIdiom(I, Op1, Op0, *this))
return R;
}
}
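A minimal sketch (illustrative names, not from the patch) of the idiom processUMulZExtIdiom matches:

    #include <cstdint>

    // Unsigned multiply done in a doubled-width type, followed by an overflow
    // test on the high half; InstCombine can turn this into a single call to
    // llvm.umul.with.overflow.i32.
    bool mulOverflowsU32(uint32_t a, uint32_t b, uint32_t &lo) {
      uint64_t wide = uint64_t(a) * uint64_t(b); // (zext a) * (zext b)
      lo = uint32_t(wide);                       // truncated product
      return (wide >> 32) != 0;                  // overflow iff the high half is non-zero
    }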
- if (I.isEquality()) {
- Value *A, *B, *C, *D;
-
- if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) {
- if (A == Op1 || B == Op1) { // (A^B) == A -> B == 0
- Value *OtherVal = A == Op1 ? B : A;
- return new ICmpInst(I.getPredicate(), OtherVal,
- Constant::getNullValue(A->getType()));
- }
-
- if (match(Op1, m_Xor(m_Value(C), m_Value(D)))) {
- // A^c1 == C^c2 --> A == C^(c1^c2)
- ConstantInt *C1, *C2;
- if (match(B, m_ConstantInt(C1)) &&
- match(D, m_ConstantInt(C2)) && Op1->hasOneUse()) {
- Constant *NC = Builder->getInt(C1->getValue() ^ C2->getValue());
- Value *Xor = Builder->CreateXor(C, NC);
- return new ICmpInst(I.getPredicate(), A, Xor);
- }
-
- // A^B == A^D -> B == D
- if (A == C) return new ICmpInst(I.getPredicate(), B, D);
- if (A == D) return new ICmpInst(I.getPredicate(), B, C);
- if (B == C) return new ICmpInst(I.getPredicate(), A, D);
- if (B == D) return new ICmpInst(I.getPredicate(), A, C);
- }
- }
-
- if (match(Op1, m_Xor(m_Value(A), m_Value(B))) &&
- (A == Op0 || B == Op0)) {
- // A == (A^B) -> B == 0
- Value *OtherVal = A == Op0 ? B : A;
- return new ICmpInst(I.getPredicate(), OtherVal,
- Constant::getNullValue(A->getType()));
- }
-
- // (X&Z) == (Y&Z) -> (X^Y) & Z == 0
- if (match(Op0, m_OneUse(m_And(m_Value(A), m_Value(B)))) &&
- match(Op1, m_OneUse(m_And(m_Value(C), m_Value(D))))) {
- Value *X = nullptr, *Y = nullptr, *Z = nullptr;
-
- if (A == C) {
- X = B; Y = D; Z = A;
- } else if (A == D) {
- X = B; Y = C; Z = A;
- } else if (B == C) {
- X = A; Y = D; Z = B;
- } else if (B == D) {
- X = A; Y = C; Z = B;
- }
-
- if (X) { // Build (X^Y) & Z
- Op1 = Builder->CreateXor(X, Y);
- Op1 = Builder->CreateAnd(Op1, Z);
- I.setOperand(0, Op1);
- I.setOperand(1, Constant::getNullValue(Op1->getType()));
- return &I;
- }
- }
-
- // Transform (zext A) == (B & (1<<X)-1) --> A == (trunc B)
- // and (B & (1<<X)-1) == (zext A) --> A == (trunc B)
- ConstantInt *Cst1;
- if ((Op0->hasOneUse() &&
- match(Op0, m_ZExt(m_Value(A))) &&
- match(Op1, m_And(m_Value(B), m_ConstantInt(Cst1)))) ||
- (Op1->hasOneUse() &&
- match(Op0, m_And(m_Value(B), m_ConstantInt(Cst1))) &&
- match(Op1, m_ZExt(m_Value(A))))) {
- APInt Pow2 = Cst1->getValue() + 1;
- if (Pow2.isPowerOf2() && isa<IntegerType>(A->getType()) &&
- Pow2.logBase2() == cast<IntegerType>(A->getType())->getBitWidth())
- return new ICmpInst(I.getPredicate(), A,
- Builder->CreateTrunc(B, A->getType()));
- }
-
- // (A >> C) == (B >> C) --> (A^B) u< (1 << C)
- // For lshr and ashr pairs.
- if ((match(Op0, m_OneUse(m_LShr(m_Value(A), m_ConstantInt(Cst1)))) &&
- match(Op1, m_OneUse(m_LShr(m_Value(B), m_Specific(Cst1))))) ||
- (match(Op0, m_OneUse(m_AShr(m_Value(A), m_ConstantInt(Cst1)))) &&
- match(Op1, m_OneUse(m_AShr(m_Value(B), m_Specific(Cst1)))))) {
- unsigned TypeBits = Cst1->getBitWidth();
- unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits);
- if (ShAmt < TypeBits && ShAmt != 0) {
- ICmpInst::Predicate Pred = I.getPredicate() == ICmpInst::ICMP_NE
- ? ICmpInst::ICMP_UGE
- : ICmpInst::ICMP_ULT;
- Value *Xor = Builder->CreateXor(A, B, I.getName() + ".unshifted");
- APInt CmpVal = APInt::getOneBitSet(TypeBits, ShAmt);
- return new ICmpInst(Pred, Xor, Builder->getInt(CmpVal));
- }
- }
-
- // (A << C) == (B << C) --> ((A^B) & (~0U >> C)) == 0
- if (match(Op0, m_OneUse(m_Shl(m_Value(A), m_ConstantInt(Cst1)))) &&
- match(Op1, m_OneUse(m_Shl(m_Value(B), m_Specific(Cst1))))) {
- unsigned TypeBits = Cst1->getBitWidth();
- unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits);
- if (ShAmt < TypeBits && ShAmt != 0) {
- Value *Xor = Builder->CreateXor(A, B, I.getName() + ".unshifted");
- APInt AndVal = APInt::getLowBitsSet(TypeBits, TypeBits - ShAmt);
- Value *And = Builder->CreateAnd(Xor, Builder->getInt(AndVal),
- I.getName() + ".mask");
- return new ICmpInst(I.getPredicate(), And,
- Constant::getNullValue(Cst1->getType()));
- }
- }
-
- // Transform "icmp eq (trunc (lshr(X, cst1)), cst" to
- // "icmp (and X, mask), cst"
- uint64_t ShAmt = 0;
- if (Op0->hasOneUse() &&
- match(Op0, m_Trunc(m_OneUse(m_LShr(m_Value(A),
- m_ConstantInt(ShAmt))))) &&
- match(Op1, m_ConstantInt(Cst1)) &&
- // Only do this when A has multiple uses. This is most important to do
- // when it exposes other optimizations.
- !A->hasOneUse()) {
- unsigned ASize =cast<IntegerType>(A->getType())->getPrimitiveSizeInBits();
-
- if (ShAmt < ASize) {
- APInt MaskV =
- APInt::getLowBitsSet(ASize, Op0->getType()->getPrimitiveSizeInBits());
- MaskV <<= ShAmt;
-
- APInt CmpV = Cst1->getValue().zext(ASize);
- CmpV <<= ShAmt;
-
- Value *Mask = Builder->CreateAnd(A, Builder->getInt(MaskV));
- return new ICmpInst(I.getPredicate(), Mask, Builder->getInt(CmpV));
- }
- }
- }
+ if (Instruction *Res = foldICmpEquality(I))
+ return Res;
// The 'cmpxchg' instruction returns an aggregate containing the old value and
// an i1 which indicates whether or not we successfully did the swap.
@@ -4284,18 +4450,17 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
Value *X; ConstantInt *Cst;
// icmp X+Cst, X
if (match(Op0, m_Add(m_Value(X), m_ConstantInt(Cst))) && Op1 == X)
- return FoldICmpAddOpCst(I, X, Cst, I.getPredicate());
+ return foldICmpAddOpConst(I, X, Cst, I.getPredicate());
// icmp X, X+Cst
if (match(Op1, m_Add(m_Value(X), m_ConstantInt(Cst))) && Op0 == X)
- return FoldICmpAddOpCst(I, X, Cst, I.getSwappedPredicate());
+ return foldICmpAddOpConst(I, X, Cst, I.getSwappedPredicate());
}
return Changed ? &I : nullptr;
}
/// Fold fcmp ([us]itofp x, cst) if possible.
-Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
- Instruction *LHSI,
+Instruction *InstCombiner::foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI,
Constant *RHSC) {
if (!isa<ConstantFP>(RHSC)) return nullptr;
const APFloat &RHS = cast<ConstantFP>(RHSC)->getValueAPF();
@@ -4339,21 +4504,21 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
// This would allow us to handle (fptosi (x >>s 62) to float) if x is i64 f.e.
unsigned InputSize = IntTy->getScalarSizeInBits();
- // Following test does NOT adjust InputSize downwards for signed inputs,
- // because the most negative value still requires all the mantissa bits
+ // Following test does NOT adjust InputSize downwards for signed inputs,
+ // because the most negative value still requires all the mantissa bits
// to distinguish it from one less than that value.
if ((int)InputSize > MantissaWidth) {
// Conversion would lose accuracy. Check if loss can impact comparison.
int Exp = ilogb(RHS);
if (Exp == APFloat::IEK_Inf) {
int MaxExponent = ilogb(APFloat::getLargest(RHS.getSemantics()));
- if (MaxExponent < (int)InputSize - !LHSUnsigned)
+ if (MaxExponent < (int)InputSize - !LHSUnsigned)
// Conversion could create infinity.
return nullptr;
} else {
- // Note that if RHS is zero or NaN, then Exp is negative
+ // Note that if RHS is zero or NaN, then Exp is negative
// and first condition is trivially false.
- if (MantissaWidth <= Exp && Exp <= (int)InputSize - !LHSUnsigned)
+ if (MantissaWidth <= Exp && Exp <= (int)InputSize - !LHSUnsigned)
// Conversion could affect comparison.
return nullptr;
}
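To see why the (int)InputSize > MantissaWidth guard matters, a small standalone example (assumed, not part of the patch): float carries a 24-bit significand, so converting a wider integer can change a comparison's outcome.

    #include <cstdio>

    int main() {
      int x = 16777217;                       // 2^24 + 1 is not exactly representable in float
      float f = float(x);                     // rounds to 16777216.0f
      std::printf("%d\n", f == 16777216.0f);  // prints 1: the compare no longer mirrors the int value
      return 0;
    }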
@@ -4547,7 +4712,7 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
if (Value *V = SimplifyFCmpInst(I.getPredicate(), Op0, Op1,
- I.getFastMathFlags(), DL, TLI, DT, AC, &I))
+ I.getFastMathFlags(), DL, &TLI, &DT, &AC, &I))
return replaceInstUsesWith(I, V);
// Simplify 'fcmp pred X, X'
@@ -4601,17 +4766,17 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
const fltSemantics *Sem;
// FIXME: This shouldn't be here.
if (LHSExt->getSrcTy()->isHalfTy())
- Sem = &APFloat::IEEEhalf;
+ Sem = &APFloat::IEEEhalf();
else if (LHSExt->getSrcTy()->isFloatTy())
- Sem = &APFloat::IEEEsingle;
+ Sem = &APFloat::IEEEsingle();
else if (LHSExt->getSrcTy()->isDoubleTy())
- Sem = &APFloat::IEEEdouble;
+ Sem = &APFloat::IEEEdouble();
else if (LHSExt->getSrcTy()->isFP128Ty())
- Sem = &APFloat::IEEEquad;
+ Sem = &APFloat::IEEEquad();
else if (LHSExt->getSrcTy()->isX86_FP80Ty())
- Sem = &APFloat::x87DoubleExtended;
+ Sem = &APFloat::x87DoubleExtended();
else if (LHSExt->getSrcTy()->isPPC_FP128Ty())
- Sem = &APFloat::PPCDoubleDouble;
+ Sem = &APFloat::PPCDoubleDouble();
else
break;
@@ -4641,7 +4806,7 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
break;
case Instruction::SIToFP:
case Instruction::UIToFP:
- if (Instruction *NV = FoldFCmp_IntToFP_Cst(I, LHSI, RHSC))
+ if (Instruction *NV = foldFCmpIntToFPConst(I, LHSI, RHSC))
return NV;
break;
case Instruction::FSub: {
@@ -4658,7 +4823,7 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
!cast<LoadInst>(LHSI)->isVolatile())
- if (Instruction *Res = FoldCmpLoadFromIndexedGlobal(GEP, GV, I))
+ if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, I))
return Res;
}
break;
@@ -4667,7 +4832,7 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
break;
CallInst *CI = cast<CallInst>(LHSI);
- Intrinsic::ID IID = getIntrinsicForCallSite(CI, TLI);
+ Intrinsic::ID IID = getIntrinsicForCallSite(CI, &TLI);
if (IID != Intrinsic::fabs)
break;
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index aa421ff..2847ce8 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -84,6 +84,24 @@ static inline bool IsFreeToInvert(Value *V, bool WillInvertAllUses) {
if (isa<ConstantInt>(V))
return true;
+ // A vector of constant integers can be inverted easily.
+ Constant *CV;
+ if (V->getType()->isVectorTy() && match(V, PatternMatch::m_Constant(CV))) {
+ unsigned NumElts = V->getType()->getVectorNumElements();
+ for (unsigned i = 0; i != NumElts; ++i) {
+ Constant *Elt = CV->getAggregateElement(i);
+ if (!Elt)
+ return false;
+
+ if (isa<UndefValue>(Elt))
+ continue;
+
+ if (!isa<ConstantInt>(Elt))
+ return false;
+ }
+ return true;
+ }
+
// Compares can be inverted if all of their uses are being modified to use the
// ~V.
if (isa<CmpInst>(V))
@@ -135,33 +153,10 @@ IntrinsicIDToOverflowCheckFlavor(unsigned ID) {
}
}
-/// \brief An IRBuilder inserter that adds new instructions to the instcombine
-/// worklist.
-class LLVM_LIBRARY_VISIBILITY InstCombineIRInserter
- : public IRBuilderDefaultInserter {
- InstCombineWorklist &Worklist;
- AssumptionCache *AC;
-
-public:
- InstCombineIRInserter(InstCombineWorklist &WL, AssumptionCache *AC)
- : Worklist(WL), AC(AC) {}
-
- void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB,
- BasicBlock::iterator InsertPt) const {
- IRBuilderDefaultInserter::InsertHelper(I, Name, BB, InsertPt);
- Worklist.Add(I);
-
- using namespace llvm::PatternMatch;
- if (match(I, m_Intrinsic<Intrinsic::assume>()))
- AC->registerAssumption(cast<CallInst>(I));
- }
-};
-
/// \brief The core instruction combiner logic.
///
/// This class provides both the logic to recursively visit instructions and
-/// combine them, as well as the pass infrastructure for running this as part
-/// of the LLVM pass pipeline.
+/// combine them.
class LLVM_LIBRARY_VISIBILITY InstCombiner
: public InstVisitor<InstCombiner, Instruction *> {
// FIXME: These members shouldn't be public.
@@ -171,7 +166,7 @@ public:
/// \brief An IRBuilder that automatically inserts new instructions into the
/// worklist.
- typedef IRBuilder<TargetFolder, InstCombineIRInserter> BuilderTy;
+ typedef IRBuilder<TargetFolder, IRBuilderCallbackInserter> BuilderTy;
BuilderTy *Builder;
private:
@@ -183,10 +178,9 @@ private:
AliasAnalysis *AA;
// Required analyses.
- // FIXME: These can never be null and should be references.
- AssumptionCache *AC;
- TargetLibraryInfo *TLI;
- DominatorTree *DT;
+ AssumptionCache &AC;
+ TargetLibraryInfo &TLI;
+ DominatorTree &DT;
const DataLayout &DL;
// Optional analyses. When non-null, these can both be used to do better
@@ -198,8 +192,8 @@ private:
public:
InstCombiner(InstCombineWorklist &Worklist, BuilderTy *Builder,
bool MinimizeSize, bool ExpensiveCombines, AliasAnalysis *AA,
- AssumptionCache *AC, TargetLibraryInfo *TLI,
- DominatorTree *DT, const DataLayout &DL, LoopInfo *LI)
+ AssumptionCache &AC, TargetLibraryInfo &TLI,
+ DominatorTree &DT, const DataLayout &DL, LoopInfo *LI)
: Worklist(Worklist), Builder(Builder), MinimizeSize(MinimizeSize),
ExpensiveCombines(ExpensiveCombines), AA(AA), AC(AC), TLI(TLI), DT(DT),
DL(DL), LI(LI), MadeIRChange(false) {}
@@ -209,15 +203,15 @@ public:
/// \returns true if the IR is changed.
bool run();
- AssumptionCache *getAssumptionCache() const { return AC; }
+ AssumptionCache &getAssumptionCache() const { return AC; }
const DataLayout &getDataLayout() const { return DL; }
- DominatorTree *getDominatorTree() const { return DT; }
+ DominatorTree &getDominatorTree() const { return DT; }
LoopInfo *getLoopInfo() const { return LI; }
- TargetLibraryInfo *getTargetLibraryInfo() const { return TLI; }
+ TargetLibraryInfo &getTargetLibraryInfo() const { return TLI; }
// Visitation implementation - Implement instruction combining for different
// instruction types. The semantics are as follows:
@@ -262,29 +256,8 @@ public:
Instruction *visitAShr(BinaryOperator &I);
Instruction *visitLShr(BinaryOperator &I);
Instruction *commonShiftTransforms(BinaryOperator &I);
- Instruction *FoldFCmp_IntToFP_Cst(FCmpInst &I, Instruction *LHSI,
- Constant *RHSC);
- Instruction *FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
- GlobalVariable *GV, CmpInst &ICI,
- ConstantInt *AndCst = nullptr);
Instruction *visitFCmpInst(FCmpInst &I);
Instruction *visitICmpInst(ICmpInst &I);
- Instruction *visitICmpInstWithCastAndCast(ICmpInst &ICI);
- Instruction *visitICmpInstWithInstAndIntCst(ICmpInst &ICI, Instruction *LHS,
- ConstantInt *RHS);
- Instruction *FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,
- ConstantInt *DivRHS);
- Instruction *FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *DivI,
- ConstantInt *DivRHS);
- Instruction *FoldICmpCstShrCst(ICmpInst &I, Value *Op, Value *A,
- ConstantInt *CI1, ConstantInt *CI2);
- Instruction *FoldICmpCstShlCst(ICmpInst &I, Value *Op, Value *A,
- ConstantInt *CI1, ConstantInt *CI2);
- Instruction *FoldICmpAddOpCst(Instruction &ICI, Value *X, ConstantInt *CI,
- ICmpInst::Predicate Pred);
- Instruction *FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
- ICmpInst::Predicate Cond, Instruction &I);
- Instruction *FoldAllocaCmp(ICmpInst &ICI, AllocaInst *Alloca, Value *Other);
Instruction *FoldShiftByConstant(Value *Op0, Constant *Op1,
BinaryOperator &I);
Instruction *commonCastTransforms(CastInst &CI);
@@ -302,14 +275,8 @@ public:
Instruction *visitIntToPtr(IntToPtrInst &CI);
Instruction *visitBitCast(BitCastInst &CI);
Instruction *visitAddrSpaceCast(AddrSpaceCastInst &CI);
- Instruction *FoldSelectOpOp(SelectInst &SI, Instruction *TI, Instruction *FI);
- Instruction *FoldSelectIntoOp(SelectInst &SI, Value *, Value *);
- Instruction *FoldSPFofSPF(Instruction *Inner, SelectPatternFlavor SPF1,
- Value *A, Value *B, Instruction &Outer,
- SelectPatternFlavor SPF2, Value *C);
Instruction *FoldItoFPtoI(Instruction &FI);
Instruction *visitSelectInst(SelectInst &SI);
- Instruction *visitSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI);
Instruction *visitCallInst(CallInst &CI);
Instruction *visitInvokeInst(InvokeInst &II);
@@ -333,16 +300,16 @@ public:
Instruction *visitVAStartInst(VAStartInst &I);
Instruction *visitVACopyInst(VACopyInst &I);
- // visitInstruction - Specify what to return for unhandled instructions...
+ /// Specify what to return for unhandled instructions.
Instruction *visitInstruction(Instruction &I) { return nullptr; }
- // True when DB dominates all uses of DI execpt UI.
- // UI must be in the same block as DI.
- // The routine checks that the DI parent and DB are different.
+ /// True when DB dominates all uses of DI except UI.
+ /// UI must be in the same block as DI.
+ /// The routine checks that the DI parent and DB are different.
bool dominatesAllUses(const Instruction *DI, const Instruction *UI,
const BasicBlock *DB) const;
- // Replace select with select operand SIOpd in SI-ICmp sequence when possible
+ /// Try to replace select with select operand SIOpd in SI-ICmp sequence.
bool replacedSelectWithOperand(SelectInst *SI, const ICmpInst *Icmp,
const unsigned SIOpd);
@@ -353,16 +320,17 @@ private:
Value *dyn_castFNegVal(Value *V, bool NoSignedZero = false) const;
Type *FindElementAtOffset(PointerType *PtrTy, int64_t Offset,
SmallVectorImpl<Value *> &NewIndices);
- Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI);
- /// \brief Classify whether a cast is worth optimizing.
+ /// Classify whether a cast is worth optimizing.
+ ///
+ /// This is a helper to decide whether the simplification of
+ /// logic(cast(A), cast(B)) to cast(logic(A, B)) should be performed.
+ ///
+ /// \param CI The cast we are interested in.
///
- /// Returns true if the cast from "V to Ty" actually results in any code
- /// being generated and is interesting to optimize out. If the cast can be
- /// eliminated by some other simple transformation, we prefer to do the
- /// simplification first.
- bool ShouldOptimizeCast(Instruction::CastOps opcode, const Value *V,
- Type *Ty);
+ /// \return true if this cast actually results in any code being generated and
+ /// if it cannot already be eliminated by some other transformation.
+ bool shouldOptimizeCast(CastInst *CI);
/// \brief Try to optimize a sequence of instructions checking if an operation
/// on LHS and RHS overflows.
@@ -385,8 +353,22 @@ private:
bool transformConstExprCastCall(CallSite CS);
Instruction *transformCallThroughTrampoline(CallSite CS,
IntrinsicInst *Tramp);
- Instruction *transformZExtICmp(ICmpInst *ICI, Instruction &CI,
- bool DoXform = true);
+
+ /// Transform (zext icmp) to bitwise / integer operations in order to
+ /// eliminate it.
+ ///
+ /// \param ICI The icmp of the (zext icmp) pair we are interested in.
+  /// \param CI The zext of the (zext icmp) pair we are interested in.
+ /// \param DoTransform Pass false to just test whether the given (zext icmp)
+ /// would be transformed. Pass true to actually perform the transformation.
+ ///
+ /// \return null if the transformation cannot be performed. If the
+ /// transformation can be performed the new instruction that replaces the
+ /// (zext icmp) pair will be returned (if \p DoTransform is false the
+ /// unmodified \p ICI will be returned in this case).
+ Instruction *transformZExtICmp(ICmpInst *ICI, ZExtInst &CI,
+ bool DoTransform = true);
+
Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI);
bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS, Instruction &CxtI);
bool WillNotOverflowSignedSub(Value *LHS, Value *RHS, Instruction &CxtI);
@@ -396,6 +378,21 @@ private:
Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN);
Value *EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask);
Instruction *foldCastedBitwiseLogic(BinaryOperator &I);
+ Instruction *shrinkBitwiseLogic(TruncInst &Trunc);
+ Instruction *optimizeBitCastFromPhi(CastInst &CI, PHINode *PN);
+
+ /// Determine if a pair of casts can be replaced by a single cast.
+ ///
+ /// \param CI1 The first of a pair of casts.
+ /// \param CI2 The second of a pair of casts.
+ ///
+ /// \return 0 if the cast pair cannot be eliminated, otherwise returns an
+ /// Instruction::CastOps value for a cast that can replace the pair, casting
+ /// CI1->getSrcTy() to CI2->getDstTy().
+ ///
+ /// \see CastInst::isEliminableCastPair
+ Instruction::CastOps isEliminableCastPair(const CastInst *CI1,
+ const CastInst *CI2);
public:
/// \brief Inserts an instruction \p New before instruction \p Old
@@ -476,30 +473,30 @@ public:
void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne,
unsigned Depth, Instruction *CxtI) const {
- return llvm::computeKnownBits(V, KnownZero, KnownOne, DL, Depth, AC, CxtI,
- DT);
+ return llvm::computeKnownBits(V, KnownZero, KnownOne, DL, Depth, &AC, CxtI,
+ &DT);
}
bool MaskedValueIsZero(Value *V, const APInt &Mask, unsigned Depth = 0,
Instruction *CxtI = nullptr) const {
- return llvm::MaskedValueIsZero(V, Mask, DL, Depth, AC, CxtI, DT);
+ return llvm::MaskedValueIsZero(V, Mask, DL, Depth, &AC, CxtI, &DT);
}
unsigned ComputeNumSignBits(Value *Op, unsigned Depth = 0,
Instruction *CxtI = nullptr) const {
- return llvm::ComputeNumSignBits(Op, DL, Depth, AC, CxtI, DT);
+ return llvm::ComputeNumSignBits(Op, DL, Depth, &AC, CxtI, &DT);
}
void ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne,
unsigned Depth = 0, Instruction *CxtI = nullptr) const {
- return llvm::ComputeSignBit(V, KnownZero, KnownOne, DL, Depth, AC, CxtI,
- DT);
+ return llvm::ComputeSignBit(V, KnownZero, KnownOne, DL, Depth, &AC, CxtI,
+ &DT);
}
OverflowResult computeOverflowForUnsignedMul(Value *LHS, Value *RHS,
const Instruction *CxtI) {
- return llvm::computeOverflowForUnsignedMul(LHS, RHS, DL, AC, CxtI, DT);
+ return llvm::computeOverflowForUnsignedMul(LHS, RHS, DL, &AC, CxtI, &DT);
}
OverflowResult computeOverflowForUnsignedAdd(Value *LHS, Value *RHS,
const Instruction *CxtI) {
- return llvm::computeOverflowForUnsignedAdd(LHS, RHS, DL, AC, CxtI, DT);
+ return llvm::computeOverflowForUnsignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);
}
private:
@@ -539,13 +536,21 @@ private:
Value *SimplifyVectorOp(BinaryOperator &Inst);
Value *SimplifyBSwap(BinaryOperator &Inst);
- // FoldOpIntoPhi - Given a binary operator, cast instruction, or select
- // which has a PHI node as operand #0, see if we can fold the instruction
- // into the PHI (which is only possible if all operands to the PHI are
- // constants).
- //
+
+ /// Given a binary operator, cast instruction, or select which has a PHI node
+ /// as operand #0, see if we can fold the instruction into the PHI (which is
+ /// only possible if all operands to the PHI are constants).
Instruction *FoldOpIntoPhi(Instruction &I);
+ /// Given an instruction with a select as one operand and a constant as the
+ /// other operand, try to fold the binary operator into the select arguments.
+ /// This also works for Cast instructions, which obviously do not have a
+ /// second operand.
+ Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI);
+
+ /// This is a convenience wrapper function for the above two functions.
+ Instruction *foldOpWithConstantIntoOperand(Instruction &I);
+
/// \brief Try to rotate an operation below a PHI node, using PHI nodes for
/// its operands.
Instruction *FoldPHIArgOpIntoPHI(PHINode &PN);
@@ -554,13 +559,82 @@ private:
Instruction *FoldPHIArgLoadIntoPHI(PHINode &PN);
Instruction *FoldPHIArgZextsIntoPHI(PHINode &PN);
+ /// Helper function for FoldPHIArgXIntoPHI() to get debug location for the
+ /// folded operation.
+ DebugLoc PHIArgMergedDebugLoc(PHINode &PN);
+
+ Instruction *foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
+ ICmpInst::Predicate Cond, Instruction &I);
+ Instruction *foldAllocaCmp(ICmpInst &ICI, const AllocaInst *Alloca,
+ const Value *Other);
+ Instruction *foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
+ GlobalVariable *GV, CmpInst &ICI,
+ ConstantInt *AndCst = nullptr);
+ Instruction *foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI,
+ Constant *RHSC);
+ Instruction *foldICmpAddOpConst(Instruction &ICI, Value *X, ConstantInt *CI,
+ ICmpInst::Predicate Pred);
+ Instruction *foldICmpWithCastAndCast(ICmpInst &ICI);
+
+ Instruction *foldICmpUsingKnownBits(ICmpInst &Cmp);
+ Instruction *foldICmpWithConstant(ICmpInst &Cmp);
+ Instruction *foldICmpInstWithConstant(ICmpInst &Cmp);
+ Instruction *foldICmpInstWithConstantNotInt(ICmpInst &Cmp);
+ Instruction *foldICmpBinOp(ICmpInst &Cmp);
+ Instruction *foldICmpEquality(ICmpInst &Cmp);
+
+ Instruction *foldICmpTruncConstant(ICmpInst &Cmp, Instruction *Trunc,
+ const APInt *C);
+ Instruction *foldICmpAndConstant(ICmpInst &Cmp, BinaryOperator *And,
+ const APInt *C);
+ Instruction *foldICmpXorConstant(ICmpInst &Cmp, BinaryOperator *Xor,
+ const APInt *C);
+ Instruction *foldICmpOrConstant(ICmpInst &Cmp, BinaryOperator *Or,
+ const APInt *C);
+ Instruction *foldICmpMulConstant(ICmpInst &Cmp, BinaryOperator *Mul,
+ const APInt *C);
+ Instruction *foldICmpShlConstant(ICmpInst &Cmp, BinaryOperator *Shl,
+ const APInt *C);
+ Instruction *foldICmpShrConstant(ICmpInst &Cmp, BinaryOperator *Shr,
+ const APInt *C);
+ Instruction *foldICmpUDivConstant(ICmpInst &Cmp, BinaryOperator *UDiv,
+ const APInt *C);
+ Instruction *foldICmpDivConstant(ICmpInst &Cmp, BinaryOperator *Div,
+ const APInt *C);
+ Instruction *foldICmpSubConstant(ICmpInst &Cmp, BinaryOperator *Sub,
+ const APInt *C);
+ Instruction *foldICmpAddConstant(ICmpInst &Cmp, BinaryOperator *Add,
+ const APInt *C);
+ Instruction *foldICmpAndConstConst(ICmpInst &Cmp, BinaryOperator *And,
+ const APInt *C1);
+ Instruction *foldICmpAndShift(ICmpInst &Cmp, BinaryOperator *And,
+ const APInt *C1, const APInt *C2);
+ Instruction *foldICmpShrConstConst(ICmpInst &I, Value *ShAmt, const APInt &C1,
+ const APInt &C2);
+ Instruction *foldICmpShlConstConst(ICmpInst &I, Value *ShAmt, const APInt &C1,
+ const APInt &C2);
+
+ Instruction *foldICmpBinOpEqualityWithConstant(ICmpInst &Cmp,
+ BinaryOperator *BO,
+ const APInt *C);
+ Instruction *foldICmpIntrinsicWithConstant(ICmpInst &ICI, const APInt *C);
+
+ // Helpers of visitSelectInst().
+ Instruction *foldSelectExtConst(SelectInst &Sel);
+ Instruction *foldSelectOpOp(SelectInst &SI, Instruction *TI, Instruction *FI);
+ Instruction *foldSelectIntoOp(SelectInst &SI, Value *, Value *);
+ Instruction *foldSPFofSPF(Instruction *Inner, SelectPatternFlavor SPF1,
+ Value *A, Value *B, Instruction &Outer,
+ SelectPatternFlavor SPF2, Value *C);
+ Instruction *foldSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI);
+
Instruction *OptAndOp(Instruction *Op, ConstantInt *OpRHS,
ConstantInt *AndRHS, BinaryOperator &TheAnd);
Value *FoldLogicalPlusAnd(Value *LHS, Value *RHS, ConstantInt *Mask,
bool isSub, Instruction &I);
- Value *InsertRangeTest(Value *V, Constant *Lo, Constant *Hi, bool isSigned,
- bool Inside);
+ Value *insertRangeTest(Value *V, const APInt &Lo, const APInt &Hi,
+ bool isSigned, bool Inside);
Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI);
Instruction *MatchBSwap(BinaryOperator &I);
bool SimplifyStoreAtEndOfBlock(StoreInst &SI);
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index d88456e..49e516e 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -15,6 +15,7 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/Loads.h"
+#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -59,14 +60,14 @@ isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy,
// eliminate the markers.
SmallVector<std::pair<Value *, bool>, 35> ValuesToInspect;
- ValuesToInspect.push_back(std::make_pair(V, false));
+ ValuesToInspect.emplace_back(V, false);
while (!ValuesToInspect.empty()) {
auto ValuePair = ValuesToInspect.pop_back_val();
const bool IsOffset = ValuePair.second;
for (auto &U : ValuePair.first->uses()) {
- Instruction *I = cast<Instruction>(U.getUser());
+ auto *I = cast<Instruction>(U.getUser());
- if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
// Ignore non-volatile loads, they are always ok.
if (!LI->isSimple()) return false;
continue;
@@ -74,14 +75,13 @@ isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy,
if (isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I)) {
// If uses of the bitcast are ok, we are ok.
- ValuesToInspect.push_back(std::make_pair(I, IsOffset));
+ ValuesToInspect.emplace_back(I, IsOffset);
continue;
}
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
// If the GEP has all zero indices, it doesn't offset the pointer. If it
// doesn't, it does.
- ValuesToInspect.push_back(
- std::make_pair(I, IsOffset || !GEP->hasAllZeroIndices()));
+ ValuesToInspect.emplace_back(I, IsOffset || !GEP->hasAllZeroIndices());
continue;
}
@@ -286,7 +286,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
SmallVector<Instruction *, 4> ToDelete;
if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(&AI, ToDelete)) {
unsigned SourceAlign = getOrEnforceKnownAlignment(
- Copy->getSource(), AI.getAlignment(), DL, &AI, AC, DT);
+ Copy->getSource(), AI.getAlignment(), DL, &AI, &AC, &DT);
if (AI.getAlignment() <= SourceAlign) {
DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');
DEBUG(dbgs() << " memcpy = " << *Copy << '\n');
@@ -308,6 +308,11 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
return visitAllocSite(AI);
}
+// Are we allowed to form an atomic load or store of this type?
+static bool isSupportedAtomicType(Type *Ty) {
+ return Ty->isIntegerTy() || Ty->isPointerTy() || Ty->isFloatingPointTy();
+}
+
/// \brief Helper to combine a load to a new type.
///
/// This just does the work of combining a load to a new type. It handles
@@ -319,6 +324,9 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
/// point the \c InstCombiner currently is using.
static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewTy,
const Twine &Suffix = "") {
+ assert((!LI.isAtomic() || isSupportedAtomicType(NewTy)) &&
+ "can't fold an atomic load to requested type");
+
Value *Ptr = LI.getPointerOperand();
unsigned AS = LI.getPointerAddressSpace();
SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
@@ -380,8 +388,16 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT
break;
case LLVMContext::MD_range:
// FIXME: It would be nice to propagate this in some way, but the type
- // conversions make it hard. If the new type is a pointer, we could
- // translate it to !nonnull metadata.
+ // conversions make it hard.
+
+ // If it's a pointer now and the range does not contain 0, make it !nonnull.
+ if (NewTy->isPointerTy()) {
+ unsigned BitWidth = IC.getDataLayout().getTypeSizeInBits(NewTy);
+ if (!getConstantRangeFromMetadata(*N).contains(APInt(BitWidth, 0))) {
+ MDNode *NN = MDNode::get(LI.getContext(), None);
+ NewLoad->setMetadata(LLVMContext::MD_nonnull, NN);
+ }
+ }
break;
}
}
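The new branch translates an integer load's !range metadata into !nonnull when the load is rebuilt as a pointer load and the range provably excludes zero. A small, self-contained sketch of the underlying containment test on a single half-open, possibly wrapping [Lo, Hi) pair (illustrative only; the real getConstantRangeFromMetadata also unions multiple range pairs):

#include <cassert>
#include <cstdint>

// Does the half-open, possibly wrapping range [Lo, Hi) contain V? This mirrors
// the question the new code asks of the !range metadata before it is willing
// to attach !nonnull to the rebuilt pointer load.
static bool rangeContains(uint64_t Lo, uint64_t Hi, uint64_t V) {
  if (Lo == Hi)
    return false;             // degenerate; !range never encodes this form
  if (Lo < Hi)
    return Lo <= V && V < Hi; // ordinary range
  return V >= Lo || V < Hi;   // wrapping range, e.g. [10, 2)
}

int main() {
  assert(!rangeContains(1, 100, 0)); // excludes null, so !nonnull is safe
  assert(rangeContains(0, 100, 0));  // contains null, keep quiet
  assert(rangeContains(100, 1, 0));  // wrapping range covers 0
  return 0;
}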
@@ -392,6 +408,9 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT
///
/// Returns the newly created store instruction.
static StoreInst *combineStoreToNewValue(InstCombiner &IC, StoreInst &SI, Value *V) {
+ assert((!SI.isAtomic() || isSupportedAtomicType(V->getType())) &&
+ "can't fold an atomic store of requested type");
+
Value *Ptr = SI.getPointerOperand();
unsigned AS = SI.getPointerAddressSpace();
SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
@@ -466,6 +485,10 @@ static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) {
if (LI.use_empty())
return nullptr;
+ // swifterror values can't be bitcasted.
+ if (LI.getPointerOperand()->isSwiftError())
+ return nullptr;
+
Type *Ty = LI.getType();
const DataLayout &DL = IC.getDataLayout();
@@ -475,10 +498,12 @@ static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) {
// size is a legal integer type.
if (!Ty->isIntegerTy() && Ty->isSized() &&
DL.isLegalInteger(DL.getTypeStoreSizeInBits(Ty)) &&
- DL.getTypeStoreSizeInBits(Ty) == DL.getTypeSizeInBits(Ty)) {
- if (std::all_of(LI.user_begin(), LI.user_end(), [&LI](User *U) {
+ DL.getTypeStoreSizeInBits(Ty) == DL.getTypeSizeInBits(Ty) &&
+ !DL.isNonIntegralPointerType(Ty)) {
+ if (all_of(LI.users(), [&LI](User *U) {
auto *SI = dyn_cast<StoreInst>(U);
- return SI && SI->getPointerOperand() != &LI;
+ return SI && SI->getPointerOperand() != &LI &&
+ !SI->getPointerOperand()->isSwiftError();
})) {
LoadInst *NewLoad = combineLoadToNewType(
IC, LI,
@@ -501,14 +526,14 @@ static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) {
// as long as those are noops (i.e., the source or dest type have the same
// bitwidth as the target's pointers).
if (LI.hasOneUse())
- if (auto* CI = dyn_cast<CastInst>(LI.user_back())) {
- if (CI->isNoopCast(DL)) {
- LoadInst *NewLoad = combineLoadToNewType(IC, LI, CI->getDestTy());
- CI->replaceAllUsesWith(NewLoad);
- IC.eraseInstFromFunction(*CI);
- return &LI;
- }
- }
+ if (auto* CI = dyn_cast<CastInst>(LI.user_back()))
+ if (CI->isNoopCast(DL))
+ if (!LI.isAtomic() || isSupportedAtomicType(CI->getDestTy())) {
+ LoadInst *NewLoad = combineLoadToNewType(IC, LI, CI->getDestTy());
+ CI->replaceAllUsesWith(NewLoad);
+ IC.eraseInstFromFunction(*CI);
+ return &LI;
+ }
// FIXME: We should also canonicalize loads of vectors when their elements are
// cast to other types.
@@ -802,7 +827,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
// Attempt to improve the alignment.
unsigned KnownAlign = getOrEnforceKnownAlignment(
- Op, DL.getPrefTypeAlignment(LI.getType()), DL, &LI, AC, DT);
+ Op, DL.getPrefTypeAlignment(LI.getType()), DL, &LI, &AC, &DT);
unsigned LoadAlign = LI.getAlignment();
unsigned EffectiveLoadAlign =
LoadAlign != 0 ? LoadAlign : DL.getABITypeAlignment(LI.getType());
@@ -825,22 +850,11 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
// where there are several consecutive memory accesses to the same location,
// separated by a few arithmetic operations.
BasicBlock::iterator BBI(LI);
- AAMDNodes AATags;
bool IsLoadCSE = false;
- if (Value *AvailableVal =
- FindAvailableLoadedValue(&LI, LI.getParent(), BBI,
- DefMaxInstsToScan, AA, &AATags, &IsLoadCSE)) {
- if (IsLoadCSE) {
- LoadInst *NLI = cast<LoadInst>(AvailableVal);
- unsigned KnownIDs[] = {
- LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
- LLVMContext::MD_noalias, LLVMContext::MD_range,
- LLVMContext::MD_invariant_load, LLVMContext::MD_nonnull,
- LLVMContext::MD_invariant_group, LLVMContext::MD_align,
- LLVMContext::MD_dereferenceable,
- LLVMContext::MD_dereferenceable_or_null};
- combineMetadata(NLI, &LI, KnownIDs);
- };
+ if (Value *AvailableVal = FindAvailableLoadedValue(
+ &LI, LI.getParent(), BBI, DefMaxInstsToScan, AA, &IsLoadCSE)) {
+ if (IsLoadCSE)
+ combineMetadataForCSE(cast<LoadInst>(AvailableVal), &LI);
return replaceInstUsesWith(
LI, Builder->CreateBitOrPointerCast(AvailableVal, LI.getType(),
@@ -1005,19 +1019,26 @@ static bool combineStoreToValueType(InstCombiner &IC, StoreInst &SI) {
if (!SI.isUnordered())
return false;
+ // swifterror values can't be bitcasted.
+ if (SI.getPointerOperand()->isSwiftError())
+ return false;
+
Value *V = SI.getValueOperand();
// Fold away bit casts of the stored value by storing the original type.
if (auto *BC = dyn_cast<BitCastInst>(V)) {
V = BC->getOperand(0);
- combineStoreToNewValue(IC, SI, V);
- return true;
+ if (!SI.isAtomic() || isSupportedAtomicType(V->getType())) {
+ combineStoreToNewValue(IC, SI, V);
+ return true;
+ }
}
- if (Value *U = likeBitCastFromVector(IC, V)) {
- combineStoreToNewValue(IC, SI, U);
- return true;
- }
+ if (Value *U = likeBitCastFromVector(IC, V))
+ if (!SI.isAtomic() || isSupportedAtomicType(U->getType())) {
+ combineStoreToNewValue(IC, SI, U);
+ return true;
+ }
// FIXME: We should also canonicalize stores of vectors when their elements
// are cast to other types.
@@ -1169,7 +1190,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
// Attempt to improve the alignment.
unsigned KnownAlign = getOrEnforceKnownAlignment(
- Ptr, DL.getPrefTypeAlignment(Val->getType()), DL, &SI, AC, DT);
+ Ptr, DL.getPrefTypeAlignment(Val->getType()), DL, &SI, &AC, &DT);
unsigned StoreAlign = SI.getAlignment();
unsigned EffectiveStoreAlign =
StoreAlign != 0 ? StoreAlign : DL.getABITypeAlignment(Val->getType());
@@ -1293,7 +1314,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
assert(SI.isUnordered() &&
"this code has not been auditted for volatile or ordered store case");
-
+
BasicBlock *StoreBB = SI.getParent();
// Check to see if the successor block has exactly two incoming edges. If
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 788097f..45a19fb 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -48,8 +48,8 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC,
BinaryOperator *I = dyn_cast<BinaryOperator>(V);
if (I && I->isLogicalShift() &&
isKnownToBeAPowerOfTwo(I->getOperand(0), IC.getDataLayout(), false, 0,
- IC.getAssumptionCache(), &CxtI,
- IC.getDominatorTree())) {
+ &IC.getAssumptionCache(), &CxtI,
+ &IC.getDominatorTree())) {
// We know that this is an exact/nuw shift and that the input is a
// non-zero context as well.
if (Value *V2 = simplifyValueKnownNonZero(I->getOperand(0), IC, CxtI)) {
@@ -179,7 +179,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
if (Value *V = SimplifyVectorOp(I))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifyMulInst(Op0, Op1, DL, TLI, DT, AC))
+ if (Value *V = SimplifyMulInst(Op0, Op1, DL, &TLI, &DT, &AC))
return replaceInstUsesWith(I, V);
if (Value *V = SimplifyUsingDistributiveLaws(I))
@@ -267,14 +267,8 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
// Simplify mul instructions with a constant RHS.
if (isa<Constant>(Op1)) {
- // Try to fold constant mul into select arguments.
- if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
- if (Instruction *R = FoldOpIntoSelect(I, SI))
- return R;
-
- if (isa<PHINode>(Op0))
- if (Instruction *NV = FoldOpIntoPhi(I))
- return NV;
+ if (Instruction *FoldedMul = foldOpWithConstantIntoOperand(I))
+ return FoldedMul;
// Canonicalize (X+C1)*CI -> X*CI+C1*CI.
{
@@ -389,6 +383,80 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
}
}
+ // Check for (mul (sext x), y), see if we can merge this into an
+ // integer mul followed by a sext.
+ if (SExtInst *Op0Conv = dyn_cast<SExtInst>(Op0)) {
+ // (mul (sext x), cst) --> (sext (mul x, cst'))
+ if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
+ if (Op0Conv->hasOneUse()) {
+ Constant *CI =
+ ConstantExpr::getTrunc(Op1C, Op0Conv->getOperand(0)->getType());
+ if (ConstantExpr::getSExt(CI, I.getType()) == Op1C &&
+ WillNotOverflowSignedMul(Op0Conv->getOperand(0), CI, I)) {
+ // Insert the new, smaller mul.
+ Value *NewMul =
+ Builder->CreateNSWMul(Op0Conv->getOperand(0), CI, "mulconv");
+ return new SExtInst(NewMul, I.getType());
+ }
+ }
+ }
+
+ // (mul (sext x), (sext y)) --> (sext (mul int x, y))
+ if (SExtInst *Op1Conv = dyn_cast<SExtInst>(Op1)) {
+      // Only do this if x/y have the same type, if at least one of them has a
+ // single use (so we don't increase the number of sexts), and if the
+ // integer mul will not overflow.
+ if (Op0Conv->getOperand(0)->getType() ==
+ Op1Conv->getOperand(0)->getType() &&
+ (Op0Conv->hasOneUse() || Op1Conv->hasOneUse()) &&
+ WillNotOverflowSignedMul(Op0Conv->getOperand(0),
+ Op1Conv->getOperand(0), I)) {
+ // Insert the new integer mul.
+ Value *NewMul = Builder->CreateNSWMul(
+ Op0Conv->getOperand(0), Op1Conv->getOperand(0), "mulconv");
+ return new SExtInst(NewMul, I.getType());
+ }
+ }
+ }
+
+ // Check for (mul (zext x), y), see if we can merge this into an
+ // integer mul followed by a zext.
+ if (auto *Op0Conv = dyn_cast<ZExtInst>(Op0)) {
+ // (mul (zext x), cst) --> (zext (mul x, cst'))
+ if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
+ if (Op0Conv->hasOneUse()) {
+ Constant *CI =
+ ConstantExpr::getTrunc(Op1C, Op0Conv->getOperand(0)->getType());
+ if (ConstantExpr::getZExt(CI, I.getType()) == Op1C &&
+ computeOverflowForUnsignedMul(Op0Conv->getOperand(0), CI, &I) ==
+ OverflowResult::NeverOverflows) {
+ // Insert the new, smaller mul.
+ Value *NewMul =
+ Builder->CreateNUWMul(Op0Conv->getOperand(0), CI, "mulconv");
+ return new ZExtInst(NewMul, I.getType());
+ }
+ }
+ }
+
+ // (mul (zext x), (zext y)) --> (zext (mul int x, y))
+ if (auto *Op1Conv = dyn_cast<ZExtInst>(Op1)) {
+      // Only do this if x/y have the same type, if at least one of them has a
+ // single use (so we don't increase the number of zexts), and if the
+ // integer mul will not overflow.
+ if (Op0Conv->getOperand(0)->getType() ==
+ Op1Conv->getOperand(0)->getType() &&
+ (Op0Conv->hasOneUse() || Op1Conv->hasOneUse()) &&
+ computeOverflowForUnsignedMul(Op0Conv->getOperand(0),
+ Op1Conv->getOperand(0),
+ &I) == OverflowResult::NeverOverflows) {
+ // Insert the new integer mul.
+ Value *NewMul = Builder->CreateNUWMul(
+ Op0Conv->getOperand(0), Op1Conv->getOperand(0), "mulconv");
+ return new ZExtInst(NewMul, I.getType());
+ }
+ }
+ }
+
if (!I.hasNoSignedWrap() && WillNotOverflowSignedMul(Op0, Op1, I)) {
Changed = true;
I.setHasNoSignedWrap(true);
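The two new blocks shrink a multiply that is only widened by sext/zext, provided the narrow multiply provably cannot overflow. A brute-force C++ sanity check of the signed variant over all 8-bit pairs, under the same no-overflow precondition that WillNotOverflowSignedMul establishes (a sketch of the identity, not of the pass itself):

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  // (mul (sext x), (sext y)) may become (sext (mul x, y)) only when the
  // narrow product cannot overflow; check every i8 pair whose product fits.
  for (int x = -128; x <= 127; ++x)
    for (int y = -128; y <= 127; ++y) {
      int wide = x * y;                 // multiply the sign-extended values
      if (wide < -128 || wide > 127)
        continue;                       // would overflow i8: fold not legal
      unsigned lowByte = ((unsigned)x * (unsigned)y) & 0xffu; // wrapping i8 mul
      int narrowThenSext = (int)(int8_t)(uint8_t)lowByte;     // sext the result
      assert(narrowThenSext == wide);
    }
  std::puts("sext-mul narrowing identity holds whenever the i8 product fits");
  return 0;
}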
@@ -545,21 +613,15 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
std::swap(Op0, Op1);
if (Value *V =
- SimplifyFMulInst(Op0, Op1, I.getFastMathFlags(), DL, TLI, DT, AC))
+ SimplifyFMulInst(Op0, Op1, I.getFastMathFlags(), DL, &TLI, &DT, &AC))
return replaceInstUsesWith(I, V);
bool AllowReassociate = I.hasUnsafeAlgebra();
// Simplify mul instructions with a constant RHS.
if (isa<Constant>(Op1)) {
- // Try to fold constant mul into select arguments.
- if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
- if (Instruction *R = FoldOpIntoSelect(I, SI))
- return R;
-
- if (isa<PHINode>(Op0))
- if (Instruction *NV = FoldOpIntoPhi(I))
- return NV;
+ if (Instruction *FoldedMul = foldOpWithConstantIntoOperand(I))
+ return FoldedMul;
// (fmul X, -1.0) --> (fsub -0.0, X)
if (match(Op1, m_SpecificFP(-1.0))) {
@@ -709,7 +771,6 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
BuilderTy::FastMathFlagGuard Guard(*Builder);
Builder->setFastMathFlags(I.getFastMathFlags());
Value *T = Builder->CreateFMul(Opnd1, Opnd1);
-
Value *R = Builder->CreateFMul(T, Y);
R->takeName(&I);
return replaceInstUsesWith(I, R);
@@ -883,14 +944,9 @@ Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) {
}
}
- if (*C2 != 0) { // avoid X udiv 0
- if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
- if (Instruction *R = FoldOpIntoSelect(I, SI))
- return R;
- if (isa<PHINode>(Op0))
- if (Instruction *NV = FoldOpIntoPhi(I))
- return NV;
- }
+ if (*C2 != 0) // avoid X udiv 0
+ if (Instruction *FoldedDiv = foldOpWithConstantIntoOperand(I))
+ return FoldedDiv;
}
}
@@ -991,19 +1047,22 @@ static Instruction *foldUDivNegCst(Value *Op0, Value *Op1,
}
// X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2)
+// X udiv (zext (C1 << N)), where C1 is "1<<C2" --> X >> (N+C2)
static Instruction *foldUDivShl(Value *Op0, Value *Op1, const BinaryOperator &I,
InstCombiner &IC) {
- Instruction *ShiftLeft = cast<Instruction>(Op1);
- if (isa<ZExtInst>(ShiftLeft))
- ShiftLeft = cast<Instruction>(ShiftLeft->getOperand(0));
-
- const APInt &CI =
- cast<Constant>(ShiftLeft->getOperand(0))->getUniqueInteger();
- Value *N = ShiftLeft->getOperand(1);
- if (CI != 1)
- N = IC.Builder->CreateAdd(N, ConstantInt::get(N->getType(), CI.logBase2()));
- if (ZExtInst *Z = dyn_cast<ZExtInst>(Op1))
- N = IC.Builder->CreateZExt(N, Z->getDestTy());
+ Value *ShiftLeft;
+ if (!match(Op1, m_ZExt(m_Value(ShiftLeft))))
+ ShiftLeft = Op1;
+
+ const APInt *CI;
+ Value *N;
+ if (!match(ShiftLeft, m_Shl(m_APInt(CI), m_Value(N))))
+ llvm_unreachable("match should never fail here!");
+ if (*CI != 1)
+ N = IC.Builder->CreateAdd(N,
+ ConstantInt::get(N->getType(), CI->logBase2()));
+ if (Op1 != ShiftLeft)
+ N = IC.Builder->CreateZExt(N, Op1->getType());
BinaryOperator *LShr = BinaryOperator::CreateLShr(Op0, N);
if (I.isExact())
LShr->setIsExact();
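foldUDivShl, now rewritten on top of the matcher API, keeps the same identity: dividing by (C1 << N) with C1 == 1 << C2 is a right shift by N + C2. A standalone C++ check of that identity (sketch only; the real code also handles the zext-wrapped shift and carries over the exact flag):

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  // x udiv (C1 << N) == x >> (N + C2) when C1 == 1 << C2, i.e. the divisor
  // is a power of two built out of two shifts.
  for (uint32_t x = 0; x < (1u << 16); ++x)
    for (unsigned c2 = 0; c2 < 8; ++c2)
      for (unsigned n = 0; n + c2 < 16; ++n) {
        uint32_t divisor = (1u << c2) << n;
        assert(x / divisor == x >> (n + c2));
      }
  std::puts("udiv-by-shifted-power-of-two identity holds");
  return 0;
}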
@@ -1059,7 +1118,7 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
if (Value *V = SimplifyVectorOp(I))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifyUDivInst(Op0, Op1, DL, TLI, DT, AC))
+ if (Value *V = SimplifyUDivInst(Op0, Op1, DL, &TLI, &DT, &AC))
return replaceInstUsesWith(I, V);
// Handle the integer div common cases
@@ -1132,7 +1191,7 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) {
if (Value *V = SimplifyVectorOp(I))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifySDivInst(Op0, Op1, DL, TLI, DT, AC))
+ if (Value *V = SimplifySDivInst(Op0, Op1, DL, &TLI, &DT, &AC))
return replaceInstUsesWith(I, V);
// Handle the integer div common cases
@@ -1195,7 +1254,7 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) {
return BO;
}
- if (isKnownToBeAPowerOfTwo(Op1, DL, /*OrZero*/ true, 0, AC, &I, DT)) {
+ if (isKnownToBeAPowerOfTwo(Op1, DL, /*OrZero*/ true, 0, &AC, &I, &DT)) {
// X sdiv (1 << Y) -> X udiv (1 << Y) ( -> X u>> Y)
// Safe because the only negative value (1 << Y) can take on is
// INT_MIN, and X sdiv INT_MIN == X udiv INT_MIN == 0 if X doesn't have
@@ -1247,7 +1306,7 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
return replaceInstUsesWith(I, V);
if (Value *V = SimplifyFDivInst(Op0, Op1, I.getFastMathFlags(),
- DL, TLI, DT, AC))
+ DL, &TLI, &DT, &AC))
return replaceInstUsesWith(I, V);
if (isa<Constant>(Op0))
@@ -1367,6 +1426,16 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
}
}
+ Value *LHS;
+ Value *RHS;
+
+ // -x / -y -> x / y
+ if (match(Op0, m_FNeg(m_Value(LHS))) && match(Op1, m_FNeg(m_Value(RHS)))) {
+ I.setOperand(0, LHS);
+ I.setOperand(1, RHS);
+ return &I;
+ }
+
return nullptr;
}
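The new -x / -y --> x / y fold relies on the two sign flips cancelling: under IEEE-754 the quotient's magnitude is unchanged and the result sign is the same, so no fast-math flags are needed. A minimal C++ spot check over a few representative values, including signed zeros, infinities, and NaN:

#include <cassert>
#include <cmath>
#include <cstdio>

static bool sameDouble(double a, double b) {
  // Compare with bit-level intent: NaN matches NaN here, and +0.0 is
  // distinguished from -0.0 via the sign bit.
  if (std::isnan(a) || std::isnan(b))
    return std::isnan(a) && std::isnan(b);
  return a == b && std::signbit(a) == std::signbit(b);
}

int main() {
  const double vals[] = {0.0, -0.0, 1.5, -3.25, 1e308, 5e-324, INFINITY, NAN};
  for (double x : vals)
    for (double y : vals)
      assert(sameDouble((-x) / (-y), x / y));
  std::puts("-x / -y == x / y for the sampled values");
  return 0;
}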
@@ -1421,7 +1490,7 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) {
if (Value *V = SimplifyVectorOp(I))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifyURemInst(Op0, Op1, DL, TLI, DT, AC))
+ if (Value *V = SimplifyURemInst(Op0, Op1, DL, &TLI, &DT, &AC))
return replaceInstUsesWith(I, V);
if (Instruction *common = commonIRemTransforms(I))
@@ -1434,7 +1503,7 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) {
I.getType());
// X urem Y -> X and Y-1, where Y is a power of 2,
- if (isKnownToBeAPowerOfTwo(Op1, DL, /*OrZero*/ true, 0, AC, &I, DT)) {
+ if (isKnownToBeAPowerOfTwo(Op1, DL, /*OrZero*/ true, 0, &AC, &I, &DT)) {
Constant *N1 = Constant::getAllOnesValue(I.getType());
Value *Add = Builder->CreateAdd(Op1, N1);
return BinaryOperator::CreateAnd(Op0, Add);
@@ -1447,6 +1516,14 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) {
return replaceInstUsesWith(I, Ext);
}
+ // X urem C -> X < C ? X : X - C, where C >= signbit.
+ const APInt *DivisorC;
+ if (match(Op1, m_APInt(DivisorC)) && DivisorC->isNegative()) {
+ Value *Cmp = Builder->CreateICmpULT(Op0, Op1);
+ Value *Sub = Builder->CreateSub(Op0, Op1);
+ return SelectInst::Create(Cmp, Op0, Sub);
+ }
+
return nullptr;
}
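The last new block uses the fact that when the divisor's sign bit is set (C >= 2^(w-1)), the quotient x / C can only be 0 or 1, so the remainder is either x itself or x - C. A brute-force C++ check of the identity at 8 bits (a sketch of the arithmetic fact, not of the IR builder calls):

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  // x urem C == (x < C ? x : x - C) whenever C has its sign bit set,
  // because x / C is then either 0 or 1.
  for (unsigned c = 128; c <= 255; ++c)   // i8 divisors with the sign bit set
    for (unsigned x = 0; x <= 255; ++x) {
      uint8_t rem = (uint8_t)(x % c);
      uint8_t sel = (uint8_t)(x < c ? x : x - c);
      assert(rem == sel);
    }
  std::puts("urem-by-large-constant select identity holds for i8");
  return 0;
}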
@@ -1456,7 +1533,7 @@ Instruction *InstCombiner::visitSRem(BinaryOperator &I) {
if (Value *V = SimplifyVectorOp(I))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifySRemInst(Op0, Op1, DL, TLI, DT, AC))
+ if (Value *V = SimplifySRemInst(Op0, Op1, DL, &TLI, &DT, &AC))
return replaceInstUsesWith(I, V);
// Handle the integer rem common cases
@@ -1532,7 +1609,7 @@ Instruction *InstCombiner::visitFRem(BinaryOperator &I) {
return replaceInstUsesWith(I, V);
if (Value *V = SimplifyFRemInst(Op0, Op1, I.getFastMathFlags(),
- DL, TLI, DT, AC))
+ DL, &TLI, &DT, &AC))
return replaceInstUsesWith(I, V);
// Handle cases involving: rem X, (select Cond, Y, Z)
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
index 79a4912..4cbffe9 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -18,11 +18,27 @@
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/IR/DebugInfo.h"
using namespace llvm;
using namespace llvm::PatternMatch;
#define DEBUG_TYPE "instcombine"
+/// The PHI arguments will be folded into a single operation with a PHI node
+/// as input. The debug location of the single operation will be the merged
+/// locations of the original PHI node arguments.
+DebugLoc InstCombiner::PHIArgMergedDebugLoc(PHINode &PN) {
+ auto *FirstInst = cast<Instruction>(PN.getIncomingValue(0));
+ const DILocation *Loc = FirstInst->getDebugLoc();
+
+ for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) {
+ auto *I = cast<Instruction>(PN.getIncomingValue(i));
+ Loc = DILocation::getMergedLocation(Loc, I->getDebugLoc());
+ }
+
+ return Loc;
+}
+
/// If we have something like phi [add (a,b), add(a,c)] and if a/b/c and the
/// adds all have a single use, turn this into a phi and a single binop.
Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) {
@@ -101,7 +117,7 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) {
if (CmpInst *CIOp = dyn_cast<CmpInst>(FirstInst)) {
CmpInst *NewCI = CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(),
LHSVal, RHSVal);
- NewCI->setDebugLoc(FirstInst->getDebugLoc());
+ NewCI->setDebugLoc(PHIArgMergedDebugLoc(PN));
return NewCI;
}
@@ -114,7 +130,7 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) {
for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i)
NewBinOp->andIRFlags(PN.getIncomingValue(i));
- NewBinOp->setDebugLoc(FirstInst->getDebugLoc());
+ NewBinOp->setDebugLoc(PHIArgMergedDebugLoc(PN));
return NewBinOp;
}
@@ -223,7 +239,7 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) {
GetElementPtrInst::Create(FirstInst->getSourceElementType(), Base,
makeArrayRef(FixedOperands).slice(1));
if (AllInBounds) NewGEP->setIsInBounds();
- NewGEP->setDebugLoc(FirstInst->getDebugLoc());
+ NewGEP->setDebugLoc(PHIArgMergedDebugLoc(PN));
return NewGEP;
}
@@ -383,7 +399,7 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) {
for (Value *IncValue : PN.incoming_values())
cast<LoadInst>(IncValue)->setVolatile(false);
- NewLI->setDebugLoc(FirstLI->getDebugLoc());
+ NewLI->setDebugLoc(PHIArgMergedDebugLoc(PN));
return NewLI;
}
@@ -549,7 +565,7 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
if (CastInst *FirstCI = dyn_cast<CastInst>(FirstInst)) {
CastInst *NewCI = CastInst::Create(FirstCI->getOpcode(), PhiVal,
PN.getType());
- NewCI->setDebugLoc(FirstInst->getDebugLoc());
+ NewCI->setDebugLoc(PHIArgMergedDebugLoc(PN));
return NewCI;
}
@@ -560,14 +576,14 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i)
BinOp->andIRFlags(PN.getIncomingValue(i));
- BinOp->setDebugLoc(FirstInst->getDebugLoc());
+ BinOp->setDebugLoc(PHIArgMergedDebugLoc(PN));
return BinOp;
}
CmpInst *CIOp = cast<CmpInst>(FirstInst);
CmpInst *NewCI = CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(),
PhiVal, ConstantOp);
- NewCI->setDebugLoc(FirstInst->getDebugLoc());
+ NewCI->setDebugLoc(PHIArgMergedDebugLoc(PN));
return NewCI;
}
@@ -835,8 +851,8 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
// needed piece.
if (PHINode *OldInVal = dyn_cast<PHINode>(PN->getIncomingValue(i)))
if (PHIsInspected.count(OldInVal)) {
- unsigned RefPHIId = std::find(PHIsToSlice.begin(),PHIsToSlice.end(),
- OldInVal)-PHIsToSlice.begin();
+ unsigned RefPHIId =
+ find(PHIsToSlice, OldInVal) - PHIsToSlice.begin();
PHIUsers.push_back(PHIUsageRecord(RefPHIId, Offset,
cast<Instruction>(Res)));
++UserE;
@@ -864,7 +880,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
// PHINode simplification
//
Instruction *InstCombiner::visitPHINode(PHINode &PN) {
- if (Value *V = SimplifyInstruction(&PN, DL, TLI, DT, AC))
+ if (Value *V = SimplifyInstruction(&PN, DL, &TLI, &DT, &AC))
return replaceInstUsesWith(PN, V);
if (Instruction *Result = FoldPHIArgZextsIntoPHI(PN))
@@ -921,7 +937,7 @@ Instruction *InstCombiner::visitPHINode(PHINode &PN) {
for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
Instruction *CtxI = PN.getIncomingBlock(i)->getTerminator();
Value *VA = PN.getIncomingValue(i);
- if (isKnownNonZero(VA, DL, 0, AC, CtxI, DT)) {
+ if (isKnownNonZero(VA, DL, 0, &AC, CtxI, &DT)) {
if (!NonZeroConst)
NonZeroConst = GetAnyNonZeroConstInt(PN);
PN.setIncomingValue(i, NonZeroConst);
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 8f1ff8a..3664484 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -15,6 +15,7 @@
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace PatternMatch;
@@ -78,7 +79,7 @@ static Value *generateMinMaxSelectPattern(InstCombiner::BuilderTy *Builder,
/// a bitmask indicating which operands of this instruction are foldable if they
/// equal the other incoming value of the select.
///
-static unsigned GetSelectFoldableOperands(Instruction *I) {
+static unsigned getSelectFoldableOperands(Instruction *I) {
switch (I->getOpcode()) {
case Instruction::Add:
case Instruction::Mul:
@@ -98,7 +99,7 @@ static unsigned GetSelectFoldableOperands(Instruction *I) {
/// For the same transformation as the previous function, return the identity
/// constant that goes into the select.
-static Constant *GetSelectFoldableConstant(Instruction *I) {
+static Constant *getSelectFoldableConstant(Instruction *I) {
switch (I->getOpcode()) {
default: llvm_unreachable("This cannot happen!");
case Instruction::Add:
@@ -117,7 +118,7 @@ static Constant *GetSelectFoldableConstant(Instruction *I) {
}
/// We have (select c, TI, FI), and we know that TI and FI have the same opcode.
-Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI,
+Instruction *InstCombiner::foldSelectOpOp(SelectInst &SI, Instruction *TI,
Instruction *FI) {
// If this is a cast from the same type, merge.
if (TI->getNumOperands() == 1 && TI->isCast()) {
@@ -154,19 +155,19 @@ Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI,
}
// Fold this by inserting a select from the input values.
- Value *NewSI = Builder->CreateSelect(SI.getCondition(), TI->getOperand(0),
- FI->getOperand(0), SI.getName()+".v");
+ Value *NewSI =
+ Builder->CreateSelect(SI.getCondition(), TI->getOperand(0),
+ FI->getOperand(0), SI.getName() + ".v", &SI);
return CastInst::Create(Instruction::CastOps(TI->getOpcode()), NewSI,
TI->getType());
}
- // TODO: This function ends awkwardly in unreachable - fix to be more normal.
-
// Only handle binary operators with one-use here. As with the cast case
// above, it may be possible to relax the one-use constraint, but that needs
// be examined carefully since it may not reduce the total number of
// instructions.
- if (!isa<BinaryOperator>(TI) || !TI->hasOneUse() || !FI->hasOneUse())
+ BinaryOperator *BO = dyn_cast<BinaryOperator>(TI);
+ if (!BO || !TI->hasOneUse() || !FI->hasOneUse())
return nullptr;
// Figure out if the operations have any operands in common.
@@ -199,16 +200,11 @@ Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI,
}
// If we reach here, they do have operations in common.
- Value *NewSI = Builder->CreateSelect(SI.getCondition(), OtherOpT,
- OtherOpF, SI.getName()+".v");
-
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(TI)) {
- if (MatchIsOpZero)
- return BinaryOperator::Create(BO->getOpcode(), MatchOp, NewSI);
- else
- return BinaryOperator::Create(BO->getOpcode(), NewSI, MatchOp);
- }
- llvm_unreachable("Shouldn't get here");
+ Value *NewSI = Builder->CreateSelect(SI.getCondition(), OtherOpT, OtherOpF,
+ SI.getName() + ".v", &SI);
+ Value *Op0 = MatchIsOpZero ? MatchOp : NewSI;
+ Value *Op1 = MatchIsOpZero ? NewSI : MatchOp;
+ return BinaryOperator::Create(BO->getOpcode(), Op0, Op1);
}
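The rewritten tail of foldSelectOpOp still implements the same shape: a select of two single-use binary operators that share an operand becomes one select feeding one binary operator. A hedged C++ analogue of the arithmetic being exploited (the helper names below are illustrative, not taken from the pass):

#include <cassert>
#include <cstdio>

// select c, (a + x), (a + y)  ==>  a + (select c, x, y)
// One binary operator instead of two; the shared operand 'a' is hoisted.
static int selectOfAdds(bool c, int a, int x, int y) { return c ? (a + x) : (a + y); }
static int addOfSelect(bool c, int a, int x, int y) { return a + (c ? x : y); }

int main() {
  for (int a = -3; a <= 3; ++a)
    for (int x = -3; x <= 3; ++x)
      for (int y = -3; y <= 3; ++y)
        for (int c = 0; c <= 1; ++c)
          assert(selectOfAdds(c, a, x, y) == addOfSelect(c, a, x, y));
  std::puts("select of binops with a shared operand folds to one binop");
  return 0;
}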
static bool isSelect01(Constant *C1, Constant *C2) {
@@ -226,14 +222,14 @@ static bool isSelect01(Constant *C1, Constant *C2) {
/// Try to fold the select into one of the operands to allow further
/// optimization.
-Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal,
+Instruction *InstCombiner::foldSelectIntoOp(SelectInst &SI, Value *TrueVal,
Value *FalseVal) {
// See the comment above GetSelectFoldableOperands for a description of the
// transformation we are doing here.
if (Instruction *TVI = dyn_cast<Instruction>(TrueVal)) {
if (TVI->hasOneUse() && TVI->getNumOperands() == 2 &&
!isa<Constant>(FalseVal)) {
- if (unsigned SFO = GetSelectFoldableOperands(TVI)) {
+ if (unsigned SFO = getSelectFoldableOperands(TVI)) {
unsigned OpToFold = 0;
if ((SFO & 1) && FalseVal == TVI->getOperand(0)) {
OpToFold = 1;
@@ -242,7 +238,7 @@ Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal,
}
if (OpToFold) {
- Constant *C = GetSelectFoldableConstant(TVI);
+ Constant *C = getSelectFoldableConstant(TVI);
Value *OOp = TVI->getOperand(2-OpToFold);
// Avoid creating select between 2 constants unless it's selecting
// between 0, 1 and -1.
@@ -263,7 +259,7 @@ Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal,
if (Instruction *FVI = dyn_cast<Instruction>(FalseVal)) {
if (FVI->hasOneUse() && FVI->getNumOperands() == 2 &&
!isa<Constant>(TrueVal)) {
- if (unsigned SFO = GetSelectFoldableOperands(FVI)) {
+ if (unsigned SFO = getSelectFoldableOperands(FVI)) {
unsigned OpToFold = 0;
if ((SFO & 1) && TrueVal == FVI->getOperand(0)) {
OpToFold = 1;
@@ -272,7 +268,7 @@ Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal,
}
if (OpToFold) {
- Constant *C = GetSelectFoldableConstant(FVI);
+ Constant *C = getSelectFoldableConstant(FVI);
Value *OOp = FVI->getOperand(2-OpToFold);
// Avoid creating select between 2 constants unless it's selecting
// between 0, 1 and -1.
@@ -411,103 +407,151 @@ static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal,
return nullptr;
}
+/// Return true if we find and adjust an icmp+select pattern where the compare
+/// is with a constant that can be incremented or decremented to match the
+/// minimum or maximum idiom.
+static bool adjustMinMax(SelectInst &Sel, ICmpInst &Cmp) {
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ Value *CmpLHS = Cmp.getOperand(0);
+ Value *CmpRHS = Cmp.getOperand(1);
+ Value *TrueVal = Sel.getTrueValue();
+ Value *FalseVal = Sel.getFalseValue();
+
+ // We may move or edit the compare, so make sure the select is the only user.
+ const APInt *CmpC;
+ if (!Cmp.hasOneUse() || !match(CmpRHS, m_APInt(CmpC)))
+ return false;
+
+ // These transforms only work for selects of integers or vector selects of
+ // integer vectors.
+ Type *SelTy = Sel.getType();
+ auto *SelEltTy = dyn_cast<IntegerType>(SelTy->getScalarType());
+ if (!SelEltTy || SelTy->isVectorTy() != Cmp.getType()->isVectorTy())
+ return false;
+
+ Constant *AdjustedRHS;
+ if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_SGT)
+ AdjustedRHS = ConstantInt::get(CmpRHS->getType(), *CmpC + 1);
+ else if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_SLT)
+ AdjustedRHS = ConstantInt::get(CmpRHS->getType(), *CmpC - 1);
+ else
+ return false;
+
+ // X > C ? X : C+1 --> X < C+1 ? C+1 : X
+ // X < C ? X : C-1 --> X > C-1 ? C-1 : X
+ if ((CmpLHS == TrueVal && AdjustedRHS == FalseVal) ||
+ (CmpLHS == FalseVal && AdjustedRHS == TrueVal)) {
+ ; // Nothing to do here. Values match without any sign/zero extension.
+ }
+ // Types do not match. Instead of calculating this with mixed types, promote
+ // all to the larger type. This enables scalar evolution to analyze this
+ // expression.
+ else if (CmpRHS->getType()->getScalarSizeInBits() < SelEltTy->getBitWidth()) {
+ Constant *SextRHS = ConstantExpr::getSExt(AdjustedRHS, SelTy);
+
+ // X = sext x; x >s c ? X : C+1 --> X = sext x; X <s C+1 ? C+1 : X
+ // X = sext x; x <s c ? X : C-1 --> X = sext x; X >s C-1 ? C-1 : X
+ // X = sext x; x >u c ? X : C+1 --> X = sext x; X <u C+1 ? C+1 : X
+ // X = sext x; x <u c ? X : C-1 --> X = sext x; X >u C-1 ? C-1 : X
+ if (match(TrueVal, m_SExt(m_Specific(CmpLHS))) && SextRHS == FalseVal) {
+ CmpLHS = TrueVal;
+ AdjustedRHS = SextRHS;
+ } else if (match(FalseVal, m_SExt(m_Specific(CmpLHS))) &&
+ SextRHS == TrueVal) {
+ CmpLHS = FalseVal;
+ AdjustedRHS = SextRHS;
+ } else if (Cmp.isUnsigned()) {
+ Constant *ZextRHS = ConstantExpr::getZExt(AdjustedRHS, SelTy);
+ // X = zext x; x >u c ? X : C+1 --> X = zext x; X <u C+1 ? C+1 : X
+ // X = zext x; x <u c ? X : C-1 --> X = zext x; X >u C-1 ? C-1 : X
+ // zext + signed compare cannot be changed:
+ // 0xff <s 0x00, but 0x00ff >s 0x0000
+ if (match(TrueVal, m_ZExt(m_Specific(CmpLHS))) && ZextRHS == FalseVal) {
+ CmpLHS = TrueVal;
+ AdjustedRHS = ZextRHS;
+ } else if (match(FalseVal, m_ZExt(m_Specific(CmpLHS))) &&
+ ZextRHS == TrueVal) {
+ CmpLHS = FalseVal;
+ AdjustedRHS = ZextRHS;
+ } else {
+ return false;
+ }
+ } else {
+ return false;
+ }
+ } else {
+ return false;
+ }
+
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ CmpRHS = AdjustedRHS;
+ std::swap(FalseVal, TrueVal);
+ Cmp.setPredicate(Pred);
+ Cmp.setOperand(0, CmpLHS);
+ Cmp.setOperand(1, CmpRHS);
+ Sel.setOperand(1, TrueVal);
+ Sel.setOperand(2, FalseVal);
+ Sel.swapProfMetadata();
+
+ // Move the compare instruction right before the select instruction. Otherwise
+ // the sext/zext value may be defined after the compare instruction uses it.
+ Cmp.moveBefore(&Sel);
+
+ return true;
+}
+
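adjustMinMax nudges the compare constant by one so that the select operands line up with the compare operands and the min/max idiom becomes visible. The core rewrite, X > C ? X : C+1 becoming X < C+1 ? C+1 : X, is an identity as long as the incremented (or decremented) constant does not wrap; a brute-force C++ check of the signed-greater-than case at 8 bits under that assumption:

#include <cassert>
#include <cstdio>

int main() {
  // X > C ? X : C+1  ==>  X < C+1 ? C+1 : X   (both compute max(X, C+1)),
  // verified for all i8 values, assuming C+1 does not overflow.
  for (int c = -128; c <= 126; ++c)
    for (int x = -128; x <= 127; ++x) {
      int original = x > c ? x : c + 1;
      int adjusted = x < c + 1 ? c + 1 : x;
      assert(original == adjusted);
    }
  std::puts("adjusted min/max compare-constant identity holds for i8");
  return 0;
}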
+/// If this is an integer min/max where the select's 'true' operand is a
+/// constant, canonicalize that constant to the 'false' operand:
+/// select (icmp Pred X, C), C, X --> select (icmp Pred' X, C), X, C
+static Instruction *
+canonicalizeMinMaxWithConstant(SelectInst &Sel, ICmpInst &Cmp,
+ InstCombiner::BuilderTy &Builder) {
+ // TODO: We should also canonicalize min/max when the select has a different
+ // constant value than the cmp constant, but we need to fix the backend first.
+ if (!Cmp.hasOneUse() || !isa<Constant>(Cmp.getOperand(1)) ||
+ !isa<Constant>(Sel.getTrueValue()) ||
+ isa<Constant>(Sel.getFalseValue()) ||
+ Cmp.getOperand(1) != Sel.getTrueValue())
+ return nullptr;
+
+ // Canonicalize the compare predicate based on whether we have min or max.
+ Value *LHS, *RHS;
+ ICmpInst::Predicate NewPred;
+ SelectPatternResult SPR = matchSelectPattern(&Sel, LHS, RHS);
+ switch (SPR.Flavor) {
+ case SPF_SMIN: NewPred = ICmpInst::ICMP_SLT; break;
+ case SPF_UMIN: NewPred = ICmpInst::ICMP_ULT; break;
+ case SPF_SMAX: NewPred = ICmpInst::ICMP_SGT; break;
+ case SPF_UMAX: NewPred = ICmpInst::ICMP_UGT; break;
+ default: return nullptr;
+ }
+
+ // Canonicalize the constant to the right side.
+ if (isa<Constant>(LHS))
+ std::swap(LHS, RHS);
+
+ Value *NewCmp = Builder.CreateICmp(NewPred, LHS, RHS);
+ SelectInst *NewSel = SelectInst::Create(NewCmp, LHS, RHS, "", nullptr, &Sel);
+
+ // We swapped the select operands, so swap the metadata too.
+ NewSel->swapProfMetadata();
+ return NewSel;
+}
+
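canonicalizeMinMaxWithConstant only reorders the select: when the 'true' operand is the compare constant, the operands are swapped and the predicate flipped so the constant ends up on the 'false' side, which keeps downstream min/max matching simple. A short C++ check that the reordering preserves the computed value, using the smin case as the example:

#include <cassert>
#include <cstdio>

int main() {
  // select (icmp sgt X, C), C, X  ==>  select (icmp slt X, C), X, C
  // Both compute smin(X, C); the canonical form keeps the constant on the
  // false side of the select.
  for (int c = -8; c <= 8; ++c)
    for (int x = -8; x <= 8; ++x) {
      int noncanonical = (x > c) ? c : x;
      int canonical    = (x < c) ? x : c;
      assert(noncanonical == canonical);
    }
  std::puts("min/max constant canonicalization preserves the value");
  return 0;
}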
/// Visit a SelectInst that has an ICmpInst as its first operand.
-Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI,
- ICmpInst *ICI) {
- bool Changed = false;
+Instruction *InstCombiner::foldSelectInstWithICmp(SelectInst &SI,
+ ICmpInst *ICI) {
+ if (Instruction *NewSel = canonicalizeMinMaxWithConstant(SI, *ICI, *Builder))
+ return NewSel;
+
+ bool Changed = adjustMinMax(SI, *ICI);
+
ICmpInst::Predicate Pred = ICI->getPredicate();
Value *CmpLHS = ICI->getOperand(0);
Value *CmpRHS = ICI->getOperand(1);
Value *TrueVal = SI.getTrueValue();
Value *FalseVal = SI.getFalseValue();
- // Check cases where the comparison is with a constant that
- // can be adjusted to fit the min/max idiom. We may move or edit ICI
- // here, so make sure the select is the only user.
- if (ICI->hasOneUse())
- if (ConstantInt *CI = dyn_cast<ConstantInt>(CmpRHS)) {
- switch (Pred) {
- default: break;
- case ICmpInst::ICMP_ULT:
- case ICmpInst::ICMP_SLT:
- case ICmpInst::ICMP_UGT:
- case ICmpInst::ICMP_SGT: {
- // These transformations only work for selects over integers.
- IntegerType *SelectTy = dyn_cast<IntegerType>(SI.getType());
- if (!SelectTy)
- break;
-
- Constant *AdjustedRHS;
- if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_SGT)
- AdjustedRHS = ConstantInt::get(CI->getContext(), CI->getValue() + 1);
- else // (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_SLT)
- AdjustedRHS = ConstantInt::get(CI->getContext(), CI->getValue() - 1);
-
- // X > C ? X : C+1 --> X < C+1 ? C+1 : X
- // X < C ? X : C-1 --> X > C-1 ? C-1 : X
- if ((CmpLHS == TrueVal && AdjustedRHS == FalseVal) ||
- (CmpLHS == FalseVal && AdjustedRHS == TrueVal))
- ; // Nothing to do here. Values match without any sign/zero extension.
-
- // Types do not match. Instead of calculating this with mixed types
- // promote all to the larger type. This enables scalar evolution to
- // analyze this expression.
- else if (CmpRHS->getType()->getScalarSizeInBits()
- < SelectTy->getBitWidth()) {
- Constant *sextRHS = ConstantExpr::getSExt(AdjustedRHS, SelectTy);
-
- // X = sext x; x >s c ? X : C+1 --> X = sext x; X <s C+1 ? C+1 : X
- // X = sext x; x <s c ? X : C-1 --> X = sext x; X >s C-1 ? C-1 : X
- // X = sext x; x >u c ? X : C+1 --> X = sext x; X <u C+1 ? C+1 : X
- // X = sext x; x <u c ? X : C-1 --> X = sext x; X >u C-1 ? C-1 : X
- if (match(TrueVal, m_SExt(m_Specific(CmpLHS))) &&
- sextRHS == FalseVal) {
- CmpLHS = TrueVal;
- AdjustedRHS = sextRHS;
- } else if (match(FalseVal, m_SExt(m_Specific(CmpLHS))) &&
- sextRHS == TrueVal) {
- CmpLHS = FalseVal;
- AdjustedRHS = sextRHS;
- } else if (ICI->isUnsigned()) {
- Constant *zextRHS = ConstantExpr::getZExt(AdjustedRHS, SelectTy);
- // X = zext x; x >u c ? X : C+1 --> X = zext x; X <u C+1 ? C+1 : X
- // X = zext x; x <u c ? X : C-1 --> X = zext x; X >u C-1 ? C-1 : X
- // zext + signed compare cannot be changed:
- // 0xff <s 0x00, but 0x00ff >s 0x0000
- if (match(TrueVal, m_ZExt(m_Specific(CmpLHS))) &&
- zextRHS == FalseVal) {
- CmpLHS = TrueVal;
- AdjustedRHS = zextRHS;
- } else if (match(FalseVal, m_ZExt(m_Specific(CmpLHS))) &&
- zextRHS == TrueVal) {
- CmpLHS = FalseVal;
- AdjustedRHS = zextRHS;
- } else
- break;
- } else
- break;
- } else
- break;
-
- Pred = ICmpInst::getSwappedPredicate(Pred);
- CmpRHS = AdjustedRHS;
- std::swap(FalseVal, TrueVal);
- ICI->setPredicate(Pred);
- ICI->setOperand(0, CmpLHS);
- ICI->setOperand(1, CmpRHS);
- SI.setOperand(1, TrueVal);
- SI.setOperand(2, FalseVal);
-
- // Move ICI instruction right before the select instruction. Otherwise
- // the sext/zext value may be defined after the ICI instruction uses it.
- ICI->moveBefore(&SI);
-
- Changed = true;
- break;
- }
- }
- }
-
// Transform (X >s -1) ? C1 : C2 --> ((X >>s 31) & (C2 - C1)) + C1
// and (X <s 0) ? C2 : C1 --> ((X >>s 31) & (C2 - C1)) + C1
// FIXME: Type and constness constraints could be lifted, but we have to
@@ -623,7 +667,7 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI,
///
/// because Y is not live in BB1/BB2.
///
-static bool CanSelectOperandBeMappingIntoPredBlock(const Value *V,
+static bool canSelectOperandBeMappingIntoPredBlock(const Value *V,
const SelectInst &SI) {
// If the value is a non-instruction value like a constant or argument, it
// can always be mapped.
@@ -651,7 +695,7 @@ static bool CanSelectOperandBeMappingIntoPredBlock(const Value *V,
/// We have an SPF (e.g. a min or max) of an SPF of the form:
/// SPF2(SPF1(A, B), C)
-Instruction *InstCombiner::FoldSPFofSPF(Instruction *Inner,
+Instruction *InstCombiner::foldSPFofSPF(Instruction *Inner,
SelectPatternFlavor SPF1,
Value *A, Value *B,
Instruction &Outer,
@@ -675,28 +719,24 @@ Instruction *InstCombiner::FoldSPFofSPF(Instruction *Inner,
}
if (SPF1 == SPF2) {
- if (ConstantInt *CB = dyn_cast<ConstantInt>(B)) {
- if (ConstantInt *CC = dyn_cast<ConstantInt>(C)) {
- const APInt &ACB = CB->getValue();
- const APInt &ACC = CC->getValue();
-
- // MIN(MIN(A, 23), 97) -> MIN(A, 23)
- // MAX(MAX(A, 97), 23) -> MAX(A, 97)
- if ((SPF1 == SPF_UMIN && ACB.ule(ACC)) ||
- (SPF1 == SPF_SMIN && ACB.sle(ACC)) ||
- (SPF1 == SPF_UMAX && ACB.uge(ACC)) ||
- (SPF1 == SPF_SMAX && ACB.sge(ACC)))
- return replaceInstUsesWith(Outer, Inner);
-
- // MIN(MIN(A, 97), 23) -> MIN(A, 23)
- // MAX(MAX(A, 23), 97) -> MAX(A, 97)
- if ((SPF1 == SPF_UMIN && ACB.ugt(ACC)) ||
- (SPF1 == SPF_SMIN && ACB.sgt(ACC)) ||
- (SPF1 == SPF_UMAX && ACB.ult(ACC)) ||
- (SPF1 == SPF_SMAX && ACB.slt(ACC))) {
- Outer.replaceUsesOfWith(Inner, A);
- return &Outer;
- }
+ const APInt *CB, *CC;
+ if (match(B, m_APInt(CB)) && match(C, m_APInt(CC))) {
+ // MIN(MIN(A, 23), 97) -> MIN(A, 23)
+ // MAX(MAX(A, 97), 23) -> MAX(A, 97)
+ if ((SPF1 == SPF_UMIN && CB->ule(*CC)) ||
+ (SPF1 == SPF_SMIN && CB->sle(*CC)) ||
+ (SPF1 == SPF_UMAX && CB->uge(*CC)) ||
+ (SPF1 == SPF_SMAX && CB->sge(*CC)))
+ return replaceInstUsesWith(Outer, Inner);
+
+ // MIN(MIN(A, 97), 23) -> MIN(A, 23)
+ // MAX(MAX(A, 23), 97) -> MAX(A, 97)
+ if ((SPF1 == SPF_UMIN && CB->ugt(*CC)) ||
+ (SPF1 == SPF_SMIN && CB->sgt(*CC)) ||
+ (SPF1 == SPF_UMAX && CB->ult(*CC)) ||
+ (SPF1 == SPF_SMAX && CB->slt(*CC))) {
+ Outer.replaceUsesOfWith(Inner, A);
+ return &Outer;
}
}
}
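Both constant-folding rules in this hunk follow from ordinary min/max algebra and can be checked with a small standalone C++ program (the values and the use of std::min are illustrative only; the pass itself reasons about signed/unsigned flavors via APInt):

#include <algorithm>
#include <cassert>
#include <climits>

int main() {
  const int As[] = {INT_MIN, -5, 0, 23, 50, 97, 200, INT_MAX};
  for (int A : As) {
    assert(std::min(std::min(A, 23), 97) == std::min(A, 23)); // MIN(MIN(A, 23), 97) -> MIN(A, 23)
    assert(std::min(std::min(A, 97), 23) == std::min(A, 23)); // MIN(MIN(A, 97), 23) -> MIN(A, 23)
  }
  return 0;
}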
@@ -712,8 +752,9 @@ Instruction *InstCombiner::FoldSPFofSPF(Instruction *Inner,
if ((SPF1 == SPF_ABS && SPF2 == SPF_NABS) ||
(SPF1 == SPF_NABS && SPF2 == SPF_ABS)) {
SelectInst *SI = cast<SelectInst>(Inner);
- Value *NewSI = Builder->CreateSelect(
- SI->getCondition(), SI->getFalseValue(), SI->getTrueValue());
+ Value *NewSI =
+ Builder->CreateSelect(SI->getCondition(), SI->getFalseValue(),
+ SI->getTrueValue(), SI->getName(), SI);
return replaceInstUsesWith(Outer, NewSI);
}
@@ -895,7 +936,7 @@ static Instruction *foldAddSubSelect(SelectInst &SI,
if (AddOp != TI)
std::swap(NewTrueOp, NewFalseOp);
Value *NewSel = Builder.CreateSelect(CondVal, NewTrueOp, NewFalseOp,
- SI.getName() + ".p");
+ SI.getName() + ".p", &SI);
if (SI.getType()->isFPOrFPVectorTy()) {
Instruction *RI =
@@ -912,6 +953,147 @@ static Instruction *foldAddSubSelect(SelectInst &SI,
return nullptr;
}
+Instruction *InstCombiner::foldSelectExtConst(SelectInst &Sel) {
+ Instruction *ExtInst;
+ if (!match(Sel.getTrueValue(), m_Instruction(ExtInst)) &&
+ !match(Sel.getFalseValue(), m_Instruction(ExtInst)))
+ return nullptr;
+
+ auto ExtOpcode = ExtInst->getOpcode();
+ if (ExtOpcode != Instruction::ZExt && ExtOpcode != Instruction::SExt)
+ return nullptr;
+
+ // TODO: Handle larger types? That requires adjusting FoldOpIntoSelect too.
+ Value *X = ExtInst->getOperand(0);
+ Type *SmallType = X->getType();
+ if (!SmallType->getScalarType()->isIntegerTy(1))
+ return nullptr;
+
+ Constant *C;
+ if (!match(Sel.getTrueValue(), m_Constant(C)) &&
+ !match(Sel.getFalseValue(), m_Constant(C)))
+ return nullptr;
+
+ // If the constant is the same after truncation to the smaller type and
+ // extension to the original type, we can narrow the select.
+ Value *Cond = Sel.getCondition();
+ Type *SelType = Sel.getType();
+ Constant *TruncC = ConstantExpr::getTrunc(C, SmallType);
+ Constant *ExtC = ConstantExpr::getCast(ExtOpcode, TruncC, SelType);
+ if (ExtC == C) {
+ Value *TruncCVal = cast<Value>(TruncC);
+ if (ExtInst == Sel.getFalseValue())
+ std::swap(X, TruncCVal);
+
+ // select Cond, (ext X), C --> ext(select Cond, X, C')
+ // select Cond, C, (ext X) --> ext(select Cond, C', X)
+ Value *NewSel = Builder->CreateSelect(Cond, X, TruncCVal, "narrow", &Sel);
+ return CastInst::Create(Instruction::CastOps(ExtOpcode), NewSel, SelType);
+ }
+
+ // If one arm of the select is the extend of the condition, replace that arm
+ // with the extension of the appropriate known bool value.
+ if (Cond == X) {
+ if (ExtInst == Sel.getTrueValue()) {
+ // select X, (sext X), C --> select X, -1, C
+ // select X, (zext X), C --> select X, 1, C
+ Constant *One = ConstantInt::getTrue(SmallType);
+ Constant *AllOnesOrOne = ConstantExpr::getCast(ExtOpcode, One, SelType);
+ return SelectInst::Create(Cond, AllOnesOrOne, C, "", nullptr, &Sel);
+ } else {
+ // select X, C, (sext X) --> select X, C, 0
+ // select X, C, (zext X) --> select X, C, 0
+ Constant *Zero = ConstantInt::getNullValue(SelType);
+ return SelectInst::Create(Cond, C, Zero, "", nullptr, &Sel);
+ }
+ }
+
+ return nullptr;
+}
+
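The narrowing fold above hinges on the select constant surviving a trunc/ext round trip. A rough scalar analogue in standalone C++ (bool stands in for i1, the constant 1 plays the role of a zext-compatible C, and all names are illustrative):

#include <cassert>

int main() {
  const int C = 1; // trunc(1) is the i1 value 'true'; zext('true') is 1 again
  for (int c = 0; c != 2; ++c) {
    for (int x = 0; x != 2; ++x) {
      bool Cond = (c != 0), X = (x != 0);
      int Wide   = Cond ? static_cast<int>(X) : C;    // select Cond, (zext X), C
      int Narrow = static_cast<int>(Cond ? X : true); // zext (select Cond, X, trunc C)
      assert(Wide == Narrow);
    }
  }
  return 0;
}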
+/// Try to transform a vector select with a constant condition vector into a
+/// shuffle for easier combining with other shuffles and insert/extract.
+static Instruction *canonicalizeSelectToShuffle(SelectInst &SI) {
+ Value *CondVal = SI.getCondition();
+ Constant *CondC;
+ if (!CondVal->getType()->isVectorTy() || !match(CondVal, m_Constant(CondC)))
+ return nullptr;
+
+ unsigned NumElts = CondVal->getType()->getVectorNumElements();
+ SmallVector<Constant *, 16> Mask;
+ Mask.reserve(NumElts);
+ Type *Int32Ty = Type::getInt32Ty(CondVal->getContext());
+ for (unsigned i = 0; i != NumElts; ++i) {
+ Constant *Elt = CondC->getAggregateElement(i);
+ if (!Elt)
+ return nullptr;
+
+ if (Elt->isOneValue()) {
+ // If the select condition element is true, choose from the 1st vector.
+ Mask.push_back(ConstantInt::get(Int32Ty, i));
+ } else if (Elt->isNullValue()) {
+ // If the select condition element is false, choose from the 2nd vector.
+ Mask.push_back(ConstantInt::get(Int32Ty, i + NumElts));
+ } else if (isa<UndefValue>(Elt)) {
+ // If the select condition element is undef, the shuffle mask is undef.
+ Mask.push_back(UndefValue::get(Int32Ty));
+ } else {
+ // Bail out on a constant expression.
+ return nullptr;
+ }
+ }
+
+ return new ShuffleVectorInst(SI.getTrueValue(), SI.getFalseValue(),
+ ConstantVector::get(Mask));
+}
+
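The mask construction used above can be mimicked with plain arrays. This standalone C++ sketch (the lane count and values are illustrative) builds the mask by the same rule and checks that indexing the concatenation of the two operands reproduces the vector select:

#include <array>
#include <cassert>

int main() {
  constexpr unsigned N = 4;
  const std::array<bool, N> Cond = {true, false, false, true};
  const std::array<int, N> A = {10, 11, 12, 13}, B = {20, 21, 22, 23};

  std::array<unsigned, N> Mask;
  for (unsigned i = 0; i != N; ++i)
    Mask[i] = Cond[i] ? i : i + N;          // same rule as the loop above

  std::array<int, 2 * N> Concat;            // a shuffle reads from A ++ B
  for (unsigned i = 0; i != N; ++i) { Concat[i] = A[i]; Concat[i + N] = B[i]; }

  for (unsigned i = 0; i != N; ++i)
    assert(Concat[Mask[i]] == (Cond[i] ? A[i] : B[i]));
  return 0;
}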
+/// Reuse bitcasted operands between a compare and select:
+/// select (cmp (bitcast C), (bitcast D)), (bitcast' C), (bitcast' D) -->
+/// bitcast (select (cmp (bitcast C), (bitcast D)), (bitcast C), (bitcast D))
+static Instruction *foldSelectCmpBitcasts(SelectInst &Sel,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Cond = Sel.getCondition();
+ Value *TVal = Sel.getTrueValue();
+ Value *FVal = Sel.getFalseValue();
+
+ CmpInst::Predicate Pred;
+ Value *A, *B;
+ if (!match(Cond, m_Cmp(Pred, m_Value(A), m_Value(B))))
+ return nullptr;
+
+ // The select condition is a compare instruction. If the select's true/false
+ // values are already the same as the compare operands, there's nothing to do.
+ if (TVal == A || TVal == B || FVal == A || FVal == B)
+ return nullptr;
+
+ Value *C, *D;
+ if (!match(A, m_BitCast(m_Value(C))) || !match(B, m_BitCast(m_Value(D))))
+ return nullptr;
+
+ // select (cmp (bitcast C), (bitcast D)), (bitcast TSrc), (bitcast FSrc)
+ Value *TSrc, *FSrc;
+ if (!match(TVal, m_BitCast(m_Value(TSrc))) ||
+ !match(FVal, m_BitCast(m_Value(FSrc))))
+ return nullptr;
+
+ // If the select true/false values are *different bitcasts* of the same source
+ // operands, make the select operands the same as the compare operands and
+ // cast the result. This is the canonical select form for min/max.
+ Value *NewSel;
+ if (TSrc == C && FSrc == D) {
+ // select (cmp (bitcast C), (bitcast D)), (bitcast' C), (bitcast' D) -->
+ // bitcast (select (cmp A, B), A, B)
+ NewSel = Builder.CreateSelect(Cond, A, B, "", &Sel);
+ } else if (TSrc == D && FSrc == C) {
+ // select (cmp (bitcast C), (bitcast D)), (bitcast' D), (bitcast' C) -->
+ // bitcast (select (cmp A, B), B, A)
+ NewSel = Builder.CreateSelect(Cond, B, A, "", &Sel);
+ } else {
+ return nullptr;
+ }
+ return CastInst::CreateBitOrPointerCast(NewSel, Sel.getType());
+}
+
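Because a bitcast only reinterprets bits, selecting between re-casts of the compare operands is the same as selecting the compare operands themselves and casting the result once. A scalar C++ sketch of that idea, assuming C++20 for std::bit_cast (the float/int pairing and values are illustrative):

#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  std::uint32_t C = 0x3f800000u, D = 0x40000000u;   // raw bits of 1.0f and 2.0f
  float A = std::bit_cast<float>(C), B = std::bit_cast<float>(D);

  bool Cmp = A < B;                                 // cmp (bitcast C), (bitcast D)
  std::uint32_t Before = Cmp ? C : D;               // select ..., (bitcast' C), (bitcast' D)
  std::uint32_t After = std::bit_cast<std::uint32_t>(Cmp ? A : B); // bitcast (select ..., A, B)
  assert(Before == After);
  return 0;
}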
Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
Value *CondVal = SI.getCondition();
Value *TrueVal = SI.getTrueValue();
@@ -919,9 +1101,12 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
Type *SelType = SI.getType();
if (Value *V =
- SimplifySelectInst(CondVal, TrueVal, FalseVal, DL, TLI, DT, AC))
+ SimplifySelectInst(CondVal, TrueVal, FalseVal, DL, &TLI, &DT, &AC))
return replaceInstUsesWith(SI, V);
+ if (Instruction *I = canonicalizeSelectToShuffle(SI))
+ return I;
+
if (SelType->getScalarType()->isIntegerTy(1) &&
TrueVal->getType() == CondVal->getType()) {
if (match(TrueVal, m_One())) {
@@ -1085,7 +1270,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
// See if we are selecting two values based on a comparison of the two values.
if (ICmpInst *ICI = dyn_cast<ICmpInst>(CondVal))
- if (Instruction *Result = visitSelectInstWithICmp(SI, ICI))
+ if (Instruction *Result = foldSelectInstWithICmp(SI, ICI))
return Result;
if (Instruction *Add = foldAddSubSelect(SI, *Builder))
@@ -1095,12 +1280,15 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
auto *TI = dyn_cast<Instruction>(TrueVal);
auto *FI = dyn_cast<Instruction>(FalseVal);
if (TI && FI && TI->getOpcode() == FI->getOpcode())
- if (Instruction *IV = FoldSelectOpOp(SI, TI, FI))
+ if (Instruction *IV = foldSelectOpOp(SI, TI, FI))
return IV;
+ if (Instruction *I = foldSelectExtConst(SI))
+ return I;
+
// See if we can fold the select into one of our operands.
if (SelType->isIntOrIntVectorTy() || SelType->isFPOrFPVectorTy()) {
- if (Instruction *FoldI = FoldSelectIntoOp(SI, TrueVal, FalseVal))
+ if (Instruction *FoldI = foldSelectIntoOp(SI, TrueVal, FalseVal))
return FoldI;
Value *LHS, *RHS, *LHS2, *RHS2;
@@ -1124,9 +1312,9 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
Cmp = Builder->CreateFCmp(Pred, LHS, RHS);
}
- Value *NewSI = Builder->CreateCast(CastOp,
- Builder->CreateSelect(Cmp, LHS, RHS),
- SelType);
+ Value *NewSI = Builder->CreateCast(
+ CastOp, Builder->CreateSelect(Cmp, LHS, RHS, SI.getName(), &SI),
+ SelType);
return replaceInstUsesWith(SI, NewSI);
}
}
@@ -1139,39 +1327,35 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
// ABS(ABS(a)) -> ABS(a)
// NABS(NABS(a)) -> NABS(a)
if (SelectPatternFlavor SPF2 = matchSelectPattern(LHS, LHS2, RHS2).Flavor)
- if (Instruction *R = FoldSPFofSPF(cast<Instruction>(LHS),SPF2,LHS2,RHS2,
+ if (Instruction *R = foldSPFofSPF(cast<Instruction>(LHS),SPF2,LHS2,RHS2,
SI, SPF, RHS))
return R;
if (SelectPatternFlavor SPF2 = matchSelectPattern(RHS, LHS2, RHS2).Flavor)
- if (Instruction *R = FoldSPFofSPF(cast<Instruction>(RHS),SPF2,LHS2,RHS2,
+ if (Instruction *R = foldSPFofSPF(cast<Instruction>(RHS),SPF2,LHS2,RHS2,
SI, SPF, LHS))
return R;
}
// MAX(~a, ~b) -> ~MIN(a, b)
- if (SPF == SPF_SMAX || SPF == SPF_UMAX) {
- if (IsFreeToInvert(LHS, LHS->hasNUses(2)) &&
- IsFreeToInvert(RHS, RHS->hasNUses(2))) {
-
- // This transform adds a xor operation and that extra cost needs to be
- // justified. We look for simplifications that will result from
- // applying this rule:
-
- bool Profitable =
- (LHS->hasNUses(2) && match(LHS, m_Not(m_Value()))) ||
- (RHS->hasNUses(2) && match(RHS, m_Not(m_Value()))) ||
- (SI.hasOneUse() && match(*SI.user_begin(), m_Not(m_Value())));
-
- if (Profitable) {
- Value *NewLHS = Builder->CreateNot(LHS);
- Value *NewRHS = Builder->CreateNot(RHS);
- Value *NewCmp = SPF == SPF_SMAX
- ? Builder->CreateICmpSLT(NewLHS, NewRHS)
- : Builder->CreateICmpULT(NewLHS, NewRHS);
- Value *NewSI =
- Builder->CreateNot(Builder->CreateSelect(NewCmp, NewLHS, NewRHS));
- return replaceInstUsesWith(SI, NewSI);
- }
+ if ((SPF == SPF_SMAX || SPF == SPF_UMAX) &&
+ IsFreeToInvert(LHS, LHS->hasNUses(2)) &&
+ IsFreeToInvert(RHS, RHS->hasNUses(2))) {
+ // For this transform to be profitable, we need to eliminate at least two
+ // 'not' instructions if we're going to add one 'not' instruction.
+ int NumberOfNots =
+ (LHS->hasNUses(2) && match(LHS, m_Not(m_Value()))) +
+ (RHS->hasNUses(2) && match(RHS, m_Not(m_Value()))) +
+ (SI.hasOneUse() && match(*SI.user_begin(), m_Not(m_Value())));
+
+ if (NumberOfNots >= 2) {
+ Value *NewLHS = Builder->CreateNot(LHS);
+ Value *NewRHS = Builder->CreateNot(RHS);
+ Value *NewCmp = SPF == SPF_SMAX
+ ? Builder->CreateICmpSLT(NewLHS, NewRHS)
+ : Builder->CreateICmpULT(NewLHS, NewRHS);
+ Value *NewSI =
+ Builder->CreateNot(Builder->CreateSelect(NewCmp, NewLHS, NewRHS));
+ return replaceInstUsesWith(SI, NewSI);
}
}
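The rewrite above leans on the identity max(~a, ~b) == ~min(a, b) (likewise for the unsigned flavor), which a quick standalone C++ check makes concrete (values are illustrative):

#include <algorithm>
#include <cassert>

int main() {
  const int As[] = {-7, -1, 0, 3, 1000};
  const int Bs[] = {-9, 0, 2, 512};
  for (int a : As)
    for (int b : Bs)
      assert(std::max(~a, ~b) == ~std::min(a, b)); // MAX(~a, ~b) == ~MIN(a, b)
  return 0;
}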
@@ -1182,8 +1366,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
// See if we can fold the select into a phi node if the condition is a select.
if (isa<PHINode>(SI.getCondition()))
// The true/false values have to be live in the PHI predecessor's blocks.
- if (CanSelectOperandBeMappingIntoPredBlock(TrueVal, SI) &&
- CanSelectOperandBeMappingIntoPredBlock(FalseVal, SI))
+ if (canSelectOperandBeMappingIntoPredBlock(TrueVal, SI) &&
+ canSelectOperandBeMappingIntoPredBlock(FalseVal, SI))
if (Instruction *NV = FoldOpIntoPhi(SI))
return NV;
@@ -1233,7 +1417,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
return &SI;
}
- if (VectorType* VecTy = dyn_cast<VectorType>(SelType)) {
+ if (VectorType *VecTy = dyn_cast<VectorType>(SelType)) {
unsigned VWidth = VecTy->getNumElements();
APInt UndefElts(VWidth, 0);
APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
@@ -1266,5 +1450,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
}
}
+ if (Instruction *BitCastSel = foldSelectCmpBitcasts(SI, *Builder))
+ return BitCastSel;
+
return nullptr;
}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 08e16a7..4ff9b64 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -39,10 +39,19 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) {
if (Instruction *Res = FoldShiftByConstant(Op0, CUI, I))
return Res;
+ // (C1 shift (A add C2)) -> ((C1 shift C2) shift A)
+ // iff A and C2 are both non-negative.
+ Value *A;
+ Constant *C;
+ if (match(Op0, m_Constant()) && match(Op1, m_Add(m_Value(A), m_Constant(C))))
+ if (isKnownNonNegative(A, DL) && isKnownNonNegative(C, DL))
+ return BinaryOperator::Create(
+ I.getOpcode(), Builder->CreateBinOp(I.getOpcode(), Op0, C), A);
+
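For non-negative operands with an in-range total shift amount, the reassociation added here is a plain arithmetic fact, as this standalone C++ check illustrates (unsigned is used to sidestep undefined behavior; the constants are illustrative):

#include <cassert>
#include <cstdint>

int main() {
  const std::uint32_t C1 = 0x0000ABCDu;
  const std::uint32_t C2s[] = {0, 1, 4};
  const std::uint32_t As[] = {0, 2, 7};
  for (std::uint32_t C2 : C2s)
    for (std::uint32_t A : As) {
      assert((C1 << (A + C2)) == ((C1 << C2) << A)); // shl case
      assert((C1 >> (A + C2)) == ((C1 >> C2) >> A)); // lshr case
    }
  return 0;
}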
// X shift (A srem B) -> X shift (A and B-1) iff B is a power of 2.
// Because shifts by negative values (which could occur if A were negative)
// are undefined.
- Value *A; const APInt *B;
+ const APInt *B;
if (Op1->hasOneUse() && match(Op1, m_SRem(m_Value(A), m_Power2(B)))) {
// FIXME: Should this get moved into SimplifyDemandedBits by saying we don't
// demand the sign bit (and many others) here??
@@ -194,8 +203,10 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
else
V = IC.Builder->CreateLShr(C, NumBits);
// If we got a constantexpr back, try to simplify it with TD info.
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
- V = ConstantFoldConstantExpression(CE, DL, IC.getTargetLibraryInfo());
+ if (auto *C = dyn_cast<Constant>(V))
+ if (auto *FoldedC =
+ ConstantFoldConstant(C, DL, &IC.getTargetLibraryInfo()))
+ V = FoldedC;
return V;
}
@@ -317,7 +328,167 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
}
}
+/// Try to fold (X << C1) << C2, where the shifts are some combination of
+/// shl/ashr/lshr.
+static Instruction *
+foldShiftByConstOfShiftByConst(BinaryOperator &I, ConstantInt *COp1,
+ InstCombiner::BuilderTy *Builder) {
+ Value *Op0 = I.getOperand(0);
+ uint32_t TypeBits = Op0->getType()->getScalarSizeInBits();
+
+ // Find out if this is a shift of a shift by a constant.
+ BinaryOperator *ShiftOp = dyn_cast<BinaryOperator>(Op0);
+ if (ShiftOp && !ShiftOp->isShift())
+ ShiftOp = nullptr;
+
+ if (ShiftOp && isa<ConstantInt>(ShiftOp->getOperand(1))) {
+
+ // This is a constant shift of a constant shift. Be careful about hiding
+ // shl instructions behind bit masks. They are used to represent multiplies
+ // by a constant, and it is important that simple arithmetic expressions
+ // are still recognizable by scalar evolution.
+ //
+ // The transforms applied to shl are very similar to the transforms applied
+ // to mul by constant. We can be more aggressive about optimizing right
+ // shifts.
+ //
+ // Combinations of right and left shifts will still be optimized in
+ // DAGCombine where scalar evolution no longer applies.
+
+ ConstantInt *ShiftAmt1C = cast<ConstantInt>(ShiftOp->getOperand(1));
+ uint32_t ShiftAmt1 = ShiftAmt1C->getLimitedValue(TypeBits);
+ uint32_t ShiftAmt2 = COp1->getLimitedValue(TypeBits);
+ assert(ShiftAmt2 != 0 && "Should have been simplified earlier");
+ if (ShiftAmt1 == 0)
+ return nullptr; // Will be simplified in the future.
+ Value *X = ShiftOp->getOperand(0);
+
+ IntegerType *Ty = cast<IntegerType>(I.getType());
+
+ // Check for (X << c1) << c2 and (X >> c1) >> c2
+ if (I.getOpcode() == ShiftOp->getOpcode()) {
+ uint32_t AmtSum = ShiftAmt1 + ShiftAmt2; // Fold into one big shift.
+ // If this is an oversized composite shift, then unsigned shifts become
+ // zero (handled in InstSimplify) and ashr saturates.
+ if (AmtSum >= TypeBits) {
+ if (I.getOpcode() != Instruction::AShr)
+ return nullptr;
+ AmtSum = TypeBits - 1; // Saturate to 31 for i32 ashr.
+ }
+
+ return BinaryOperator::Create(I.getOpcode(), X,
+ ConstantInt::get(Ty, AmtSum));
+ }
+
+ if (ShiftAmt1 == ShiftAmt2) {
+ // If we have ((X << C) >>u C), turn this into X & (-1 >>u C).
+ if (I.getOpcode() == Instruction::LShr &&
+ ShiftOp->getOpcode() == Instruction::Shl) {
+ APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt1));
+ return BinaryOperator::CreateAnd(
+ X, ConstantInt::get(I.getContext(), Mask));
+ }
+ } else if (ShiftAmt1 < ShiftAmt2) {
+ uint32_t ShiftDiff = ShiftAmt2 - ShiftAmt1;
+
+ // (X >>?,exact C1) << C2 --> X << (C2-C1)
+ // The inexact version is deferred to DAGCombine so we don't hide shl
+ // behind a bit mask.
+ if (I.getOpcode() == Instruction::Shl &&
+ ShiftOp->getOpcode() != Instruction::Shl && ShiftOp->isExact()) {
+ assert(ShiftOp->getOpcode() == Instruction::LShr ||
+ ShiftOp->getOpcode() == Instruction::AShr);
+ ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
+ BinaryOperator *NewShl =
+ BinaryOperator::Create(Instruction::Shl, X, ShiftDiffCst);
+ NewShl->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
+ NewShl->setHasNoSignedWrap(I.hasNoSignedWrap());
+ return NewShl;
+ }
+ // (X << C1) >>u C2 --> X >>u (C2-C1) & (-1 >> C2)
+ if (I.getOpcode() == Instruction::LShr &&
+ ShiftOp->getOpcode() == Instruction::Shl) {
+ ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
+ // (X <<nuw C1) >>u C2 --> X >>u (C2-C1)
+ if (ShiftOp->hasNoUnsignedWrap()) {
+ BinaryOperator *NewLShr =
+ BinaryOperator::Create(Instruction::LShr, X, ShiftDiffCst);
+ NewLShr->setIsExact(I.isExact());
+ return NewLShr;
+ }
+ Value *Shift = Builder->CreateLShr(X, ShiftDiffCst);
+
+ APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2));
+ return BinaryOperator::CreateAnd(
+ Shift, ConstantInt::get(I.getContext(), Mask));
+ }
+
+ // We can't handle (X << C1) >>s C2; it shifts arbitrary bits in. However,
+ // we can handle (X <<nsw C1) >>s C2 since it only shifts in sign bits.
+ if (I.getOpcode() == Instruction::AShr &&
+ ShiftOp->getOpcode() == Instruction::Shl) {
+ if (ShiftOp->hasNoSignedWrap()) {
+ // (X <<nsw C1) >>s C2 --> X >>s (C2-C1)
+ ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
+ BinaryOperator *NewAShr =
+ BinaryOperator::Create(Instruction::AShr, X, ShiftDiffCst);
+ NewAShr->setIsExact(I.isExact());
+ return NewAShr;
+ }
+ }
+ } else {
+ assert(ShiftAmt2 < ShiftAmt1);
+ uint32_t ShiftDiff = ShiftAmt1 - ShiftAmt2;
+
+ // (X >>?exact C1) << C2 --> X >>?exact (C1-C2)
+ // The inexact version is deferred to DAGCombine so we don't hide shl
+ // behind a bit mask.
+ if (I.getOpcode() == Instruction::Shl &&
+ ShiftOp->getOpcode() != Instruction::Shl && ShiftOp->isExact()) {
+ ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
+ BinaryOperator *NewShr =
+ BinaryOperator::Create(ShiftOp->getOpcode(), X, ShiftDiffCst);
+ NewShr->setIsExact(true);
+ return NewShr;
+ }
+
+ // (X << C1) >>u C2 --> X << (C1-C2) & (-1 >> C2)
+ if (I.getOpcode() == Instruction::LShr &&
+ ShiftOp->getOpcode() == Instruction::Shl) {
+ ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
+ if (ShiftOp->hasNoUnsignedWrap()) {
+ // (X <<nuw C1) >>u C2 --> X <<nuw (C1-C2)
+ BinaryOperator *NewShl =
+ BinaryOperator::Create(Instruction::Shl, X, ShiftDiffCst);
+ NewShl->setHasNoUnsignedWrap(true);
+ return NewShl;
+ }
+ Value *Shift = Builder->CreateShl(X, ShiftDiffCst);
+
+ APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2));
+ return BinaryOperator::CreateAnd(
+ Shift, ConstantInt::get(I.getContext(), Mask));
+ }
+
+ // We can't handle (X << C1) >>s C2; it shifts arbitrary bits in. However,
+ // we can handle (X <<nsw C1) >>s C2 since it only shifts in sign bits.
+ if (I.getOpcode() == Instruction::AShr &&
+ ShiftOp->getOpcode() == Instruction::Shl) {
+ if (ShiftOp->hasNoSignedWrap()) {
+ // (X <<nsw C1) >>s C2 --> X <<nsw (C1-C2)
+ ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
+ BinaryOperator *NewShl =
+ BinaryOperator::Create(Instruction::Shl, X, ShiftDiffCst);
+ NewShl->setHasNoSignedWrap(true);
+ return NewShl;
+ }
+ }
+ }
+ }
+
+ return nullptr;
+}
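The three shapes this helper handles correspond to simple bit-level identities. A standalone C++ check on 32-bit unsigned values (constants chosen for illustration) covers the same-opcode, equal-amount, and smaller-left-then-larger-right cases:

#include <cassert>
#include <cstdint>

int main() {
  const std::uint32_t Xs[] = {0u, 1u, 0xDEADBEEFu, 0xFFFFFFFFu};
  for (std::uint32_t X : Xs) {
    assert(((X << 3) << 2) == (X << 5));                        // (X << C1) << C2 -> X << (C1+C2)
    assert(((X << 3) >> 3) == (X & (0xFFFFFFFFu >> 3)));        // ((X << C) >>u C) -> X & (-1 >>u C)
    assert(((X << 3) >> 5) == ((X >> 2) & (0xFFFFFFFFu >> 5))); // C1 < C2: shift plus mask
  }
  return 0;
}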
Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
BinaryOperator &I) {
@@ -359,13 +530,8 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
return BinaryOperator::CreateMul(BO->getOperand(0),
ConstantExpr::getShl(BOOp, Op1));
- // Try to fold constant and into select arguments.
- if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
- if (Instruction *R = FoldOpIntoSelect(I, SI))
- return R;
- if (isa<PHINode>(Op0))
- if (Instruction *NV = FoldOpIntoPhi(I))
- return NV;
+ if (Instruction *FoldedShift = foldOpWithConstantIntoOperand(I))
+ return FoldedShift;
// Fold shift2(trunc(shift1(x,c1)), c2) -> trunc(shift2(shift1(x,c1),c2))
if (TruncInst *TI = dyn_cast<TruncInst>(Op0)) {
@@ -455,9 +621,9 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
V1->getName()+".mask");
return BinaryOperator::Create(Op0BO->getOpcode(), YS, XM);
}
+ LLVM_FALLTHROUGH;
}
- // FALL THROUGH.
case Instruction::Sub: {
// Turn ((X >> C) + Y) << C -> (X + (Y << C)) & (~0 << C)
if (isLeftShift && Op0BO->getOperand(0)->hasOneUse() &&
@@ -539,157 +705,9 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
}
}
- // Find out if this is a shift of a shift by a constant.
- BinaryOperator *ShiftOp = dyn_cast<BinaryOperator>(Op0);
- if (ShiftOp && !ShiftOp->isShift())
- ShiftOp = nullptr;
-
- if (ShiftOp && isa<ConstantInt>(ShiftOp->getOperand(1))) {
-
- // This is a constant shift of a constant shift. Be careful about hiding
- // shl instructions behind bit masks. They are used to represent multiplies
- // by a constant, and it is important that simple arithmetic expressions
- // are still recognizable by scalar evolution.
- //
- // The transforms applied to shl are very similar to the transforms applied
- // to mul by constant. We can be more aggressive about optimizing right
- // shifts.
- //
- // Combinations of right and left shifts will still be optimized in
- // DAGCombine where scalar evolution no longer applies.
-
- ConstantInt *ShiftAmt1C = cast<ConstantInt>(ShiftOp->getOperand(1));
- uint32_t ShiftAmt1 = ShiftAmt1C->getLimitedValue(TypeBits);
- uint32_t ShiftAmt2 = COp1->getLimitedValue(TypeBits);
- assert(ShiftAmt2 != 0 && "Should have been simplified earlier");
- if (ShiftAmt1 == 0) return nullptr; // Will be simplified in the future.
- Value *X = ShiftOp->getOperand(0);
-
- IntegerType *Ty = cast<IntegerType>(I.getType());
-
- // Check for (X << c1) << c2 and (X >> c1) >> c2
- if (I.getOpcode() == ShiftOp->getOpcode()) {
- uint32_t AmtSum = ShiftAmt1+ShiftAmt2; // Fold into one big shift.
- // If this is oversized composite shift, then unsigned shifts get 0, ashr
- // saturates.
- if (AmtSum >= TypeBits) {
- if (I.getOpcode() != Instruction::AShr)
- return replaceInstUsesWith(I, Constant::getNullValue(I.getType()));
- AmtSum = TypeBits-1; // Saturate to 31 for i32 ashr.
- }
-
- return BinaryOperator::Create(I.getOpcode(), X,
- ConstantInt::get(Ty, AmtSum));
- }
-
- if (ShiftAmt1 == ShiftAmt2) {
- // If we have ((X << C) >>u C), turn this into X & (-1 >>u C).
- if (I.getOpcode() == Instruction::LShr &&
- ShiftOp->getOpcode() == Instruction::Shl) {
- APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt1));
- return BinaryOperator::CreateAnd(X,
- ConstantInt::get(I.getContext(), Mask));
- }
- } else if (ShiftAmt1 < ShiftAmt2) {
- uint32_t ShiftDiff = ShiftAmt2-ShiftAmt1;
-
- // (X >>?,exact C1) << C2 --> X << (C2-C1)
- // The inexact version is deferred to DAGCombine so we don't hide shl
- // behind a bit mask.
- if (I.getOpcode() == Instruction::Shl &&
- ShiftOp->getOpcode() != Instruction::Shl &&
- ShiftOp->isExact()) {
- assert(ShiftOp->getOpcode() == Instruction::LShr ||
- ShiftOp->getOpcode() == Instruction::AShr);
- ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
- BinaryOperator *NewShl = BinaryOperator::Create(Instruction::Shl,
- X, ShiftDiffCst);
- NewShl->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
- NewShl->setHasNoSignedWrap(I.hasNoSignedWrap());
- return NewShl;
- }
-
- // (X << C1) >>u C2 --> X >>u (C2-C1) & (-1 >> C2)
- if (I.getOpcode() == Instruction::LShr &&
- ShiftOp->getOpcode() == Instruction::Shl) {
- ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
- // (X <<nuw C1) >>u C2 --> X >>u (C2-C1)
- if (ShiftOp->hasNoUnsignedWrap()) {
- BinaryOperator *NewLShr = BinaryOperator::Create(Instruction::LShr,
- X, ShiftDiffCst);
- NewLShr->setIsExact(I.isExact());
- return NewLShr;
- }
- Value *Shift = Builder->CreateLShr(X, ShiftDiffCst);
-
- APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2));
- return BinaryOperator::CreateAnd(Shift,
- ConstantInt::get(I.getContext(),Mask));
- }
-
- // We can't handle (X << C1) >>s C2, it shifts arbitrary bits in. However,
- // we can handle (X <<nsw C1) >>s C2 since it only shifts in sign bits.
- if (I.getOpcode() == Instruction::AShr &&
- ShiftOp->getOpcode() == Instruction::Shl) {
- if (ShiftOp->hasNoSignedWrap()) {
- // (X <<nsw C1) >>s C2 --> X >>s (C2-C1)
- ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
- BinaryOperator *NewAShr = BinaryOperator::Create(Instruction::AShr,
- X, ShiftDiffCst);
- NewAShr->setIsExact(I.isExact());
- return NewAShr;
- }
- }
- } else {
- assert(ShiftAmt2 < ShiftAmt1);
- uint32_t ShiftDiff = ShiftAmt1-ShiftAmt2;
-
- // (X >>?exact C1) << C2 --> X >>?exact (C1-C2)
- // The inexact version is deferred to DAGCombine so we don't hide shl
- // behind a bit mask.
- if (I.getOpcode() == Instruction::Shl &&
- ShiftOp->getOpcode() != Instruction::Shl &&
- ShiftOp->isExact()) {
- ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
- BinaryOperator *NewShr = BinaryOperator::Create(ShiftOp->getOpcode(),
- X, ShiftDiffCst);
- NewShr->setIsExact(true);
- return NewShr;
- }
+ if (Instruction *Folded = foldShiftByConstOfShiftByConst(I, COp1, Builder))
+ return Folded;
- // (X << C1) >>u C2 --> X << (C1-C2) & (-1 >> C2)
- if (I.getOpcode() == Instruction::LShr &&
- ShiftOp->getOpcode() == Instruction::Shl) {
- ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
- if (ShiftOp->hasNoUnsignedWrap()) {
- // (X <<nuw C1) >>u C2 --> X <<nuw (C1-C2)
- BinaryOperator *NewShl = BinaryOperator::Create(Instruction::Shl,
- X, ShiftDiffCst);
- NewShl->setHasNoUnsignedWrap(true);
- return NewShl;
- }
- Value *Shift = Builder->CreateShl(X, ShiftDiffCst);
-
- APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2));
- return BinaryOperator::CreateAnd(Shift,
- ConstantInt::get(I.getContext(),Mask));
- }
-
- // We can't handle (X << C1) >>s C2, it shifts arbitrary bits in. However,
- // we can handle (X <<nsw C1) >>s C2 since it only shifts in sign bits.
- if (I.getOpcode() == Instruction::AShr &&
- ShiftOp->getOpcode() == Instruction::Shl) {
- if (ShiftOp->hasNoSignedWrap()) {
- // (X <<nsw C1) >>s C2 --> X <<nsw (C1-C2)
- ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
- BinaryOperator *NewShl = BinaryOperator::Create(Instruction::Shl,
- X, ShiftDiffCst);
- NewShl->setHasNoSignedWrap(true);
- return NewShl;
- }
- }
- }
- }
return nullptr;
}
@@ -699,7 +717,7 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) {
if (Value *V =
SimplifyShlInst(I.getOperand(0), I.getOperand(1), I.hasNoSignedWrap(),
- I.hasNoUnsignedWrap(), DL, TLI, DT, AC))
+ I.hasNoUnsignedWrap(), DL, &TLI, &DT, &AC))
return replaceInstUsesWith(I, V);
if (Instruction *V = commonShiftTransforms(I))
@@ -708,6 +726,25 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) {
if (ConstantInt *Op1C = dyn_cast<ConstantInt>(I.getOperand(1))) {
unsigned ShAmt = Op1C->getZExtValue();
+ // Turn:
+ // %zext = zext i32 %V to i64
+ // %res = shl i64 %zext, 8
+ //
+ // Into:
+ // %shl = shl i32 %V, 8
+ // %res = zext i32 %shl to i64
+ //
+ // This is only valid if %V would have zeros shifted out.
+ if (auto *ZI = dyn_cast<ZExtInst>(I.getOperand(0))) {
+ unsigned SrcBitWidth = ZI->getSrcTy()->getScalarSizeInBits();
+ if (ShAmt < SrcBitWidth &&
+ MaskedValueIsZero(ZI->getOperand(0),
+ APInt::getHighBitsSet(SrcBitWidth, ShAmt), 0, &I)) {
+ auto *Shl = Builder->CreateShl(ZI->getOperand(0), ShAmt);
+ return new ZExtInst(Shl, I.getType());
+ }
+ }
+
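The legality condition spelled out in the comment can be demonstrated with ordinary integer code: when the bits that would be shifted out of the narrow type are already zero, shifting before or after the widening produces the same value (the sample inputs below, all with their high 8 bits clear, are illustrative):

#include <cassert>
#include <cstdint>

int main() {
  const std::uint32_t Vs[] = {0x00000001u, 0x00ABCDEFu, 0x00FFFFFFu};
  for (std::uint32_t V : Vs) {
    std::uint64_t WideThenShift  = static_cast<std::uint64_t>(V) << 8; // shl (zext V), 8
    std::uint64_t ShiftThenWiden = static_cast<std::uint64_t>(V << 8); // zext (shl V, 8)
    assert(WideThenShift == ShiftThenWiden);
  }
  return 0;
}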
// If the shifted-out value is known-zero, then this is a NUW shift.
if (!I.hasNoUnsignedWrap() &&
MaskedValueIsZero(I.getOperand(0),
@@ -740,7 +777,7 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
return replaceInstUsesWith(I, V);
if (Value *V = SimplifyLShrInst(I.getOperand(0), I.getOperand(1), I.isExact(),
- DL, TLI, DT, AC))
+ DL, &TLI, &DT, &AC))
return replaceInstUsesWith(I, V);
if (Instruction *R = commonShiftTransforms(I))
@@ -784,7 +821,7 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) {
return replaceInstUsesWith(I, V);
if (Value *V = SimplifyAShrInst(I.getOperand(0), I.getOperand(1), I.isExact(),
- DL, TLI, DT, AC))
+ DL, &TLI, &DT, &AC))
return replaceInstUsesWith(I, V);
if (Instruction *R = commonShiftTransforms(I))
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index f3268d2..8b930bd 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -981,6 +981,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
bool MadeChange = false;
APInt UndefElts2(VWidth, 0);
+ APInt UndefElts3(VWidth, 0);
Value *TmpV;
switch (I->getOpcode()) {
default: break;
@@ -1020,8 +1021,8 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
}
case Instruction::ShuffleVector: {
ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(I);
- uint64_t LHSVWidth =
- cast<VectorType>(Shuffle->getOperand(0)->getType())->getNumElements();
+ unsigned LHSVWidth =
+ Shuffle->getOperand(0)->getType()->getVectorNumElements();
APInt LeftDemanded(LHSVWidth, 0), RightDemanded(LHSVWidth, 0);
for (unsigned i = 0; i < VWidth; i++) {
if (DemandedElts[i]) {
@@ -1037,17 +1038,21 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
}
}
- APInt UndefElts4(LHSVWidth, 0);
+ APInt LHSUndefElts(LHSVWidth, 0);
TmpV = SimplifyDemandedVectorElts(I->getOperand(0), LeftDemanded,
- UndefElts4, Depth + 1);
+ LHSUndefElts, Depth + 1);
if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
- APInt UndefElts3(LHSVWidth, 0);
+ APInt RHSUndefElts(LHSVWidth, 0);
TmpV = SimplifyDemandedVectorElts(I->getOperand(1), RightDemanded,
- UndefElts3, Depth + 1);
+ RHSUndefElts, Depth + 1);
if (TmpV) { I->setOperand(1, TmpV); MadeChange = true; }
bool NewUndefElts = false;
+ unsigned LHSIdx = -1u, LHSValIdx = -1u;
+ unsigned RHSIdx = -1u, RHSValIdx = -1u;
+ bool LHSUniform = true;
+ bool RHSUniform = true;
for (unsigned i = 0; i < VWidth; i++) {
unsigned MaskVal = Shuffle->getMaskValue(i);
if (MaskVal == -1u) {
@@ -1056,18 +1061,59 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
NewUndefElts = true;
UndefElts.setBit(i);
} else if (MaskVal < LHSVWidth) {
- if (UndefElts4[MaskVal]) {
+ if (LHSUndefElts[MaskVal]) {
NewUndefElts = true;
UndefElts.setBit(i);
+ } else {
+ LHSIdx = LHSIdx == -1u ? i : LHSVWidth;
+ LHSValIdx = LHSValIdx == -1u ? MaskVal : LHSVWidth;
+ LHSUniform = LHSUniform && (MaskVal == i);
}
} else {
- if (UndefElts3[MaskVal - LHSVWidth]) {
+ if (RHSUndefElts[MaskVal - LHSVWidth]) {
NewUndefElts = true;
UndefElts.setBit(i);
+ } else {
+ RHSIdx = RHSIdx == -1u ? i : LHSVWidth;
+ RHSValIdx = RHSValIdx == -1u ? MaskVal - LHSVWidth : LHSVWidth;
+ RHSUniform = RHSUniform && (MaskVal - LHSVWidth == i);
}
}
}
+ // Try to transform a shuffle that uses a constant vector, and takes only a
+ // single element from that constant, into a single insertelement instruction.
+ // shufflevector V, C, <v1, v2, .., ci, .., vm> ->
+ // insertelement V, C[ci], ci-n
+ if (LHSVWidth == Shuffle->getType()->getNumElements()) {
+ Value *Op = nullptr;
+ Constant *Value = nullptr;
+ unsigned Idx = -1u;
+
+ // Find constant vector with the single element in shuffle (LHS or RHS).
+ if (LHSIdx < LHSVWidth && RHSUniform) {
+ if (auto *CV = dyn_cast<ConstantVector>(Shuffle->getOperand(0))) {
+ Op = Shuffle->getOperand(1);
+ Value = CV->getOperand(LHSValIdx);
+ Idx = LHSIdx;
+ }
+ }
+ if (RHSIdx < LHSVWidth && LHSUniform) {
+ if (auto *CV = dyn_cast<ConstantVector>(Shuffle->getOperand(1))) {
+ Op = Shuffle->getOperand(0);
+ Value = CV->getOperand(RHSValIdx);
+ Idx = RHSIdx;
+ }
+ }
+ // Found constant vector with single element - convert to insertelement.
+ if (Op && Value) {
+ Instruction *New = InsertElementInst::Create(
+ Op, Value, ConstantInt::get(Type::getInt32Ty(I->getContext()), Idx),
+ Shuffle->getName());
+ InsertNewInstWith(New, *Shuffle);
+ return New;
+ }
+ }
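At the array level, the rewrite targeted here is just an element overwrite, as this standalone C++ sketch with an illustrative 4-lane example shows:

#include <array>
#include <cassert>

int main() {
  constexpr unsigned N = 4;
  const std::array<int, N> V = {1, 2, 3, 4};
  const std::array<int, N> C = {10, 20, 30, 40};         // constant operand
  const std::array<unsigned, N> Mask = {0, 1, N + 2, 3}; // only lane 2 reads from C

  std::array<int, N> Shuffled;                           // shufflevector V, C, Mask
  for (unsigned i = 0; i != N; ++i)
    Shuffled[i] = Mask[i] < N ? V[Mask[i]] : C[Mask[i] - N];

  std::array<int, N> Inserted = V;                       // insertelement V, C[2], 2
  Inserted[2] = C[2];
  assert(Shuffled == Inserted);
  return 0;
}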
if (NewUndefElts) {
// Add additional discovered undefs.
SmallVector<Constant*, 16> Elts;
@@ -1209,114 +1255,223 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
switch (II->getIntrinsicID()) {
default: break;
+ case Intrinsic::x86_xop_vfrcz_ss:
+ case Intrinsic::x86_xop_vfrcz_sd:
+ // The instructions for these intrinsics are specified to zero the upper
+ // bits rather than pass them through like other scalar intrinsics, so we
+ // shouldn't just use Arg0 if DemandedElts[0] is clear as we do elsewhere.
+ // Instead we should return a zero vector.
+ if (!DemandedElts[0]) {
+ Worklist.Add(II);
+ return ConstantAggregateZero::get(II->getType());
+ }
+
+ // Only the lower element is used.
+ DemandedElts = 1;
+ TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
+ UndefElts, Depth + 1);
+ if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
+
+ // Only the lower element is undefined. The high elements are zero.
+ UndefElts = UndefElts[0];
+ break;
+
// Unary scalar-as-vector operations that work column-wise.
case Intrinsic::x86_sse_rcp_ss:
case Intrinsic::x86_sse_rsqrt_ss:
case Intrinsic::x86_sse_sqrt_ss:
case Intrinsic::x86_sse2_sqrt_sd:
- case Intrinsic::x86_xop_vfrcz_ss:
- case Intrinsic::x86_xop_vfrcz_sd:
TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
UndefElts, Depth + 1);
if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
// If lowest element of a scalar op isn't used then use Arg0.
- if (DemandedElts.getLoBits(1) != 1)
+ if (!DemandedElts[0]) {
+ Worklist.Add(II);
return II->getArgOperand(0);
+ }
// TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions
// checks).
break;
- // Binary scalar-as-vector operations that work column-wise. A dest element
- // is a function of the corresponding input elements from the two inputs.
- case Intrinsic::x86_sse_add_ss:
- case Intrinsic::x86_sse_sub_ss:
- case Intrinsic::x86_sse_mul_ss:
- case Intrinsic::x86_sse_div_ss:
+ // Binary scalar-as-vector operations that work column-wise. The high
+ // elements come from operand 0. The low element is a function of both
+ // operands.
case Intrinsic::x86_sse_min_ss:
case Intrinsic::x86_sse_max_ss:
case Intrinsic::x86_sse_cmp_ss:
- case Intrinsic::x86_sse2_add_sd:
- case Intrinsic::x86_sse2_sub_sd:
- case Intrinsic::x86_sse2_mul_sd:
- case Intrinsic::x86_sse2_div_sd:
case Intrinsic::x86_sse2_min_sd:
case Intrinsic::x86_sse2_max_sd:
- case Intrinsic::x86_sse2_cmp_sd:
- case Intrinsic::x86_sse41_round_ss:
- case Intrinsic::x86_sse41_round_sd:
+ case Intrinsic::x86_sse2_cmp_sd: {
TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
UndefElts, Depth + 1);
if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
+
+ // If lowest element of a scalar op isn't used then use Arg0.
+ if (!DemandedElts[0]) {
+ Worklist.Add(II);
+ return II->getArgOperand(0);
+ }
+
+ // Only lower element is used for operand 1.
+ DemandedElts = 1;
TmpV = SimplifyDemandedVectorElts(II->getArgOperand(1), DemandedElts,
UndefElts2, Depth + 1);
if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; }
- // If only the low elt is demanded and this is a scalarizable intrinsic,
- // scalarize it now.
- if (DemandedElts == 1) {
- switch (II->getIntrinsicID()) {
- default: break;
- case Intrinsic::x86_sse_add_ss:
- case Intrinsic::x86_sse_sub_ss:
- case Intrinsic::x86_sse_mul_ss:
- case Intrinsic::x86_sse_div_ss:
- case Intrinsic::x86_sse2_add_sd:
- case Intrinsic::x86_sse2_sub_sd:
- case Intrinsic::x86_sse2_mul_sd:
- case Intrinsic::x86_sse2_div_sd:
- // TODO: Lower MIN/MAX/etc.
- Value *LHS = II->getArgOperand(0);
- Value *RHS = II->getArgOperand(1);
- // Extract the element as scalars.
- LHS = InsertNewInstWith(ExtractElementInst::Create(LHS,
- ConstantInt::get(Type::getInt32Ty(I->getContext()), 0U)), *II);
- RHS = InsertNewInstWith(ExtractElementInst::Create(RHS,
- ConstantInt::get(Type::getInt32Ty(I->getContext()), 0U)), *II);
-
- switch (II->getIntrinsicID()) {
- default: llvm_unreachable("Case stmts out of sync!");
- case Intrinsic::x86_sse_add_ss:
- case Intrinsic::x86_sse2_add_sd:
- TmpV = InsertNewInstWith(BinaryOperator::CreateFAdd(LHS, RHS,
- II->getName()), *II);
- break;
- case Intrinsic::x86_sse_sub_ss:
- case Intrinsic::x86_sse2_sub_sd:
- TmpV = InsertNewInstWith(BinaryOperator::CreateFSub(LHS, RHS,
- II->getName()), *II);
- break;
- case Intrinsic::x86_sse_mul_ss:
- case Intrinsic::x86_sse2_mul_sd:
- TmpV = InsertNewInstWith(BinaryOperator::CreateFMul(LHS, RHS,
- II->getName()), *II);
- break;
- case Intrinsic::x86_sse_div_ss:
- case Intrinsic::x86_sse2_div_sd:
- TmpV = InsertNewInstWith(BinaryOperator::CreateFDiv(LHS, RHS,
- II->getName()), *II);
- break;
- }
-
- Instruction *New =
- InsertElementInst::Create(
- UndefValue::get(II->getType()), TmpV,
- ConstantInt::get(Type::getInt32Ty(I->getContext()), 0U, false),
- II->getName());
- InsertNewInstWith(New, *II);
- return New;
- }
+ // Lower element is undefined if both lower elements are undefined.
+ // Consider things like undef&0. The result is known zero, not undef.
+ if (!UndefElts2[0])
+ UndefElts.clearBit(0);
+
+ break;
+ }
+
+ // Binary scalar-as-vector operations that work column-wise. The high
+ // elements come from operand 0 and the low element comes from operand 1.
+ case Intrinsic::x86_sse41_round_ss:
+ case Intrinsic::x86_sse41_round_sd: {
+ // Don't use the low element of operand 0.
+ APInt DemandedElts2 = DemandedElts;
+ DemandedElts2.clearBit(0);
+ TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts2,
+ UndefElts, Depth + 1);
+ if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
+
+ // If lowest element of a scalar op isn't used then use Arg0.
+ if (!DemandedElts[0]) {
+ Worklist.Add(II);
+ return II->getArgOperand(0);
}
+ // Only lower element is used for operand 1.
+ DemandedElts = 1;
+ TmpV = SimplifyDemandedVectorElts(II->getArgOperand(1), DemandedElts,
+ UndefElts2, Depth + 1);
+ if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; }
+
+ // Take the high undef elements from operand 0 and take the lower element
+ // from operand 1.
+ UndefElts.clearBit(0);
+ UndefElts |= UndefElts2[0];
+ break;
+ }
+
+ // Three input scalar-as-vector operations that work column-wise. The high
+ // elements come from operand 0 and the low element is a function of all
+ // three inputs.
+ case Intrinsic::x86_avx512_mask_add_ss_round:
+ case Intrinsic::x86_avx512_mask_div_ss_round:
+ case Intrinsic::x86_avx512_mask_mul_ss_round:
+ case Intrinsic::x86_avx512_mask_sub_ss_round:
+ case Intrinsic::x86_avx512_mask_max_ss_round:
+ case Intrinsic::x86_avx512_mask_min_ss_round:
+ case Intrinsic::x86_avx512_mask_add_sd_round:
+ case Intrinsic::x86_avx512_mask_div_sd_round:
+ case Intrinsic::x86_avx512_mask_mul_sd_round:
+ case Intrinsic::x86_avx512_mask_sub_sd_round:
+ case Intrinsic::x86_avx512_mask_max_sd_round:
+ case Intrinsic::x86_avx512_mask_min_sd_round:
+ case Intrinsic::x86_fma_vfmadd_ss:
+ case Intrinsic::x86_fma_vfmsub_ss:
+ case Intrinsic::x86_fma_vfnmadd_ss:
+ case Intrinsic::x86_fma_vfnmsub_ss:
+ case Intrinsic::x86_fma_vfmadd_sd:
+ case Intrinsic::x86_fma_vfmsub_sd:
+ case Intrinsic::x86_fma_vfnmadd_sd:
+ case Intrinsic::x86_fma_vfnmsub_sd:
+ case Intrinsic::x86_avx512_mask_vfmadd_ss:
+ case Intrinsic::x86_avx512_mask_vfmadd_sd:
+ case Intrinsic::x86_avx512_maskz_vfmadd_ss:
+ case Intrinsic::x86_avx512_maskz_vfmadd_sd:
+ TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
+ UndefElts, Depth + 1);
+ if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
+
// If lowest element of a scalar op isn't used then use Arg0.
- if (DemandedElts.getLoBits(1) != 1)
+ if (!DemandedElts[0]) {
+ Worklist.Add(II);
return II->getArgOperand(0);
+ }
+
+ // Only lower element is used for operand 1 and 2.
+ DemandedElts = 1;
+ TmpV = SimplifyDemandedVectorElts(II->getArgOperand(1), DemandedElts,
+ UndefElts2, Depth + 1);
+ if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; }
+ TmpV = SimplifyDemandedVectorElts(II->getArgOperand(2), DemandedElts,
+ UndefElts3, Depth + 1);
+ if (TmpV) { II->setArgOperand(2, TmpV); MadeChange = true; }
+
+ // Lower element is undefined if all three lower elements are undefined.
+ // Consider things like undef&0. The result is known zero, not undef.
+ if (!UndefElts2[0] || !UndefElts3[0])
+ UndefElts.clearBit(0);
- // Output elements are undefined if both are undefined. Consider things
- // like undef&0. The result is known zero, not undef.
- UndefElts &= UndefElts2;
break;
+ case Intrinsic::x86_avx512_mask3_vfmadd_ss:
+ case Intrinsic::x86_avx512_mask3_vfmadd_sd:
+ case Intrinsic::x86_avx512_mask3_vfmsub_ss:
+ case Intrinsic::x86_avx512_mask3_vfmsub_sd:
+ case Intrinsic::x86_avx512_mask3_vfnmsub_ss:
+ case Intrinsic::x86_avx512_mask3_vfnmsub_sd:
+ // These intrinsics get the passthru bits from operand 2.
+ TmpV = SimplifyDemandedVectorElts(II->getArgOperand(2), DemandedElts,
+ UndefElts, Depth + 1);
+ if (TmpV) { II->setArgOperand(2, TmpV); MadeChange = true; }
+
+ // If lowest element of a scalar op isn't used then use Arg2.
+ if (!DemandedElts[0]) {
+ Worklist.Add(II);
+ return II->getArgOperand(2);
+ }
+
+ // Only lower element is used for operand 0 and 1.
+ DemandedElts = 1;
+ TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
+ UndefElts2, Depth + 1);
+ if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
+ TmpV = SimplifyDemandedVectorElts(II->getArgOperand(1), DemandedElts,
+ UndefElts3, Depth + 1);
+ if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; }
+
+ // Lower element is undefined if all three lower elements are undefined.
+ // Consider things like undef&0. The result is known zero, not undef.
+ if (!UndefElts2[0] || !UndefElts3[0])
+ UndefElts.clearBit(0);
+
+ break;
+
+ case Intrinsic::x86_sse2_pmulu_dq:
+ case Intrinsic::x86_sse41_pmuldq:
+ case Intrinsic::x86_avx2_pmul_dq:
+ case Intrinsic::x86_avx2_pmulu_dq:
+ case Intrinsic::x86_avx512_pmul_dq_512:
+ case Intrinsic::x86_avx512_pmulu_dq_512: {
+ Value *Op0 = II->getArgOperand(0);
+ Value *Op1 = II->getArgOperand(1);
+ unsigned InnerVWidth = Op0->getType()->getVectorNumElements();
+ assert((VWidth * 2) == InnerVWidth && "Unexpected input size");
+
+ APInt InnerDemandedElts(InnerVWidth, 0);
+ for (unsigned i = 0; i != VWidth; ++i)
+ if (DemandedElts[i])
+ InnerDemandedElts.setBit(i * 2);
+
+ UndefElts2 = APInt(InnerVWidth, 0);
+ TmpV = SimplifyDemandedVectorElts(Op0, InnerDemandedElts, UndefElts2,
+ Depth + 1);
+ if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
+
+ UndefElts3 = APInt(InnerVWidth, 0);
+ TmpV = SimplifyDemandedVectorElts(Op1, InnerDemandedElts, UndefElts3,
+ Depth + 1);
+ if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; }
+
+ break;
+ }
+
// SSE4A instructions leave the upper 64-bits of the 128-bit result
// in an undefined state.
case Intrinsic::x86_sse4a_extrq:
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index a761387..b2477f6 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -145,7 +145,7 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) {
Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
if (Value *V = SimplifyExtractElementInst(
- EI.getVectorOperand(), EI.getIndexOperand(), DL, TLI, DT, AC))
+ EI.getVectorOperand(), EI.getIndexOperand(), DL, &TLI, &DT, &AC))
return replaceInstUsesWith(EI, V);
// If vector val is constant with all elements the same, replace EI with
@@ -413,6 +413,14 @@ static void replaceExtractElements(InsertElementInst *InsElt,
if (InsertionBlock != InsElt->getParent())
return;
+ // TODO: This restriction matches the check in visitInsertElementInst() and
+ // prevents an infinite loop caused by not turning the extract/insert pair
+ // into a shuffle. We really should not need either check, but we're lacking
+ // folds for shufflevectors because we're afraid to generate shuffle masks
+ // that the backend can't handle.
+ if (InsElt->hasOneUse() && isa<InsertElementInst>(InsElt->user_back()))
+ return;
+
auto *WideVec = new ShuffleVectorInst(ExtVecOp, UndefValue::get(ExtVecType),
ConstantVector::get(ExtendMask));
@@ -452,7 +460,7 @@ static ShuffleOps collectShuffleElements(Value *V,
Value *PermittedRHS,
InstCombiner &IC) {
assert(V->getType()->isVectorTy() && "Invalid shuffle!");
- unsigned NumElts = cast<VectorType>(V->getType())->getNumElements();
+ unsigned NumElts = V->getType()->getVectorNumElements();
if (isa<UndefValue>(V)) {
Mask.assign(NumElts, UndefValue::get(Type::getInt32Ty(V->getContext())));
@@ -566,6 +574,176 @@ Instruction *InstCombiner::visitInsertValueInst(InsertValueInst &I) {
return nullptr;
}
+static bool isShuffleEquivalentToSelect(ShuffleVectorInst &Shuf) {
+ int MaskSize = Shuf.getMask()->getType()->getVectorNumElements();
+ int VecSize = Shuf.getOperand(0)->getType()->getVectorNumElements();
+
+ // A vector select does not change the size of the operands.
+ if (MaskSize != VecSize)
+ return false;
+
+ // Each mask element must be undefined or choose a vector element from one of
+ // the source operands without crossing vector lanes.
+ for (int i = 0; i != MaskSize; ++i) {
+ int Elt = Shuf.getMaskValue(i);
+ if (Elt != -1 && Elt != i && Elt != i + VecSize)
+ return false;
+ }
+
+ return true;
+}
+
+// Turn a chain of inserts that splats a value into a canonical insert + shuffle
+// splat. That is:
+// insertelt(insertelt(insertelt(insertelt X, %k, 0), %k, 1), %k, 2) ... ->
+// shufflevector(insertelt(X, %k, 0), undef, zero)
+static Instruction *foldInsSequenceIntoBroadcast(InsertElementInst &InsElt) {
+ // We are interested in the last insert in a chain. So, if this insert
+ // has a single user, and that user is an insert, bail.
+ if (InsElt.hasOneUse() && isa<InsertElementInst>(InsElt.user_back()))
+ return nullptr;
+
+ VectorType *VT = cast<VectorType>(InsElt.getType());
+ int NumElements = VT->getNumElements();
+
+ // Do not try to do this for a one-element vector, since that's a nop,
+ // and will cause an inf-loop.
+ if (NumElements == 1)
+ return nullptr;
+
+ Value *SplatVal = InsElt.getOperand(1);
+ InsertElementInst *CurrIE = &InsElt;
+ SmallVector<bool, 16> ElementPresent(NumElements, false);
+
+ // Walk the chain backwards, keeping track of which indices we inserted into,
+ // until we hit something that isn't an insert of the splatted value.
+ while (CurrIE) {
+ ConstantInt *Idx = dyn_cast<ConstantInt>(CurrIE->getOperand(2));
+ if (!Idx || CurrIE->getOperand(1) != SplatVal)
+ return nullptr;
+
+ // Check none of the intermediate steps have any additional uses.
+ if ((CurrIE != &InsElt) && !CurrIE->hasOneUse())
+ return nullptr;
+
+ ElementPresent[Idx->getZExtValue()] = true;
+ CurrIE = dyn_cast<InsertElementInst>(CurrIE->getOperand(0));
+ }
+
+ // Make sure we've seen an insert into every element.
+ if (llvm::any_of(ElementPresent, [](bool Present) { return !Present; }))
+ return nullptr;
+
+ // All right, create the insert + shuffle.
+ Instruction *InsertFirst = InsertElementInst::Create(
+ UndefValue::get(VT), SplatVal,
+ ConstantInt::get(Type::getInt32Ty(InsElt.getContext()), 0), "", &InsElt);
+
+ Constant *ZeroMask = ConstantAggregateZero::get(
+ VectorType::get(Type::getInt32Ty(InsElt.getContext()), NumElements));
+
+ return new ShuffleVectorInst(InsertFirst, UndefValue::get(VT), ZeroMask);
+}
+
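At the array level, the splat canonicalization implemented above amounts to one write into lane 0 followed by a broadcast of that lane; an illustrative 4-lane standalone C++ sketch:

#include <array>
#include <cassert>

int main() {
  constexpr unsigned N = 4;
  const int K = 7;                        // the splatted value

  std::array<int, N> Chain{};             // chain of inserts, one per lane
  for (unsigned i = 0; i != N; ++i)
    Chain[i] = K;

  std::array<int, N> First{};             // insertelement undef, K, 0
  First[0] = K;
  std::array<int, N> Splat;               // shufflevector First, undef, zeroinitializer
  for (unsigned i = 0; i != N; ++i)
    Splat[i] = First[0];

  assert(Chain == Splat);
  return 0;
}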
+/// insertelt (shufflevector X, CVec, Mask|insertelt X, C1, CIndex1), C, CIndex
+/// --> shufflevector X, CVec', Mask'
+static Instruction *foldConstantInsEltIntoShuffle(InsertElementInst &InsElt) {
+ auto *Inst = dyn_cast<Instruction>(InsElt.getOperand(0));
+ // Bail out if the parent has more than one use. In that case, we'd be
+ // replacing the insertelt with a shuffle, and that's not a clear win.
+ if (!Inst || !Inst->hasOneUse())
+ return nullptr;
+ if (auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0))) {
+ // The shuffle must have a constant vector operand. The insertelt must have
+ // a constant scalar being inserted at a constant position in the vector.
+ Constant *ShufConstVec, *InsEltScalar;
+ uint64_t InsEltIndex;
+ if (!match(Shuf->getOperand(1), m_Constant(ShufConstVec)) ||
+ !match(InsElt.getOperand(1), m_Constant(InsEltScalar)) ||
+ !match(InsElt.getOperand(2), m_ConstantInt(InsEltIndex)))
+ return nullptr;
+
+ // Adding an element to an arbitrary shuffle could be expensive, but a
+ // shuffle that selects elements from vectors without crossing lanes is
+ // assumed cheap.
+ // If we're just adding a constant into that shuffle, it will still be
+ // cheap.
+ if (!isShuffleEquivalentToSelect(*Shuf))
+ return nullptr;
+
+ // From the above 'select' check, we know that the mask has the same number
+ // of elements as the vector input operands. We also know that each constant
+ // input element is used in its lane and cannot be used more than once by
+ // the shuffle. Therefore, replace the constant in the shuffle's constant
+ // vector with the insertelt constant. Replace the constant in the shuffle's
+ // mask vector with the insertelt index plus the length of the vector
+ // (because the constant vector operand of a shuffle is always the 2nd
+ // operand).
+ Constant *Mask = Shuf->getMask();
+ unsigned NumElts = Mask->getType()->getVectorNumElements();
+ SmallVector<Constant *, 16> NewShufElts(NumElts);
+ SmallVector<Constant *, 16> NewMaskElts(NumElts);
+ for (unsigned I = 0; I != NumElts; ++I) {
+ if (I == InsEltIndex) {
+ NewShufElts[I] = InsEltScalar;
+ Type *Int32Ty = Type::getInt32Ty(Shuf->getContext());
+ NewMaskElts[I] = ConstantInt::get(Int32Ty, InsEltIndex + NumElts);
+ } else {
+ // Copy over the existing values.
+ NewShufElts[I] = ShufConstVec->getAggregateElement(I);
+ NewMaskElts[I] = Mask->getAggregateElement(I);
+ }
+ }
+
+ // Create new operands for a shuffle that includes the constant of the
+ // original insertelt. The old shuffle will be dead now.
+ return new ShuffleVectorInst(Shuf->getOperand(0),
+ ConstantVector::get(NewShufElts),
+ ConstantVector::get(NewMaskElts));
+ } else if (auto *IEI = dyn_cast<InsertElementInst>(Inst)) {
+ // Transform sequences of insertelement ops with constant data/indices into
+ // a single shuffle op.
+ unsigned NumElts = InsElt.getType()->getNumElements();
+
+ uint64_t InsertIdx[2];
+ Constant *Val[2];
+ if (!match(InsElt.getOperand(2), m_ConstantInt(InsertIdx[0])) ||
+ !match(InsElt.getOperand(1), m_Constant(Val[0])) ||
+ !match(IEI->getOperand(2), m_ConstantInt(InsertIdx[1])) ||
+ !match(IEI->getOperand(1), m_Constant(Val[1])))
+ return nullptr;
+ SmallVector<Constant *, 16> Values(NumElts);
+ SmallVector<Constant *, 16> Mask(NumElts);
+ auto ValI = std::begin(Val);
+ // Generate new constant vector and mask.
+ // We have 2 values/masks from the insertelement instructions. Insert them
+ // into new value/mask vectors.
+ for (uint64_t I : InsertIdx) {
+ if (!Values[I]) {
+ assert(!Mask[I]);
+ Values[I] = *ValI;
+ Mask[I] = ConstantInt::get(Type::getInt32Ty(InsElt.getContext()),
+ NumElts + I);
+ }
+ ++ValI;
+ }
+ // Remaining values are filled with 'undef' values.
+ for (unsigned I = 0; I < NumElts; ++I) {
+ if (!Values[I]) {
+ assert(!Mask[I]);
+ Values[I] = UndefValue::get(InsElt.getType()->getElementType());
+ Mask[I] = ConstantInt::get(Type::getInt32Ty(InsElt.getContext()), I);
+ }
+ }
+ // Create new operands for a shuffle that includes the constant of the
+ // original insertelt.
+ return new ShuffleVectorInst(IEI->getOperand(0),
+ ConstantVector::get(Values),
+ ConstantVector::get(Mask));
+ }
+ return nullptr;
+}
+
Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
Value *VecOp = IE.getOperand(0);
Value *ScalarOp = IE.getOperand(1);
@@ -616,7 +794,7 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
}
}
- unsigned VWidth = cast<VectorType>(VecOp->getType())->getNumElements();
+ unsigned VWidth = VecOp->getType()->getVectorNumElements();
APInt UndefElts(VWidth, 0);
APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
if (Value *V = SimplifyDemandedVectorElts(&IE, AllOnesEltMask, UndefElts)) {
@@ -625,6 +803,14 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
return &IE;
}
+ if (Instruction *Shuf = foldConstantInsEltIntoShuffle(IE))
+ return Shuf;
+
+ // Turn a sequence of inserts that broadcasts a scalar into a single
+ // insert + shufflevector.
+ if (Instruction *Broadcast = foldInsSequenceIntoBroadcast(IE))
+ return Broadcast;
+
return nullptr;
}
@@ -903,8 +1089,7 @@ static void recognizeIdentityMask(const SmallVectorImpl<int> &Mask,
// +--+--+--+--+
static bool isShuffleExtractingFromLHS(ShuffleVectorInst &SVI,
SmallVector<int, 16> &Mask) {
- unsigned LHSElems =
- cast<VectorType>(SVI.getOperand(0)->getType())->getNumElements();
+ unsigned LHSElems = SVI.getOperand(0)->getType()->getVectorNumElements();
unsigned MaskElems = Mask.size();
unsigned BegIdx = Mask.front();
unsigned EndIdx = Mask.back();
@@ -928,7 +1113,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
if (isa<UndefValue>(SVI.getOperand(2)))
return replaceInstUsesWith(SVI, UndefValue::get(SVI.getType()));
- unsigned VWidth = cast<VectorType>(SVI.getType())->getNumElements();
+ unsigned VWidth = SVI.getType()->getVectorNumElements();
APInt UndefElts(VWidth, 0);
APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
@@ -940,7 +1125,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
MadeChange = true;
}
- unsigned LHSWidth = cast<VectorType>(LHS->getType())->getNumElements();
+ unsigned LHSWidth = LHS->getType()->getVectorNumElements();
// Canonicalize shuffle(x ,x,mask) -> shuffle(x, undef,mask')
// Canonicalize shuffle(undef,x,mask) -> shuffle(x, undef,mask').
@@ -1143,11 +1328,11 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
if (LHSShuffle) {
LHSOp0 = LHSShuffle->getOperand(0);
LHSOp1 = LHSShuffle->getOperand(1);
- LHSOp0Width = cast<VectorType>(LHSOp0->getType())->getNumElements();
+ LHSOp0Width = LHSOp0->getType()->getVectorNumElements();
}
if (RHSShuffle) {
RHSOp0 = RHSShuffle->getOperand(0);
- RHSOp0Width = cast<VectorType>(RHSOp0->getType())->getNumElements();
+ RHSOp0Width = RHSOp0->getType()->getVectorNumElements();
}
Value* newLHS = LHS;
Value* newRHS = RHS;
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 377ccb9..27fc34d 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -177,11 +177,10 @@ static bool simplifyAssocCastAssoc(BinaryOperator *BinOp1) {
return false;
// TODO: Enhance logic for other BinOps and remove this check.
- auto AssocOpcode = BinOp1->getOpcode();
- if (AssocOpcode != Instruction::Xor && AssocOpcode != Instruction::And &&
- AssocOpcode != Instruction::Or)
+ if (!BinOp1->isBitwiseLogicOp())
return false;
+ auto AssocOpcode = BinOp1->getOpcode();
auto *BinOp2 = dyn_cast<BinaryOperator>(Cast->getOperand(0));
if (!BinOp2 || !BinOp2->hasOneUse() || BinOp2->getOpcode() != AssocOpcode)
return false;
@@ -684,14 +683,14 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) {
if (SI0->getCondition() == SI1->getCondition()) {
Value *SI = nullptr;
if (Value *V = SimplifyBinOp(TopLevelOpcode, SI0->getFalseValue(),
- SI1->getFalseValue(), DL, TLI, DT, AC))
+ SI1->getFalseValue(), DL, &TLI, &DT, &AC))
SI = Builder->CreateSelect(SI0->getCondition(),
Builder->CreateBinOp(TopLevelOpcode,
SI0->getTrueValue(),
SI1->getTrueValue()),
V);
if (Value *V = SimplifyBinOp(TopLevelOpcode, SI0->getTrueValue(),
- SI1->getTrueValue(), DL, TLI, DT, AC))
+ SI1->getTrueValue(), DL, &TLI, &DT, &AC))
SI = Builder->CreateSelect(
SI0->getCondition(), V,
Builder->CreateBinOp(TopLevelOpcode, SI0->getFalseValue(),
@@ -741,17 +740,18 @@ Value *InstCombiner::dyn_castFNegVal(Value *V, bool IgnoreZeroSign) const {
return nullptr;
}
-static Value *FoldOperationIntoSelectOperand(Instruction &I, Value *SO,
+static Value *foldOperationIntoSelectOperand(Instruction &I, Value *SO,
InstCombiner *IC) {
- if (CastInst *CI = dyn_cast<CastInst>(&I)) {
- return IC->Builder->CreateCast(CI->getOpcode(), SO, I.getType());
- }
+ if (auto *Cast = dyn_cast<CastInst>(&I))
+ return IC->Builder->CreateCast(Cast->getOpcode(), SO, I.getType());
+
+ assert(I.isBinaryOp() && "Unexpected opcode for select folding");
// Figure out if the constant is the left or the right argument.
bool ConstIsRHS = isa<Constant>(I.getOperand(1));
Constant *ConstOperand = cast<Constant>(I.getOperand(ConstIsRHS));
- if (Constant *SOC = dyn_cast<Constant>(SO)) {
+ if (auto *SOC = dyn_cast<Constant>(SO)) {
if (ConstIsRHS)
return ConstantExpr::get(I.getOpcode(), SOC, ConstOperand);
return ConstantExpr::get(I.getOpcode(), ConstOperand, SOC);
@@ -761,78 +761,65 @@ static Value *FoldOperationIntoSelectOperand(Instruction &I, Value *SO,
if (!ConstIsRHS)
std::swap(Op0, Op1);
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&I)) {
- Value *RI = IC->Builder->CreateBinOp(BO->getOpcode(), Op0, Op1,
- SO->getName()+".op");
- Instruction *FPInst = dyn_cast<Instruction>(RI);
- if (FPInst && isa<FPMathOperator>(FPInst))
- FPInst->copyFastMathFlags(BO);
- return RI;
- }
- if (ICmpInst *CI = dyn_cast<ICmpInst>(&I))
- return IC->Builder->CreateICmp(CI->getPredicate(), Op0, Op1,
- SO->getName()+".cmp");
- if (FCmpInst *CI = dyn_cast<FCmpInst>(&I))
- return IC->Builder->CreateICmp(CI->getPredicate(), Op0, Op1,
- SO->getName()+".cmp");
- llvm_unreachable("Unknown binary instruction type!");
+ auto *BO = cast<BinaryOperator>(&I);
+ Value *RI = IC->Builder->CreateBinOp(BO->getOpcode(), Op0, Op1,
+ SO->getName() + ".op");
+ auto *FPInst = dyn_cast<Instruction>(RI);
+ if (FPInst && isa<FPMathOperator>(FPInst))
+ FPInst->copyFastMathFlags(BO);
+ return RI;
}
-/// Given an instruction with a select as one operand and a constant as the
-/// other operand, try to fold the binary operator into the select arguments.
-/// This also works for Cast instructions, which obviously do not have a second
-/// operand.
Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) {
- // Don't modify shared select instructions
- if (!SI->hasOneUse()) return nullptr;
- Value *TV = SI->getOperand(1);
- Value *FV = SI->getOperand(2);
-
- if (isa<Constant>(TV) || isa<Constant>(FV)) {
- // Bool selects with constant operands can be folded to logical ops.
- if (SI->getType()->isIntegerTy(1)) return nullptr;
-
- // If it's a bitcast involving vectors, make sure it has the same number of
- // elements on both sides.
- if (BitCastInst *BC = dyn_cast<BitCastInst>(&Op)) {
- VectorType *DestTy = dyn_cast<VectorType>(BC->getDestTy());
- VectorType *SrcTy = dyn_cast<VectorType>(BC->getSrcTy());
-
- // Verify that either both or neither are vectors.
- if ((SrcTy == nullptr) != (DestTy == nullptr)) return nullptr;
- // If vectors, verify that they have the same number of elements.
- if (SrcTy && SrcTy->getNumElements() != DestTy->getNumElements())
- return nullptr;
- }
+ // Don't modify shared select instructions.
+ if (!SI->hasOneUse())
+ return nullptr;
- // Test if a CmpInst instruction is used exclusively by a select as
- // part of a minimum or maximum operation. If so, refrain from doing
- // any other folding. This helps out other analyses which understand
- // non-obfuscated minimum and maximum idioms, such as ScalarEvolution
- // and CodeGen. And in this case, at least one of the comparison
- // operands has at least one user besides the compare (the select),
- // which would often largely negate the benefit of folding anyway.
- if (auto *CI = dyn_cast<CmpInst>(SI->getCondition())) {
- if (CI->hasOneUse()) {
- Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1);
- if ((SI->getOperand(1) == Op0 && SI->getOperand(2) == Op1) ||
- (SI->getOperand(2) == Op0 && SI->getOperand(1) == Op1))
- return nullptr;
- }
- }
+ Value *TV = SI->getTrueValue();
+ Value *FV = SI->getFalseValue();
+ if (!(isa<Constant>(TV) || isa<Constant>(FV)))
+ return nullptr;
- Value *SelectTrueVal = FoldOperationIntoSelectOperand(Op, TV, this);
- Value *SelectFalseVal = FoldOperationIntoSelectOperand(Op, FV, this);
+ // Bool selects with constant operands can be folded to logical ops.
+ if (SI->getType()->getScalarType()->isIntegerTy(1))
+ return nullptr;
- return SelectInst::Create(SI->getCondition(),
- SelectTrueVal, SelectFalseVal);
+ // If it's a bitcast involving vectors, make sure it has the same number of
+ // elements on both sides.
+ if (auto *BC = dyn_cast<BitCastInst>(&Op)) {
+ VectorType *DestTy = dyn_cast<VectorType>(BC->getDestTy());
+ VectorType *SrcTy = dyn_cast<VectorType>(BC->getSrcTy());
+
+ // Verify that either both or neither are vectors.
+ if ((SrcTy == nullptr) != (DestTy == nullptr))
+ return nullptr;
+
+ // If vectors, verify that they have the same number of elements.
+ if (SrcTy && SrcTy->getNumElements() != DestTy->getNumElements())
+ return nullptr;
}
- return nullptr;
+
+ // Test if a CmpInst instruction is used exclusively by a select as
+ // part of a minimum or maximum operation. If so, refrain from doing
+ // any other folding. This helps out other analyses which understand
+ // non-obfuscated minimum and maximum idioms, such as ScalarEvolution
+ // and CodeGen. And in this case, at least one of the comparison
+ // operands has at least one user besides the compare (the select),
+ // which would often largely negate the benefit of folding anyway.
+ if (auto *CI = dyn_cast<CmpInst>(SI->getCondition())) {
+ if (CI->hasOneUse()) {
+ Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1);
+ if ((SI->getOperand(1) == Op0 && SI->getOperand(2) == Op1) ||
+ (SI->getOperand(2) == Op0 && SI->getOperand(1) == Op1))
+ return nullptr;
+ }
+ }
+
+ Value *NewTV = foldOperationIntoSelectOperand(Op, TV, this);
+ Value *NewFV = foldOperationIntoSelectOperand(Op, FV, this);
+ return SelectInst::Create(SI->getCondition(), NewTV, NewFV, "", nullptr, SI);
}
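The effect of FoldOpIntoSelect is easiest to see at the source level. A minimal sketch, assuming an integer add and made-up constants:

    // Folding a binop with a constant operand into both arms of a select:
    // (c ? 3 : 5) + 2 becomes c ? 5 : 7, and the add disappears.
    #include <cassert>

    static int before(bool c) { return (c ? 3 : 5) + 2; }
    static int after(bool c) { return c ? 5 : 7; }

    int main() {
      for (bool c : {false, true})
        assert(before(c) == after(c));
    }

Because at least one select arm must already be a constant, foldOperationIntoSelectOperand can constant-fold that arm outright and only needs to materialize a new instruction for the other arm.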
-/// Given a binary operator, cast instruction, or select which has a PHI node as
-/// operand #0, see if we can fold the instruction into the PHI (which is only
-/// possible if all operands to the PHI are constants).
Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
PHINode *PN = cast<PHINode>(I.getOperand(0));
unsigned NumPHIValues = PN->getNumIncomingValues();
@@ -877,7 +864,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
// If the incoming non-constant value is in I's block, we will remove one
// instruction, but insert another equivalent one, leading to infinite
// instcombine.
- if (isPotentiallyReachable(I.getParent(), NonConstBB, DT, LI))
+ if (isPotentiallyReachable(I.getParent(), NonConstBB, &DT, LI))
return nullptr;
}
@@ -970,6 +957,19 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
return replaceInstUsesWith(I, NewPN);
}
+Instruction *InstCombiner::foldOpWithConstantIntoOperand(Instruction &I) {
+ assert(isa<Constant>(I.getOperand(1)) && "Unexpected operand type");
+
+ if (auto *Sel = dyn_cast<SelectInst>(I.getOperand(0))) {
+ if (Instruction *NewSel = FoldOpIntoSelect(I, Sel))
+ return NewSel;
+ } else if (isa<PHINode>(I.getOperand(0))) {
+ if (Instruction *NewPhi = FoldOpIntoPhi(I))
+ return NewPhi;
+ }
+ return nullptr;
+}
+
/// Given a pointer type and a constant offset, determine whether or not there
/// is a sequence of GEP indices into the pointed type that will land us at the
/// specified offset. If so, fill them into NewIndices and return the resultant
@@ -1379,7 +1379,8 @@ Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) {
Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
SmallVector<Value*, 8> Ops(GEP.op_begin(), GEP.op_end());
- if (Value *V = SimplifyGEPInst(GEP.getSourceElementType(), Ops, DL, TLI, DT, AC))
+ if (Value *V =
+ SimplifyGEPInst(GEP.getSourceElementType(), Ops, DL, &TLI, &DT, &AC))
return replaceInstUsesWith(GEP, V);
Value *PtrOp = GEP.getOperand(0);
@@ -1394,7 +1395,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
for (User::op_iterator I = GEP.op_begin() + 1, E = GEP.op_end(); I != E;
++I, ++GTI) {
// Skip indices into struct types.
- if (isa<StructType>(*GTI))
+ if (GTI.isStruct())
continue;
// Index type should have the same width as IntPtr
@@ -1551,7 +1552,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
bool EndsWithSequential = false;
for (gep_type_iterator I = gep_type_begin(*Src), E = gep_type_end(*Src);
I != E; ++I)
- EndsWithSequential = !(*I)->isStructTy();
+ EndsWithSequential = I.isSequential();
// Can we combine the two pointer arithmetics offsets?
if (EndsWithSequential) {
@@ -1860,7 +1861,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
if (!Offset) {
// If the bitcast is of an allocation, and the allocation will be
// converted to match the type of the cast, don't touch this.
- if (isa<AllocaInst>(Operand) || isAllocationFn(Operand, TLI)) {
+ if (isa<AllocaInst>(Operand) || isAllocationFn(Operand, &TLI)) {
// See if the bitcast simplifies, if so, don't nuke this GEP yet.
if (Instruction *I = visitBitCast(*BCI)) {
if (I != BCI) {
@@ -1898,6 +1899,25 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
}
}
+ if (!GEP.isInBounds()) {
+ unsigned PtrWidth =
+ DL.getPointerSizeInBits(PtrOp->getType()->getPointerAddressSpace());
+ APInt BasePtrOffset(PtrWidth, 0);
+ Value *UnderlyingPtrOp =
+ PtrOp->stripAndAccumulateInBoundsConstantOffsets(DL,
+ BasePtrOffset);
+ if (auto *AI = dyn_cast<AllocaInst>(UnderlyingPtrOp)) {
+ if (GEP.accumulateConstantOffset(DL, BasePtrOffset) &&
+ BasePtrOffset.isNonNegative()) {
+ APInt AllocSize(PtrWidth, DL.getTypeAllocSize(AI->getAllocatedType()));
+ if (BasePtrOffset.ule(AllocSize)) {
+ return GetElementPtrInst::CreateInBounds(
+ PtrOp, makeArrayRef(Ops).slice(1), GEP.getName());
+ }
+ }
+ }
+ }
+
return nullptr;
}
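A rough model of the new inbounds promotion above, treating the offset and allocation size as plain byte counts (the helper name is hypothetical):

    #include <cstdint>

    // Sketch: a GEP based on an alloca may be rewritten as inbounds when the
    // accumulated constant byte offset is non-negative and does not step past
    // the allocation (one-past-the-end is still in bounds).
    static bool canMarkInBounds(int64_t OffsetFromAllocaStart,
                                uint64_t AllocSizeInBytes) {
      return OffsetFromAllocaStart >= 0 &&
             static_cast<uint64_t>(OffsetFromAllocaStart) <= AllocSizeInBytes;
    }

    int main() {
      // e.g. a 16-byte alloca: offset 16 (one past the end) is fine, -4 is not.
      return canMarkInBounds(16, 16) && !canMarkInBounds(-4, 16) ? 0 : 1;
    }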
@@ -1963,8 +1983,8 @@ isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakVH> &Users,
MemIntrinsic *MI = cast<MemIntrinsic>(II);
if (MI->isVolatile() || MI->getRawDest() != PI)
return false;
+ LLVM_FALLTHROUGH;
}
- // fall through
case Intrinsic::dbg_declare:
case Intrinsic::dbg_value:
case Intrinsic::invariant_start:
@@ -2002,7 +2022,7 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) {
// to null and free calls, delete the calls and replace the comparisons with
// true or false as appropriate.
SmallVector<WeakVH, 64> Users;
- if (isAllocSiteRemovable(&MI, Users, TLI)) {
+ if (isAllocSiteRemovable(&MI, Users, &TLI)) {
for (unsigned i = 0, e = Users.size(); i != e; ++i) {
// Lowering all @llvm.objectsize calls first because they may
// use a bitcast/GEP of the alloca we are removing.
@@ -2013,12 +2033,9 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) {
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
if (II->getIntrinsicID() == Intrinsic::objectsize) {
- uint64_t Size;
- if (!getObjectSize(II->getArgOperand(0), Size, DL, TLI)) {
- ConstantInt *CI = cast<ConstantInt>(II->getArgOperand(1));
- Size = CI->isZero() ? -1ULL : 0;
- }
- replaceInstUsesWith(*I, ConstantInt::get(I->getType(), Size));
+ ConstantInt *Result = lowerObjectSizeCall(II, DL, &TLI,
+ /*MustSucceed=*/true);
+ replaceInstUsesWith(*I, Result);
eraseInstFromFunction(*I);
Users[i] = nullptr; // Skip examining in the next loop.
}
@@ -2218,6 +2235,20 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
Value *Cond = SI.getCondition();
+ Value *Op0;
+ ConstantInt *AddRHS;
+ if (match(Cond, m_Add(m_Value(Op0), m_ConstantInt(AddRHS)))) {
+ // Change 'switch (X+4) case 1:' into 'switch (X) case -3'.
+ for (SwitchInst::CaseIt CaseIter : SI.cases()) {
+ Constant *NewCase = ConstantExpr::getSub(CaseIter.getCaseValue(), AddRHS);
+ assert(isa<ConstantInt>(NewCase) &&
+ "Result of expression should be constant");
+ CaseIter.setValue(cast<ConstantInt>(NewCase));
+ }
+ SI.setCondition(Op0);
+ return &SI;
+ }
+
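A source-level sketch of this case rewrite, assuming a made-up addend of 4 and two arbitrary case values:

    // 'switch (x + 4) case 1' dispatches exactly like 'switch (x) case -3':
    // the add is deleted and every case value is shifted by the same amount.
    #include <cassert>

    static int before(int x) {
      switch (x + 4) { case 1: return 10; case 6: return 20; default: return 0; }
    }
    static int after(int x) {
      switch (x) { case -3: return 10; case 2: return 20; default: return 0; }
    }

    int main() {
      for (int x = -8; x <= 8; ++x)
        assert(before(x) == after(x));
    }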
unsigned BitWidth = cast<IntegerType>(Cond->getType())->getBitWidth();
APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
computeKnownBits(Cond, KnownZero, KnownOne, 0, &SI);
@@ -2238,43 +2269,20 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
// Shrink the condition operand if the new type is smaller than the old type.
// This may produce a non-standard type for the switch, but that's ok because
// the backend should extend back to a legal type for the target.
- bool TruncCond = false;
if (NewWidth > 0 && NewWidth < BitWidth) {
- TruncCond = true;
IntegerType *Ty = IntegerType::get(SI.getContext(), NewWidth);
Builder->SetInsertPoint(&SI);
Value *NewCond = Builder->CreateTrunc(Cond, Ty, "trunc");
SI.setCondition(NewCond);
- for (auto &C : SI.cases())
- static_cast<SwitchInst::CaseIt *>(&C)->setValue(ConstantInt::get(
- SI.getContext(), C.getCaseValue()->getValue().trunc(NewWidth)));
- }
-
- ConstantInt *AddRHS = nullptr;
- if (match(Cond, m_Add(m_Value(), m_ConstantInt(AddRHS)))) {
- Instruction *I = cast<Instruction>(Cond);
- // Change 'switch (X+4) case 1:' into 'switch (X) case -3'.
- for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end(); i != e;
- ++i) {
- ConstantInt *CaseVal = i.getCaseValue();
- Constant *LHS = CaseVal;
- if (TruncCond) {
- LHS = LeadingKnownZeros
- ? ConstantExpr::getZExt(CaseVal, Cond->getType())
- : ConstantExpr::getSExt(CaseVal, Cond->getType());
- }
- Constant *NewCaseVal = ConstantExpr::getSub(LHS, AddRHS);
- assert(isa<ConstantInt>(NewCaseVal) &&
- "Result of expression should be constant");
- i.setValue(cast<ConstantInt>(NewCaseVal));
+ for (SwitchInst::CaseIt CaseIter : SI.cases()) {
+ APInt TruncatedCase = CaseIter.getCaseValue()->getValue().trunc(NewWidth);
+ CaseIter.setValue(ConstantInt::get(SI.getContext(), TruncatedCase));
}
- SI.setCondition(I->getOperand(0));
- Worklist.Add(I);
return &SI;
}
- return TruncCond ? &SI : nullptr;
+ return nullptr;
}
Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
@@ -2284,7 +2292,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
return replaceInstUsesWith(EV, Agg);
if (Value *V =
- SimplifyExtractValueInst(Agg, EV.getIndices(), DL, TLI, DT, AC))
+ SimplifyExtractValueInst(Agg, EV.getIndices(), DL, &TLI, &DT, &AC))
return replaceInstUsesWith(EV, V);
if (InsertValueInst *IV = dyn_cast<InsertValueInst>(Agg)) {
@@ -2560,7 +2568,7 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) {
// remove it from the filter. An unexpected type handler may be
// set up for a call site which throws an exception of the same
// type caught. In order for the exception thrown by the unexpected
- // handler to propogate correctly, the filter must be correctly
+ // handler to propagate correctly, the filter must be correctly
// described for the call site.
//
// Example:
@@ -2813,7 +2821,7 @@ bool InstCombiner::run() {
if (I == nullptr) continue; // skip null values.
// Check to see if we can DCE the instruction.
- if (isInstructionTriviallyDead(I, TLI)) {
+ if (isInstructionTriviallyDead(I, &TLI)) {
DEBUG(dbgs() << "IC: DCE: " << *I << '\n');
eraseInstFromFunction(*I);
++NumDeadInst;
@@ -2824,13 +2832,13 @@ bool InstCombiner::run() {
// Instruction isn't dead, see if we can constant propagate it.
if (!I->use_empty() &&
(I->getNumOperands() == 0 || isa<Constant>(I->getOperand(0)))) {
- if (Constant *C = ConstantFoldInstruction(I, DL, TLI)) {
+ if (Constant *C = ConstantFoldInstruction(I, DL, &TLI)) {
DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *I << '\n');
// Add operands to the worklist.
replaceInstUsesWith(*I, C);
++NumConstProp;
- if (isInstructionTriviallyDead(I, TLI))
+ if (isInstructionTriviallyDead(I, &TLI))
eraseInstFromFunction(*I);
MadeIRChange = true;
continue;
@@ -2839,20 +2847,21 @@ bool InstCombiner::run() {
// In general, it is possible for computeKnownBits to determine all bits in
// a value even when the operands are not all constants.
- if (ExpensiveCombines && !I->use_empty() && I->getType()->isIntegerTy()) {
- unsigned BitWidth = I->getType()->getScalarSizeInBits();
+ Type *Ty = I->getType();
+ if (ExpensiveCombines && !I->use_empty() && Ty->isIntOrIntVectorTy()) {
+ unsigned BitWidth = Ty->getScalarSizeInBits();
APInt KnownZero(BitWidth, 0);
APInt KnownOne(BitWidth, 0);
computeKnownBits(I, KnownZero, KnownOne, /*Depth*/0, I);
if ((KnownZero | KnownOne).isAllOnesValue()) {
- Constant *C = ConstantInt::get(I->getContext(), KnownOne);
+ Constant *C = ConstantInt::get(Ty, KnownOne);
DEBUG(dbgs() << "IC: ConstFold (all bits known) to: " << *C <<
" from: " << *I << '\n');
// Add operands to the worklist.
replaceInstUsesWith(*I, C);
++NumConstProp;
- if (isInstructionTriviallyDead(I, TLI))
+ if (isInstructionTriviallyDead(I, &TLI))
eraseInstFromFunction(*I);
MadeIRChange = true;
continue;
@@ -2883,7 +2892,7 @@ bool InstCombiner::run() {
// If the user is one of our immediate successors, and if that successor
// only has us as a predecessors (we'd have to split the critical edge
// otherwise), we can keep going.
- if (UserIsSuccessor && UserParent->getSinglePredecessor()) {
+ if (UserIsSuccessor && UserParent->getUniquePredecessor()) {
// Okay, the CFG is simple enough, try to sink this instruction.
if (TryToSinkInstruction(I, UserParent)) {
DEBUG(dbgs() << "IC: Sink: " << *I << '\n');
@@ -2941,14 +2950,12 @@ bool InstCombiner::run() {
eraseInstFromFunction(*I);
} else {
-#ifndef NDEBUG
DEBUG(dbgs() << "IC: Mod = " << OrigI << '\n'
<< " New = " << *I << '\n');
-#endif
// If the instruction was modified, it's possible that it is now dead.
// if so, remove it.
- if (isInstructionTriviallyDead(I, TLI)) {
+ if (isInstructionTriviallyDead(I, &TLI)) {
eraseInstFromFunction(*I);
} else {
Worklist.Add(I);
@@ -2981,7 +2988,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
Worklist.push_back(BB);
SmallVector<Instruction*, 128> InstrsForInstCombineWorklist;
- DenseMap<ConstantExpr*, Constant*> FoldedConstants;
+ DenseMap<Constant *, Constant *> FoldedConstants;
do {
BB = Worklist.pop_back_val();
@@ -3017,17 +3024,17 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
// See if we can constant fold its operands.
for (User::op_iterator i = Inst->op_begin(), e = Inst->op_end(); i != e;
++i) {
- ConstantExpr *CE = dyn_cast<ConstantExpr>(i);
- if (CE == nullptr)
+ if (!isa<ConstantVector>(i) && !isa<ConstantExpr>(i))
continue;
- Constant *&FoldRes = FoldedConstants[CE];
+ auto *C = cast<Constant>(i);
+ Constant *&FoldRes = FoldedConstants[C];
if (!FoldRes)
- FoldRes = ConstantFoldConstantExpression(CE, DL, TLI);
+ FoldRes = ConstantFoldConstant(C, DL, TLI);
if (!FoldRes)
- FoldRes = CE;
+ FoldRes = C;
- if (FoldRes != CE) {
+ if (FoldRes != C) {
*i = FoldRes;
MadeIRChange = true;
}
@@ -3120,8 +3127,15 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,
/// Builder - This is an IRBuilder that automatically inserts new
/// instructions into the worklist when they are created.
- IRBuilder<TargetFolder, InstCombineIRInserter> Builder(
- F.getContext(), TargetFolder(DL), InstCombineIRInserter(Worklist, &AC));
+ IRBuilder<TargetFolder, IRBuilderCallbackInserter> Builder(
+ F.getContext(), TargetFolder(DL),
+ IRBuilderCallbackInserter([&Worklist, &AC](Instruction *I) {
+ Worklist.Add(I);
+
+ using namespace llvm::PatternMatch;
+ if (match(I, m_Intrinsic<Intrinsic::assume>()))
+ AC.registerAssumption(cast<CallInst>(I));
+ }));
// Lower dbg.declare intrinsics otherwise their value may be clobbered
// by instcombiner.
@@ -3137,7 +3151,7 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,
bool Changed = prepareICWorklistFromFunction(F, DL, &TLI, Worklist);
InstCombiner IC(Worklist, &Builder, F.optForMinSize(), ExpensiveCombines,
- AA, &AC, &TLI, &DT, DL, LI);
+ AA, AC, TLI, DT, DL, LI);
Changed |= IC.run();
if (!Changed)
@@ -3148,7 +3162,7 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,
}
PreservedAnalyses InstCombinePass::run(Function &F,
- AnalysisManager<Function> &AM) {
+ FunctionAnalysisManager &AM) {
auto &AC = AM.getResult<AssumptionAnalysis>(F);
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 43d1b37..f5e9e7d 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -54,6 +54,9 @@
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include <algorithm>
+#include <iomanip>
+#include <limits>
+#include <sstream>
#include <string>
#include <system_error>
@@ -64,8 +67,8 @@ using namespace llvm;
static const uint64_t kDefaultShadowScale = 3;
static const uint64_t kDefaultShadowOffset32 = 1ULL << 29;
static const uint64_t kDefaultShadowOffset64 = 1ULL << 44;
+static const uint64_t kDynamicShadowSentinel = ~(uint64_t)0;
static const uint64_t kIOSShadowOffset32 = 1ULL << 30;
-static const uint64_t kIOSShadowOffset64 = 0x120200000;
static const uint64_t kIOSSimShadowOffset32 = 1ULL << 30;
static const uint64_t kIOSSimShadowOffset64 = kDefaultShadowOffset64;
static const uint64_t kSmallX86_64ShadowOffset = 0x7FFF8000; // < 2G.
@@ -78,8 +81,8 @@ static const uint64_t kAArch64_ShadowOffset64 = 1ULL << 36;
static const uint64_t kFreeBSD_ShadowOffset32 = 1ULL << 30;
static const uint64_t kFreeBSD_ShadowOffset64 = 1ULL << 46;
static const uint64_t kWindowsShadowOffset32 = 3ULL << 28;
-// TODO(wwchrome): Experimental for asan Win64, may change.
-static const uint64_t kWindowsShadowOffset64 = 0x1ULL << 45; // 32TB.
+// The shadow memory space is dynamically allocated.
+static const uint64_t kWindowsShadowOffset64 = kDynamicShadowSentinel;
static const size_t kMinStackMallocSize = 1 << 6; // 64B
static const size_t kMaxStackMallocSize = 1 << 16; // 64K
@@ -111,6 +114,7 @@ static const char *const kAsanStackFreeNameTemplate = "__asan_stack_free_";
static const char *const kAsanGenPrefix = "__asan_gen_";
static const char *const kODRGenPrefix = "__odr_asan_gen_";
static const char *const kSanCovGenPrefix = "__sancov_gen_";
+static const char *const kAsanSetShadowPrefix = "__asan_set_shadow_";
static const char *const kAsanPoisonStackMemoryName =
"__asan_poison_stack_memory";
static const char *const kAsanUnpoisonStackMemoryName =
@@ -121,6 +125,9 @@ static const char *const kAsanGlobalsRegisteredFlagName =
static const char *const kAsanOptionDetectUseAfterReturn =
"__asan_option_detect_stack_use_after_return";
+static const char *const kAsanShadowMemoryDynamicAddress =
+ "__asan_shadow_memory_dynamic_address";
+
static const char *const kAsanAllocaPoison = "__asan_alloca_poison";
static const char *const kAsanAllocasUnpoison = "__asan_allocas_unpoison";
@@ -153,6 +160,11 @@ static cl::opt<bool> ClAlwaysSlowPath(
"asan-always-slow-path",
cl::desc("use instrumentation with slow path for all accesses"), cl::Hidden,
cl::init(false));
+static cl::opt<bool> ClForceDynamicShadow(
+ "asan-force-dynamic-shadow",
+ cl::desc("Load shadow address into a local variable for each function"),
+ cl::Hidden, cl::init(false));
+
// This flag limits the number of instructions to be instrumented
// in any given BB. Normally, this should be set to unlimited (INT_MAX),
// but due to http://llvm.org/bugs/show_bug.cgi?id=12652 we temporary
@@ -164,6 +176,11 @@ static cl::opt<int> ClMaxInsnsToInstrumentPerBB(
// This flag may need to be replaced with -f[no]asan-stack.
static cl::opt<bool> ClStack("asan-stack", cl::desc("Handle stack memory"),
cl::Hidden, cl::init(true));
+static cl::opt<uint32_t> ClMaxInlinePoisoningSize(
+ "asan-max-inline-poisoning-size",
+ cl::desc(
+ "Inline shadow poisoning for blocks up to the given size in bytes."),
+ cl::Hidden, cl::init(64));
static cl::opt<bool> ClUseAfterReturn("asan-use-after-return",
cl::desc("Check stack-use-after-return"),
cl::Hidden, cl::init(true));
@@ -196,9 +213,10 @@ static cl::opt<std::string> ClMemoryAccessCallbackPrefix(
"asan-memory-access-callback-prefix",
cl::desc("Prefix for memory access callbacks"), cl::Hidden,
cl::init("__asan_"));
-static cl::opt<bool> ClInstrumentAllocas("asan-instrument-allocas",
- cl::desc("instrument dynamic allocas"),
- cl::Hidden, cl::init(true));
+static cl::opt<bool>
+ ClInstrumentDynamicAllocas("asan-instrument-dynamic-allocas",
+ cl::desc("instrument dynamic allocas"),
+ cl::Hidden, cl::init(true));
static cl::opt<bool> ClSkipPromotableAllocas(
"asan-skip-promotable-allocas",
cl::desc("Do not instrument promotable allocas"), cl::Hidden,
@@ -250,7 +268,7 @@ static cl::opt<bool>
cl::desc("Use linker features to support dead "
"code stripping of globals "
"(Mach-O only)"),
- cl::Hidden, cl::init(false));
+ cl::Hidden, cl::init(true));
// Debug flags.
static cl::opt<int> ClDebug("asan-debug", cl::desc("debug"), cl::Hidden,
@@ -261,7 +279,7 @@ static cl::opt<std::string> ClDebugFunc("asan-debug-func", cl::Hidden,
cl::desc("Debug func"));
static cl::opt<int> ClDebugMin("asan-debug-min", cl::desc("Debug min inst"),
cl::Hidden, cl::init(-1));
-static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug man inst"),
+static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug max inst"),
cl::Hidden, cl::init(-1));
STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
@@ -411,13 +429,19 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
Mapping.Offset = kMIPS64_ShadowOffset64;
else if (IsIOS)
// If we're targeting iOS and x86, the binary is built for iOS simulator.
- Mapping.Offset = IsX86_64 ? kIOSSimShadowOffset64 : kIOSShadowOffset64;
+ // We are using dynamic shadow offset on the 64-bit devices.
+ Mapping.Offset =
+ IsX86_64 ? kIOSSimShadowOffset64 : kDynamicShadowSentinel;
else if (IsAArch64)
Mapping.Offset = kAArch64_ShadowOffset64;
else
Mapping.Offset = kDefaultShadowOffset64;
}
+ if (ClForceDynamicShadow) {
+ Mapping.Offset = kDynamicShadowSentinel;
+ }
+
Mapping.Scale = kDefaultShadowScale;
if (ClMappingScale.getNumOccurrences() > 0) {
Mapping.Scale = ClMappingScale;
@@ -433,7 +457,8 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
// we could OR the constant in a single instruction, but it's more
// efficient to load it once and use indexed addressing.
Mapping.OrShadowOffset = !IsAArch64 && !IsPPC64 && !IsSystemZ
- && !(Mapping.Offset & (Mapping.Offset - 1));
+ && !(Mapping.Offset & (Mapping.Offset - 1))
+ && Mapping.Offset != kDynamicShadowSentinel;
return Mapping;
}
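A sketch of the address computation that the Mapping built above drives, with the scale and offset passed in explicitly (the helper name is hypothetical):

    #include <cstdint>

    // Shadow address = (Addr >> Scale) + Offset. The pass only considers the
    // cheaper OR form when the offset is a statically known power of two,
    // never for the dynamic-shadow sentinel resolved at function entry.
    static uint64_t memToShadowSketch(uint64_t Addr, uint64_t Offset,
                                      unsigned Scale, bool OrShadowOffset) {
      uint64_t Shadow = Addr >> Scale;
      return OrShadowOffset ? (Shadow | Offset) : (Shadow + Offset);
    }

    int main() {
      // Default 64-bit mapping: scale 3, offset 1ULL << 44.
      uint64_t S = memToShadowSketch(0x1000, 1ULL << 44, 3, false);
      return S == (0x1000u >> 3) + (1ULL << 44) ? 0 : 1;
    }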
@@ -450,42 +475,47 @@ struct AddressSanitizer : public FunctionPass {
bool UseAfterScope = false)
: FunctionPass(ID), CompileKernel(CompileKernel || ClEnableKasan),
Recover(Recover || ClRecover),
- UseAfterScope(UseAfterScope || ClUseAfterScope) {
+ UseAfterScope(UseAfterScope || ClUseAfterScope),
+ LocalDynamicShadow(nullptr) {
initializeAddressSanitizerPass(*PassRegistry::getPassRegistry());
}
- const char *getPassName() const override {
+ StringRef getPassName() const override {
return "AddressSanitizerFunctionPass";
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
}
- uint64_t getAllocaSizeInBytes(AllocaInst *AI) const {
+ uint64_t getAllocaSizeInBytes(const AllocaInst &AI) const {
uint64_t ArraySize = 1;
- if (AI->isArrayAllocation()) {
- ConstantInt *CI = dyn_cast<ConstantInt>(AI->getArraySize());
+ if (AI.isArrayAllocation()) {
+ const ConstantInt *CI = dyn_cast<ConstantInt>(AI.getArraySize());
assert(CI && "non-constant array size");
ArraySize = CI->getZExtValue();
}
- Type *Ty = AI->getAllocatedType();
+ Type *Ty = AI.getAllocatedType();
uint64_t SizeInBytes =
- AI->getModule()->getDataLayout().getTypeAllocSize(Ty);
+ AI.getModule()->getDataLayout().getTypeAllocSize(Ty);
return SizeInBytes * ArraySize;
}
/// Check if we want (and can) handle this alloca.
- bool isInterestingAlloca(AllocaInst &AI);
+ bool isInterestingAlloca(const AllocaInst &AI);
/// If it is an interesting memory access, return the PointerOperand
/// and set IsWrite/Alignment. Otherwise return nullptr.
+ /// MaybeMask is an output parameter for the mask Value, if we're looking at a
+ /// masked load/store.
Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite,
- uint64_t *TypeSize, unsigned *Alignment);
+ uint64_t *TypeSize, unsigned *Alignment,
+ Value **MaybeMask = nullptr);
void instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis, Instruction *I,
bool UseCalls, const DataLayout &DL);
void instrumentPointerComparisonOrSubtraction(Instruction *I);
void instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore,
Value *Addr, uint32_t TypeSize, bool IsWrite,
Value *SizeArgument, bool UseCalls, uint32_t Exp);
- void instrumentUnusualSizeOrAlignment(Instruction *I, Value *Addr,
+ void instrumentUnusualSizeOrAlignment(Instruction *I,
+ Instruction *InsertBefore, Value *Addr,
uint32_t TypeSize, bool IsWrite,
Value *SizeArgument, bool UseCalls,
uint32_t Exp);
@@ -498,6 +528,7 @@ struct AddressSanitizer : public FunctionPass {
Value *memToShadow(Value *Shadow, IRBuilder<> &IRB);
bool runOnFunction(Function &F) override;
bool maybeInsertAsanInitAtFunctionEntry(Function &F);
+ void maybeInsertDynamicShadowAtFunctionEntry(Function &F);
void markEscapedLocalAllocas(Function &F);
bool doInitialization(Module &M) override;
bool doFinalization(Module &M) override;
@@ -519,8 +550,12 @@ struct AddressSanitizer : public FunctionPass {
FunctionStateRAII(AddressSanitizer *Pass) : Pass(Pass) {
assert(Pass->ProcessedAllocas.empty() &&
"last pass forgot to clear cache");
+ assert(!Pass->LocalDynamicShadow);
+ }
+ ~FunctionStateRAII() {
+ Pass->LocalDynamicShadow = nullptr;
+ Pass->ProcessedAllocas.clear();
}
- ~FunctionStateRAII() { Pass->ProcessedAllocas.clear(); }
};
LLVMContext *C;
@@ -544,8 +579,9 @@ struct AddressSanitizer : public FunctionPass {
Function *AsanMemoryAccessCallbackSized[2][2];
Function *AsanMemmove, *AsanMemcpy, *AsanMemset;
InlineAsm *EmptyAsm;
+ Value *LocalDynamicShadow;
GlobalsMetadata GlobalsMD;
- DenseMap<AllocaInst *, bool> ProcessedAllocas;
+ DenseMap<const AllocaInst *, bool> ProcessedAllocas;
friend struct FunctionStackPoisoner;
};
@@ -558,14 +594,31 @@ class AddressSanitizerModule : public ModulePass {
Recover(Recover || ClRecover) {}
bool runOnModule(Module &M) override;
static char ID; // Pass identification, replacement for typeid
- const char *getPassName() const override { return "AddressSanitizerModule"; }
+ StringRef getPassName() const override { return "AddressSanitizerModule"; }
- private:
+private:
void initializeCallbacks(Module &M);
bool InstrumentGlobals(IRBuilder<> &IRB, Module &M);
+ void InstrumentGlobalsCOFF(IRBuilder<> &IRB, Module &M,
+ ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers);
+ void InstrumentGlobalsMachO(IRBuilder<> &IRB, Module &M,
+ ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers);
+ void
+ InstrumentGlobalsWithMetadataArray(IRBuilder<> &IRB, Module &M,
+ ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers);
+
+ GlobalVariable *CreateMetadataGlobal(Module &M, Constant *Initializer,
+ StringRef OriginalName);
+ void SetComdatForGlobalMetadata(GlobalVariable *G, GlobalVariable *Metadata);
+ IRBuilder<> CreateAsanModuleDtor(Module &M);
+
bool ShouldInstrumentGlobal(GlobalVariable *G);
bool ShouldUseMachOGlobalsSection() const;
+ StringRef getGlobalMetadataSection() const;
void poisonOneInitializer(Function &GlobalInit, GlobalValue *ModuleName);
void createInitializerPoisonCalls(Module &M, GlobalValue *ModuleName);
size_t MinRedzoneSizeForGlobal() const {
@@ -606,12 +659,13 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
ShadowMapping Mapping;
SmallVector<AllocaInst *, 16> AllocaVec;
- SmallSetVector<AllocaInst *, 16> NonInstrumentedStaticAllocaVec;
+ SmallVector<AllocaInst *, 16> StaticAllocasToMoveUp;
SmallVector<Instruction *, 8> RetVec;
unsigned StackAlignment;
Function *AsanStackMallocFunc[kMaxAsanStackMallocSizeClass + 1],
*AsanStackFreeFunc[kMaxAsanStackMallocSizeClass + 1];
+ Function *AsanSetShadowFunc[0x100] = {};
Function *AsanPoisonStackMemoryFunc, *AsanUnpoisonStackMemoryFunc;
Function *AsanAllocaPoisonFunc, *AsanAllocasUnpoisonFunc;
@@ -622,7 +676,8 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
uint64_t Size;
bool DoPoison;
};
- SmallVector<AllocaPoisonCall, 8> AllocaPoisonCallVec;
+ SmallVector<AllocaPoisonCall, 8> DynamicAllocaPoisonCallVec;
+ SmallVector<AllocaPoisonCall, 8> StaticAllocaPoisonCallVec;
SmallVector<AllocaInst *, 1> DynamicAllocaVec;
SmallVector<IntrinsicInst *, 1> StackRestoreVec;
@@ -657,7 +712,8 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
initializeCallbacks(*F.getParent());
- poisonStack();
+ processDynamicAllocas();
+ processStaticAllocas();
if (ClDebugStack) {
DEBUG(dbgs() << F);
@@ -668,7 +724,8 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
// Finds all Alloca instructions and puts
// poisoned red zones around all of them.
// Then unpoison everything back before the function returns.
- void poisonStack();
+ void processStaticAllocas();
+ void processDynamicAllocas();
void createDynamicAllocasInitStorage();
@@ -676,6 +733,12 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
/// \brief Collect all Ret instructions.
void visitReturnInst(ReturnInst &RI) { RetVec.push_back(&RI); }
+ /// \brief Collect all Resume instructions.
+ void visitResumeInst(ResumeInst &RI) { RetVec.push_back(&RI); }
+
+  /// \brief Collect all CleanupReturnInst instructions.
+ void visitCleanupReturnInst(CleanupReturnInst &CRI) { RetVec.push_back(&CRI); }
+
void unpoisonDynamicAllocasBeforeInst(Instruction *InstBefore,
Value *SavedStack) {
IRBuilder<> IRB(InstBefore);
@@ -724,7 +787,14 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
/// \brief Collect Alloca instructions we want (and can) handle.
void visitAllocaInst(AllocaInst &AI) {
if (!ASan.isInterestingAlloca(AI)) {
- if (AI.isStaticAlloca()) NonInstrumentedStaticAllocaVec.insert(&AI);
+ if (AI.isStaticAlloca()) {
+ // Skip over allocas that are present *before* the first instrumented
+ // alloca, we don't want to move those around.
+ if (AllocaVec.empty())
+ return;
+
+ StaticAllocasToMoveUp.push_back(&AI);
+ }
return;
}
@@ -761,7 +831,10 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
return;
bool DoPoison = (ID == Intrinsic::lifetime_end);
AllocaPoisonCall APC = {&II, AI, SizeValue, DoPoison};
- AllocaPoisonCallVec.push_back(APC);
+ if (AI->isStaticAlloca())
+ StaticAllocaPoisonCallVec.push_back(APC);
+ else if (ClInstrumentDynamicAllocas)
+ DynamicAllocaPoisonCallVec.push_back(APC);
}
void visitCallSite(CallSite CS) {
@@ -785,12 +858,21 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
/// Finds alloca where the value comes from.
AllocaInst *findAllocaForValue(Value *V);
- void poisonRedZones(ArrayRef<uint8_t> ShadowBytes, IRBuilder<> &IRB,
- Value *ShadowBase, bool DoPoison);
+
+ // Copies bytes from ShadowBytes into shadow memory for indexes where
+ // ShadowMask is not zero. If ShadowMask[i] is zero, we assume that
+ // ShadowBytes[i] is constantly zero and doesn't need to be overwritten.
+ void copyToShadow(ArrayRef<uint8_t> ShadowMask, ArrayRef<uint8_t> ShadowBytes,
+ IRBuilder<> &IRB, Value *ShadowBase);
+ void copyToShadow(ArrayRef<uint8_t> ShadowMask, ArrayRef<uint8_t> ShadowBytes,
+ size_t Begin, size_t End, IRBuilder<> &IRB,
+ Value *ShadowBase);
+ void copyToShadowInline(ArrayRef<uint8_t> ShadowMask,
+ ArrayRef<uint8_t> ShadowBytes, size_t Begin,
+ size_t End, IRBuilder<> &IRB, Value *ShadowBase);
+
void poisonAlloca(Value *V, uint64_t Size, IRBuilder<> &IRB, bool DoPoison);
- void SetShadowToStackAfterReturnInlined(IRBuilder<> &IRB, Value *ShadowBase,
- int Size);
Value *createAllocaForLayout(IRBuilder<> &IRB, const ASanStackFrameLayout &L,
bool Dynamic);
PHINode *createPHI(IRBuilder<> &IRB, Value *Cond, Value *ValueIfTrue,
@@ -885,10 +967,15 @@ Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) {
Shadow = IRB.CreateLShr(Shadow, Mapping.Scale);
if (Mapping.Offset == 0) return Shadow;
// (Shadow >> scale) | offset
+ Value *ShadowBase;
+ if (LocalDynamicShadow)
+ ShadowBase = LocalDynamicShadow;
+ else
+ ShadowBase = ConstantInt::get(IntptrTy, Mapping.Offset);
if (Mapping.OrShadowOffset)
- return IRB.CreateOr(Shadow, ConstantInt::get(IntptrTy, Mapping.Offset));
+ return IRB.CreateOr(Shadow, ShadowBase);
else
- return IRB.CreateAdd(Shadow, ConstantInt::get(IntptrTy, Mapping.Offset));
+ return IRB.CreateAdd(Shadow, ShadowBase);
}
// Instrument memset/memmove/memcpy
@@ -911,7 +998,7 @@ void AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) {
}
/// Check if we want (and can) handle this alloca.
-bool AddressSanitizer::isInterestingAlloca(AllocaInst &AI) {
+bool AddressSanitizer::isInterestingAlloca(const AllocaInst &AI) {
auto PreviouslySeenAllocaInfo = ProcessedAllocas.find(&AI);
if (PreviouslySeenAllocaInfo != ProcessedAllocas.end())
@@ -920,27 +1007,32 @@ bool AddressSanitizer::isInterestingAlloca(AllocaInst &AI) {
bool IsInteresting =
(AI.getAllocatedType()->isSized() &&
// alloca() may be called with 0 size, ignore it.
- ((!AI.isStaticAlloca()) || getAllocaSizeInBytes(&AI) > 0) &&
+ ((!AI.isStaticAlloca()) || getAllocaSizeInBytes(AI) > 0) &&
// We are only interested in allocas not promotable to registers.
// Promotable allocas are common under -O0.
(!ClSkipPromotableAllocas || !isAllocaPromotable(&AI)) &&
// inalloca allocas are not treated as static, and we don't want
// dynamic alloca instrumentation for them as well.
- !AI.isUsedWithInAlloca());
+ !AI.isUsedWithInAlloca() &&
+ // swifterror allocas are register promoted by ISel
+ !AI.isSwiftError());
ProcessedAllocas[&AI] = IsInteresting;
return IsInteresting;
}
-/// If I is an interesting memory access, return the PointerOperand
-/// and set IsWrite/Alignment. Otherwise return nullptr.
Value *AddressSanitizer::isInterestingMemoryAccess(Instruction *I,
bool *IsWrite,
uint64_t *TypeSize,
- unsigned *Alignment) {
+ unsigned *Alignment,
+ Value **MaybeMask) {
// Skip memory accesses inserted by another instrumentation.
if (I->getMetadata("nosanitize")) return nullptr;
+ // Do not instrument the load fetching the dynamic shadow address.
+ if (LocalDynamicShadow == I)
+ return nullptr;
+
Value *PtrOperand = nullptr;
const DataLayout &DL = I->getModule()->getDataLayout();
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
@@ -967,14 +1059,50 @@ Value *AddressSanitizer::isInterestingMemoryAccess(Instruction *I,
*TypeSize = DL.getTypeStoreSizeInBits(XCHG->getCompareOperand()->getType());
*Alignment = 0;
PtrOperand = XCHG->getPointerOperand();
+ } else if (auto CI = dyn_cast<CallInst>(I)) {
+ auto *F = dyn_cast<Function>(CI->getCalledValue());
+ if (F && (F->getName().startswith("llvm.masked.load.") ||
+ F->getName().startswith("llvm.masked.store."))) {
+ unsigned OpOffset = 0;
+ if (F->getName().startswith("llvm.masked.store.")) {
+ if (!ClInstrumentWrites)
+ return nullptr;
+ // Masked store has an initial operand for the value.
+ OpOffset = 1;
+ *IsWrite = true;
+ } else {
+ if (!ClInstrumentReads)
+ return nullptr;
+ *IsWrite = false;
+ }
+
+ auto BasePtr = CI->getOperand(0 + OpOffset);
+ auto Ty = cast<PointerType>(BasePtr->getType())->getElementType();
+ *TypeSize = DL.getTypeStoreSizeInBits(Ty);
+ if (auto AlignmentConstant =
+ dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset)))
+ *Alignment = (unsigned)AlignmentConstant->getZExtValue();
+ else
+ *Alignment = 1; // No alignment guarantees. We probably got Undef
+ if (MaybeMask)
+ *MaybeMask = CI->getOperand(2 + OpOffset);
+ PtrOperand = BasePtr;
+ }
}
- // Do not instrument acesses from different address spaces; we cannot deal
- // with them.
if (PtrOperand) {
+    // Do not instrument accesses from different address spaces; we cannot deal
+ // with them.
Type *PtrTy = cast<PointerType>(PtrOperand->getType()->getScalarType());
if (PtrTy->getPointerAddressSpace() != 0)
return nullptr;
+
+ // Ignore swifterror addresses.
+ // swifterror memory addresses are mem2reg promoted by instruction
+ // selection. As such they cannot have regular uses like an instrumentation
+ // function and it makes no sense to track them as memory.
+ if (PtrOperand->isSwiftError())
+ return nullptr;
}
// Treat memory accesses to promotable allocas as non-interesting since they
@@ -1025,13 +1153,71 @@ void AddressSanitizer::instrumentPointerComparisonOrSubtraction(
IRB.CreateCall(F, Param);
}
+static void doInstrumentAddress(AddressSanitizer *Pass, Instruction *I,
+ Instruction *InsertBefore, Value *Addr,
+ unsigned Alignment, unsigned Granularity,
+ uint32_t TypeSize, bool IsWrite,
+ Value *SizeArgument, bool UseCalls,
+ uint32_t Exp) {
+ // Instrument a 1-, 2-, 4-, 8-, or 16- byte access with one check
+ // if the data is properly aligned.
+ if ((TypeSize == 8 || TypeSize == 16 || TypeSize == 32 || TypeSize == 64 ||
+ TypeSize == 128) &&
+ (Alignment >= Granularity || Alignment == 0 || Alignment >= TypeSize / 8))
+ return Pass->instrumentAddress(I, InsertBefore, Addr, TypeSize, IsWrite,
+ nullptr, UseCalls, Exp);
+ Pass->instrumentUnusualSizeOrAlignment(I, InsertBefore, Addr, TypeSize,
+ IsWrite, nullptr, UseCalls, Exp);
+}
+
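The fast-path condition in doInstrumentAddress, restated as a standalone predicate (a sketch; the default 8-byte shadow granularity is assumed in the test):

    #include <cstdint>

    // Accesses of 1, 2, 4, 8 or 16 bytes that are sufficiently aligned get a
    // single shadow check; everything else checks the first and last byte.
    static bool useSingleCheck(uint32_t TypeSizeInBits, unsigned Alignment,
                               unsigned Granularity) {
      bool PowerOfTwoSize = TypeSizeInBits == 8 || TypeSizeInBits == 16 ||
                            TypeSizeInBits == 32 || TypeSizeInBits == 64 ||
                            TypeSizeInBits == 128;
      bool AlignedEnough = Alignment >= Granularity || Alignment == 0 ||
                           Alignment >= TypeSizeInBits / 8;
      return PowerOfTwoSize && AlignedEnough;
    }

    int main() {
      return useSingleCheck(64, 8, 8) && !useSingleCheck(24, 1, 8) ? 0 : 1;
    }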
+static void instrumentMaskedLoadOrStore(AddressSanitizer *Pass,
+ const DataLayout &DL, Type *IntptrTy,
+ Value *Mask, Instruction *I,
+ Value *Addr, unsigned Alignment,
+ unsigned Granularity, uint32_t TypeSize,
+ bool IsWrite, Value *SizeArgument,
+ bool UseCalls, uint32_t Exp) {
+ auto *VTy = cast<PointerType>(Addr->getType())->getElementType();
+ uint64_t ElemTypeSize = DL.getTypeStoreSizeInBits(VTy->getScalarType());
+ unsigned Num = VTy->getVectorNumElements();
+ auto Zero = ConstantInt::get(IntptrTy, 0);
+ for (unsigned Idx = 0; Idx < Num; ++Idx) {
+ Value *InstrumentedAddress = nullptr;
+ Instruction *InsertBefore = I;
+ if (auto *Vector = dyn_cast<ConstantVector>(Mask)) {
+ // dyn_cast as we might get UndefValue
+ if (auto *Masked = dyn_cast<ConstantInt>(Vector->getOperand(Idx))) {
+ if (Masked->isNullValue())
+ // Mask is constant false, so no instrumentation needed.
+ continue;
+ // If we have a true or undef value, fall through to doInstrumentAddress
+ // with InsertBefore == I
+ }
+ } else {
+ IRBuilder<> IRB(I);
+ Value *MaskElem = IRB.CreateExtractElement(Mask, Idx);
+ TerminatorInst *ThenTerm = SplitBlockAndInsertIfThen(MaskElem, I, false);
+ InsertBefore = ThenTerm;
+ }
+
+ IRBuilder<> IRB(InsertBefore);
+ InstrumentedAddress =
+ IRB.CreateGEP(Addr, {Zero, ConstantInt::get(IntptrTy, Idx)});
+ doInstrumentAddress(Pass, I, InsertBefore, InstrumentedAddress, Alignment,
+ Granularity, ElemTypeSize, IsWrite, SizeArgument,
+ UseCalls, Exp);
+ }
+}
+
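A scalar model of the per-lane walk above, assuming a hypothetical 4-lane masked access with a compile-time-constant mask:

    #include <cstdio>

    int main() {
      const bool MaskIsConstant = true;
      const bool ConstMask[4] = {true, false, true, true};

      for (unsigned Idx = 0; Idx < 4; ++Idx) {
        // Constant-false lanes are skipped entirely.
        if (MaskIsConstant && !ConstMask[Idx])
          continue;
        // For a non-constant mask, the pass instead extracts the mask element
        // and branches on it, instrumenting the lane address only in the taken
        // block; here we just record which lanes would be checked.
        std::printf("instrument lane %u\n", Idx);
      }
    }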
void AddressSanitizer::instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis,
Instruction *I, bool UseCalls,
const DataLayout &DL) {
bool IsWrite = false;
unsigned Alignment = 0;
uint64_t TypeSize = 0;
- Value *Addr = isInterestingMemoryAccess(I, &IsWrite, &TypeSize, &Alignment);
+ Value *MaybeMask = nullptr;
+ Value *Addr =
+ isInterestingMemoryAccess(I, &IsWrite, &TypeSize, &Alignment, &MaybeMask);
assert(Addr);
// Optimization experiments.
@@ -1073,15 +1259,14 @@ void AddressSanitizer::instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis,
NumInstrumentedReads++;
unsigned Granularity = 1 << Mapping.Scale;
- // Instrument a 1-, 2-, 4-, 8-, or 16- byte access with one check
- // if the data is properly aligned.
- if ((TypeSize == 8 || TypeSize == 16 || TypeSize == 32 || TypeSize == 64 ||
- TypeSize == 128) &&
- (Alignment >= Granularity || Alignment == 0 || Alignment >= TypeSize / 8))
- return instrumentAddress(I, I, Addr, TypeSize, IsWrite, nullptr, UseCalls,
- Exp);
- instrumentUnusualSizeOrAlignment(I, Addr, TypeSize, IsWrite, nullptr,
- UseCalls, Exp);
+ if (MaybeMask) {
+ instrumentMaskedLoadOrStore(this, DL, IntptrTy, MaybeMask, I, Addr,
+ Alignment, Granularity, TypeSize, IsWrite,
+ nullptr, UseCalls, Exp);
+ } else {
+ doInstrumentAddress(this, I, I, Addr, Alignment, Granularity, TypeSize,
+ IsWrite, nullptr, UseCalls, Exp);
+ }
}
Instruction *AddressSanitizer::generateCrashCode(Instruction *InsertBefore,
@@ -1196,9 +1381,9 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
// and the last bytes. We call __asan_report_*_n(addr, real_size) to be able
// to report the actual access size.
void AddressSanitizer::instrumentUnusualSizeOrAlignment(
- Instruction *I, Value *Addr, uint32_t TypeSize, bool IsWrite,
- Value *SizeArgument, bool UseCalls, uint32_t Exp) {
- IRBuilder<> IRB(I);
+ Instruction *I, Instruction *InsertBefore, Value *Addr, uint32_t TypeSize,
+ bool IsWrite, Value *SizeArgument, bool UseCalls, uint32_t Exp) {
+ IRBuilder<> IRB(InsertBefore);
Value *Size = ConstantInt::get(IntptrTy, TypeSize / 8);
Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
if (UseCalls) {
@@ -1212,8 +1397,8 @@ void AddressSanitizer::instrumentUnusualSizeOrAlignment(
Value *LastByte = IRB.CreateIntToPtr(
IRB.CreateAdd(AddrLong, ConstantInt::get(IntptrTy, TypeSize / 8 - 1)),
Addr->getType());
- instrumentAddress(I, I, Addr, 8, IsWrite, Size, false, Exp);
- instrumentAddress(I, I, LastByte, 8, IsWrite, Size, false, Exp);
+ instrumentAddress(I, InsertBefore, Addr, 8, IsWrite, Size, false, Exp);
+ instrumentAddress(I, InsertBefore, LastByte, 8, IsWrite, Size, false, Exp);
}
}
@@ -1361,6 +1546,16 @@ bool AddressSanitizerModule::ShouldUseMachOGlobalsSection() const {
return false;
}
+StringRef AddressSanitizerModule::getGlobalMetadataSection() const {
+ switch (TargetTriple.getObjectFormat()) {
+ case Triple::COFF: return ".ASAN$GL";
+ case Triple::ELF: return "asan_globals";
+ case Triple::MachO: return "__DATA,__asan_globals,regular";
+ default: break;
+ }
+ llvm_unreachable("unsupported object format");
+}
+
void AddressSanitizerModule::initializeCallbacks(Module &M) {
IRBuilder<> IRB(*C);
@@ -1383,17 +1578,173 @@ void AddressSanitizerModule::initializeCallbacks(Module &M) {
// Declare the functions that find globals in a shared object and then invoke
// the (un)register function on them.
- AsanRegisterImageGlobals = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction(kAsanRegisterImageGlobalsName,
- IRB.getVoidTy(), IntptrTy, nullptr));
+ AsanRegisterImageGlobals =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ kAsanRegisterImageGlobalsName, IRB.getVoidTy(), IntptrTy, nullptr));
AsanRegisterImageGlobals->setLinkage(Function::ExternalLinkage);
- AsanUnregisterImageGlobals = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction(kAsanUnregisterImageGlobalsName,
- IRB.getVoidTy(), IntptrTy, nullptr));
+ AsanUnregisterImageGlobals =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ kAsanUnregisterImageGlobalsName, IRB.getVoidTy(), IntptrTy, nullptr));
AsanUnregisterImageGlobals->setLinkage(Function::ExternalLinkage);
}
+// Put the metadata and the instrumented global in the same group. This ensures
+// that the metadata is discarded if the instrumented global is discarded.
+void AddressSanitizerModule::SetComdatForGlobalMetadata(
+ GlobalVariable *G, GlobalVariable *Metadata) {
+ Module &M = *G->getParent();
+ Comdat *C = G->getComdat();
+ if (!C) {
+ if (!G->hasName()) {
+ // If G is unnamed, it must be internal. Give it an artificial name
+ // so we can put it in a comdat.
+ assert(G->hasLocalLinkage());
+ G->setName(Twine(kAsanGenPrefix) + "_anon_global");
+ }
+ C = M.getOrInsertComdat(G->getName());
+ // Make this IMAGE_COMDAT_SELECT_NODUPLICATES on COFF.
+ if (TargetTriple.isOSBinFormatCOFF())
+ C->setSelectionKind(Comdat::NoDuplicates);
+ G->setComdat(C);
+ }
+
+ assert(G->hasComdat());
+ Metadata->setComdat(G->getComdat());
+}
+
+// Create a separate metadata global and put it in the appropriate ASan
+// global registration section.
+GlobalVariable *
+AddressSanitizerModule::CreateMetadataGlobal(Module &M, Constant *Initializer,
+ StringRef OriginalName) {
+ GlobalVariable *Metadata =
+ new GlobalVariable(M, Initializer->getType(), false,
+ GlobalVariable::InternalLinkage, Initializer,
+ Twine("__asan_global_") +
+ GlobalValue::getRealLinkageName(OriginalName));
+ Metadata->setSection(getGlobalMetadataSection());
+ return Metadata;
+}
+
+IRBuilder<> AddressSanitizerModule::CreateAsanModuleDtor(Module &M) {
+ Function *AsanDtorFunction =
+ Function::Create(FunctionType::get(Type::getVoidTy(*C), false),
+ GlobalValue::InternalLinkage, kAsanModuleDtorName, &M);
+ BasicBlock *AsanDtorBB = BasicBlock::Create(*C, "", AsanDtorFunction);
+ appendToGlobalDtors(M, AsanDtorFunction, kAsanCtorAndDtorPriority);
+
+ return IRBuilder<>(ReturnInst::Create(*C, AsanDtorBB));
+}
+
+void AddressSanitizerModule::InstrumentGlobalsCOFF(
+ IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers) {
+ assert(ExtendedGlobals.size() == MetadataInitializers.size());
+ auto &DL = M.getDataLayout();
+
+ for (size_t i = 0; i < ExtendedGlobals.size(); i++) {
+ Constant *Initializer = MetadataInitializers[i];
+ GlobalVariable *G = ExtendedGlobals[i];
+ GlobalVariable *Metadata =
+ CreateMetadataGlobal(M, Initializer, G->getName());
+
+ // The MSVC linker always inserts padding when linking incrementally. We
+ // cope with that by aligning each struct to its size, which must be a power
+ // of two.
+ unsigned SizeOfGlobalStruct = DL.getTypeAllocSize(Initializer->getType());
+ assert(isPowerOf2_32(SizeOfGlobalStruct) &&
+ "global metadata will not be padded appropriately");
+ Metadata->setAlignment(SizeOfGlobalStruct);
+
+ SetComdatForGlobalMetadata(G, Metadata);
+ }
+}
+
+void AddressSanitizerModule::InstrumentGlobalsMachO(
+ IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers) {
+ assert(ExtendedGlobals.size() == MetadataInitializers.size());
+
+ // On recent Mach-O platforms, use a structure which binds the liveness of
+ // the global variable to the metadata struct. Keep the list of "Liveness" GV
+ // created to be added to llvm.compiler.used
+ StructType *LivenessTy = StructType::get(IntptrTy, IntptrTy, nullptr);
+ SmallVector<GlobalValue *, 16> LivenessGlobals(ExtendedGlobals.size());
+
+ for (size_t i = 0; i < ExtendedGlobals.size(); i++) {
+ Constant *Initializer = MetadataInitializers[i];
+ GlobalVariable *G = ExtendedGlobals[i];
+ GlobalVariable *Metadata =
+ CreateMetadataGlobal(M, Initializer, G->getName());
+
+ // On recent Mach-O platforms, we emit the global metadata in a way that
+ // allows the linker to properly strip dead globals.
+ auto LivenessBinder = ConstantStruct::get(
+ LivenessTy, Initializer->getAggregateElement(0u),
+ ConstantExpr::getPointerCast(Metadata, IntptrTy), nullptr);
+ GlobalVariable *Liveness = new GlobalVariable(
+ M, LivenessTy, false, GlobalVariable::InternalLinkage, LivenessBinder,
+ Twine("__asan_binder_") + G->getName());
+ Liveness->setSection("__DATA,__asan_liveness,regular,live_support");
+ LivenessGlobals[i] = Liveness;
+ }
+
+ // Update llvm.compiler.used, adding the new liveness globals. This is
+ // needed so that during LTO these variables stay alive. The alternative
+ // would be to have the linker handling the LTO symbols, but libLTO
+ // current API does not expose access to the section for each symbol.
+ if (!LivenessGlobals.empty())
+ appendToCompilerUsed(M, LivenessGlobals);
+
+ // RegisteredFlag serves two purposes. First, we can pass it to dladdr()
+ // to look up the loaded image that contains it. Second, we can store in it
+ // whether registration has already occurred, to prevent duplicate
+ // registration.
+ //
+ // common linkage ensures that there is only one global per shared library.
+ GlobalVariable *RegisteredFlag = new GlobalVariable(
+ M, IntptrTy, false, GlobalVariable::CommonLinkage,
+ ConstantInt::get(IntptrTy, 0), kAsanGlobalsRegisteredFlagName);
+ RegisteredFlag->setVisibility(GlobalVariable::HiddenVisibility);
+
+ IRB.CreateCall(AsanRegisterImageGlobals,
+ {IRB.CreatePointerCast(RegisteredFlag, IntptrTy)});
+
+ // We also need to unregister globals at the end, e.g., when a shared library
+ // gets closed.
+ IRBuilder<> IRB_Dtor = CreateAsanModuleDtor(M);
+ IRB_Dtor.CreateCall(AsanUnregisterImageGlobals,
+ {IRB.CreatePointerCast(RegisteredFlag, IntptrTy)});
+}
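
Conceptually the Mach-O path emits one liveness pair per instrumented global plus a single per-image flag; the "live_support" section attribute is what lets the static linker drop a metadata/liveness pair together with its dead global. A rough sketch of the runtime-facing shape, with the struct and the registration callback as illustrative stand-ins rather than the real __asan interface:

    #include <cstdint>

    // Hypothetical mirror of the liveness binder emitted per global.
    struct LivenessBinder {
      uintptr_t GlobalBeg;   // element 0 of the metadata initializer: the
                             // instrumented global's address
      uintptr_t Metadata;    // address of the full metadata struct
    };

    // One flag per image, emitted with common linkage so multiple objects in
    // the same shared library coalesce to a single definition.
    static uintptr_t RegisteredFlag = 0;

    void registerImageGlobalsOnce(void (*RegisterImage)(uintptr_t *Flag)) {
      if (RegisteredFlag)              // registration already happened here
        return;
      RegisterImage(&RegisteredFlag);  // the runtime can also dladdr() the
                                       // flag to find the containing image
    }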
+
+void AddressSanitizerModule::InstrumentGlobalsWithMetadataArray(
+ IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers) {
+ assert(ExtendedGlobals.size() == MetadataInitializers.size());
+ unsigned N = ExtendedGlobals.size();
+ assert(N > 0);
+
+ // On platforms that don't have a custom metadata section, we emit an array
+ // of global metadata structures.
+ ArrayType *ArrayOfGlobalStructTy =
+ ArrayType::get(MetadataInitializers[0]->getType(), N);
+ auto AllGlobals = new GlobalVariable(
+ M, ArrayOfGlobalStructTy, false, GlobalVariable::InternalLinkage,
+ ConstantArray::get(ArrayOfGlobalStructTy, MetadataInitializers), "");
+
+ IRB.CreateCall(AsanRegisterGlobals,
+ {IRB.CreatePointerCast(AllGlobals, IntptrTy),
+ ConstantInt::get(IntptrTy, N)});
+
+ // We also need to unregister globals at the end, e.g., when a shared library
+ // gets closed.
+ IRBuilder<> IRB_Dtor = CreateAsanModuleDtor(M);
+ IRB_Dtor.CreateCall(AsanUnregisterGlobals,
+ {IRB.CreatePointerCast(AllGlobals, IntptrTy),
+ ConstantInt::get(IntptrTy, N)});
+}
+
// This function replaces all global variables with new variables that have
// trailing redzones. It also creates a function that poisons
// redzones and inserts this function into llvm.global_ctors.
@@ -1409,6 +1760,8 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) {
size_t n = GlobalsToChange.size();
if (n == 0) return false;
+ auto &DL = M.getDataLayout();
+
// A global is described by a structure
// size_t beg;
// size_t size;
@@ -1422,6 +1775,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) {
StructType *GlobalStructTy =
StructType::get(IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy,
IntptrTy, IntptrTy, IntptrTy, nullptr);
+ SmallVector<GlobalVariable *, 16> NewGlobals(n);
SmallVector<Constant *, 16> Initializers(n);
bool HasDynamicallyInitializedGlobals = false;
@@ -1431,7 +1785,6 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) {
GlobalVariable *ModuleName = createPrivateGlobalForString(
M, M.getModuleIdentifier(), /*AllowMerging*/ false);
- auto &DL = M.getDataLayout();
for (size_t i = 0; i < n; i++) {
static const uint64_t kMaxGlobalRedzone = 1 << 18;
GlobalVariable *G = GlobalsToChange[i];
@@ -1472,6 +1825,21 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) {
NewGlobal->copyAttributesFrom(G);
NewGlobal->setAlignment(MinRZ);
+ // Move null-terminated C strings to "__asan_cstring" section on Darwin.
+ if (TargetTriple.isOSBinFormatMachO() && !G->hasSection() &&
+ G->isConstant()) {
+ auto Seq = dyn_cast<ConstantDataSequential>(G->getInitializer());
+ if (Seq && Seq->isCString())
+ NewGlobal->setSection("__TEXT,__asan_cstring,regular");
+ }
+
+ // Transfer the debug info. The payload starts at offset zero so we can
+ // copy the debug info over as is.
+ SmallVector<DIGlobalVariableExpression *, 1> GVs;
+ G->getDebugInfo(GVs);
+ for (auto *GV : GVs)
+ NewGlobal->addDebugInfo(GV);
+
Value *Indices2[2];
Indices2[0] = IRB.getInt32(0);
Indices2[1] = IRB.getInt32(0);
@@ -1480,6 +1848,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) {
ConstantExpr::getGetElementPtr(NewTy, NewGlobal, Indices2, true));
NewGlobal->takeName(G);
G->eraseFromParent();
+ NewGlobals[i] = NewGlobal;
Constant *SourceLoc;
if (!MD.SourceLoc.empty()) {
@@ -1492,7 +1861,8 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) {
Constant *ODRIndicator = ConstantExpr::getNullValue(IRB.getInt8PtrTy());
GlobalValue *InstrumentedGlobal = NewGlobal;
- bool CanUsePrivateAliases = TargetTriple.isOSBinFormatELF();
+ bool CanUsePrivateAliases =
+ TargetTriple.isOSBinFormatELF() || TargetTriple.isOSBinFormatMachO();
if (CanUsePrivateAliases && ClUsePrivateAliasForGlobals) {
// Create local alias for NewGlobal to avoid crash on ODR between
// instrumented and non-instrumented libraries.
@@ -1515,7 +1885,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) {
InstrumentedGlobal = GA;
}
- Initializers[i] = ConstantStruct::get(
+ Constant *Initializer = ConstantStruct::get(
GlobalStructTy,
ConstantExpr::getPointerCast(InstrumentedGlobal, IntptrTy),
ConstantInt::get(IntptrTy, SizeInBytes),
@@ -1528,88 +1898,22 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) {
if (ClInitializers && MD.IsDynInit) HasDynamicallyInitializedGlobals = true;
DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n");
- }
+ Initializers[i] = Initializer;
+ }
- GlobalVariable *AllGlobals = nullptr;
- GlobalVariable *RegisteredFlag = nullptr;
-
- // On recent Mach-O platforms, we emit the global metadata in a way that
- // allows the linker to properly strip dead globals.
- if (ShouldUseMachOGlobalsSection()) {
- // RegisteredFlag serves two purposes. First, we can pass it to dladdr()
- // to look up the loaded image that contains it. Second, we can store in it
- // whether registration has already occurred, to prevent duplicate
- // registration.
- //
- // Common linkage allows us to coalesce needles defined in each object
- // file so that there's only one per shared library.
- RegisteredFlag = new GlobalVariable(
- M, IntptrTy, false, GlobalVariable::CommonLinkage,
- ConstantInt::get(IntptrTy, 0), kAsanGlobalsRegisteredFlagName);
-
- // We also emit a structure which binds the liveness of the global
- // variable to the metadata struct.
- StructType *LivenessTy = StructType::get(IntptrTy, IntptrTy, nullptr);
-
- for (size_t i = 0; i < n; i++) {
- GlobalVariable *Metadata = new GlobalVariable(
- M, GlobalStructTy, false, GlobalVariable::InternalLinkage,
- Initializers[i], "");
- Metadata->setSection("__DATA,__asan_globals,regular");
- Metadata->setAlignment(1); // don't leave padding in between
-
- auto LivenessBinder = ConstantStruct::get(LivenessTy,
- Initializers[i]->getAggregateElement(0u),
- ConstantExpr::getPointerCast(Metadata, IntptrTy),
- nullptr);
- GlobalVariable *Liveness = new GlobalVariable(
- M, LivenessTy, false, GlobalVariable::InternalLinkage,
- LivenessBinder, "");
- Liveness->setSection("__DATA,__asan_liveness,regular,live_support");
- }
+ if (TargetTriple.isOSBinFormatCOFF()) {
+ InstrumentGlobalsCOFF(IRB, M, NewGlobals, Initializers);
+ } else if (ShouldUseMachOGlobalsSection()) {
+ InstrumentGlobalsMachO(IRB, M, NewGlobals, Initializers);
} else {
- // On all other platfoms, we just emit an array of global metadata
- // structures.
- ArrayType *ArrayOfGlobalStructTy = ArrayType::get(GlobalStructTy, n);
- AllGlobals = new GlobalVariable(
- M, ArrayOfGlobalStructTy, false, GlobalVariable::InternalLinkage,
- ConstantArray::get(ArrayOfGlobalStructTy, Initializers), "");
+ InstrumentGlobalsWithMetadataArray(IRB, M, NewGlobals, Initializers);
}
// Create calls for poisoning before initializers run and unpoisoning after.
if (HasDynamicallyInitializedGlobals)
createInitializerPoisonCalls(M, ModuleName);
- // Create a call to register the globals with the runtime.
- if (ShouldUseMachOGlobalsSection()) {
- IRB.CreateCall(AsanRegisterImageGlobals,
- {IRB.CreatePointerCast(RegisteredFlag, IntptrTy)});
- } else {
- IRB.CreateCall(AsanRegisterGlobals,
- {IRB.CreatePointerCast(AllGlobals, IntptrTy),
- ConstantInt::get(IntptrTy, n)});
- }
-
- // We also need to unregister globals at the end, e.g., when a shared library
- // gets closed.
- Function *AsanDtorFunction =
- Function::Create(FunctionType::get(Type::getVoidTy(*C), false),
- GlobalValue::InternalLinkage, kAsanModuleDtorName, &M);
- BasicBlock *AsanDtorBB = BasicBlock::Create(*C, "", AsanDtorFunction);
- IRBuilder<> IRB_Dtor(ReturnInst::Create(*C, AsanDtorBB));
-
- if (ShouldUseMachOGlobalsSection()) {
- IRB_Dtor.CreateCall(AsanUnregisterImageGlobals,
- {IRB.CreatePointerCast(RegisteredFlag, IntptrTy)});
- } else {
- IRB_Dtor.CreateCall(AsanUnregisterGlobals,
- {IRB.CreatePointerCast(AllGlobals, IntptrTy),
- ConstantInt::get(IntptrTy, n)});
- }
-
- appendToGlobalDtors(M, AsanDtorFunction, kAsanCtorAndDtorPriority);
-
DEBUG(dbgs() << M);
return true;
}
@@ -1737,6 +2041,17 @@ bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) {
return false;
}
+void AddressSanitizer::maybeInsertDynamicShadowAtFunctionEntry(Function &F) {
+ // Generate code only when dynamic addressing is needed.
+ if (Mapping.Offset != kDynamicShadowSentinel)
+ return;
+
+ IRBuilder<> IRB(&F.front().front());
+ Value *GlobalDynamicAddress = F.getParent()->getOrInsertGlobal(
+ kAsanShadowMemoryDynamicAddress, IntptrTy);
+ LocalDynamicShadow = IRB.CreateLoad(GlobalDynamicAddress);
+}
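
When Mapping.Offset is the dynamic-shadow sentinel, the shadow base is not a compile-time constant: it is loaded once per function from __asan_shadow_memory_dynamic_address and then used in place of the fixed offset in the usual mem-to-shadow computation. A sketch of that arithmetic, assuming the common shift-then-add mapping; the scale and the loaded base are example inputs, not values taken from this patch:

    #include <cstdint>

    // Shadow address for Addr once the dynamic shadow base has been loaded.
    uintptr_t memToShadowSketch(uintptr_t Addr, uintptr_t DynamicShadowBase,
                                unsigned Scale) {
      return (Addr >> Scale) + DynamicShadowBase;
    }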
+
void AddressSanitizer::markEscapedLocalAllocas(Function &F) {
// Find the one possible call to llvm.localescape and pre-mark allocas passed
// to it as uninteresting. This assumes we haven't started processing allocas
@@ -1768,20 +2083,29 @@ void AddressSanitizer::markEscapedLocalAllocas(Function &F) {
bool AddressSanitizer::runOnFunction(Function &F) {
if (&F == AsanCtorFunction) return false;
if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false;
- DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n");
- initializeCallbacks(*F.getParent());
+ if (!ClDebugFunc.empty() && ClDebugFunc == F.getName()) return false;
+ if (F.getName().startswith("__asan_")) return false;
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ bool FunctionModified = false;
// If needed, insert __asan_init before checking for SanitizeAddress attr.
- maybeInsertAsanInitAtFunctionEntry(F);
+ // This function needs to be called even if the function body is not
+ // instrumented.
+ if (maybeInsertAsanInitAtFunctionEntry(F))
+ FunctionModified = true;
+
+ // Leave if the function doesn't need instrumentation.
+ if (!F.hasFnAttribute(Attribute::SanitizeAddress)) return FunctionModified;
- if (!F.hasFnAttribute(Attribute::SanitizeAddress)) return false;
+ DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n");
- if (!ClDebugFunc.empty() && ClDebugFunc != F.getName()) return false;
+ initializeCallbacks(*F.getParent());
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
FunctionStateRAII CleanupObj(this);
+ maybeInsertDynamicShadowAtFunctionEntry(F);
+
// We can't instrument allocas used with llvm.localescape. Only static allocas
// can be passed to that intrinsic.
markEscapedLocalAllocas(F);
@@ -1807,11 +2131,20 @@ bool AddressSanitizer::runOnFunction(Function &F) {
int NumInsnsPerBB = 0;
for (auto &Inst : BB) {
if (LooksLikeCodeInBug11395(&Inst)) return false;
+ Value *MaybeMask = nullptr;
if (Value *Addr = isInterestingMemoryAccess(&Inst, &IsWrite, &TypeSize,
- &Alignment)) {
+ &Alignment, &MaybeMask)) {
if (ClOpt && ClOptSameTemp) {
- if (!TempsToInstrument.insert(Addr).second)
- continue; // We've seen this temp in the current BB.
+ // If we have a mask, skip instrumentation if we've already
+ // instrumented the full object. But don't add to TempsToInstrument
+ // because we might get another load/store with a different mask.
+ if (MaybeMask) {
+ if (TempsToInstrument.count(Addr))
+ continue; // We've seen this (whole) temp in the current BB.
+ } else {
+ if (!TempsToInstrument.insert(Addr).second)
+ continue; // We've seen this temp in the current BB.
+ }
}
} else if (ClInvalidPointerPairs &&
isInterestingPointerComparisonOrSubtraction(&Inst)) {
@@ -1874,11 +2207,13 @@ bool AddressSanitizer::runOnFunction(Function &F) {
NumInstrumented++;
}
- bool res = NumInstrumented > 0 || ChangedStack || !NoReturnCalls.empty();
+ if (NumInstrumented > 0 || ChangedStack || !NoReturnCalls.empty())
+ FunctionModified = true;
- DEBUG(dbgs() << "ASAN done instrumenting: " << res << " " << F << "\n");
+ DEBUG(dbgs() << "ASAN done instrumenting: " << FunctionModified << " "
+ << F << "\n");
- return res;
+ return FunctionModified;
}
// Workaround for bug 11395: we don't want to instrument stack in functions
@@ -1913,6 +2248,15 @@ void FunctionStackPoisoner::initializeCallbacks(Module &M) {
IntptrTy, IntptrTy, nullptr));
}
+ for (size_t Val : {0x00, 0xf1, 0xf2, 0xf3, 0xf5, 0xf8}) {
+ std::ostringstream Name;
+ Name << kAsanSetShadowPrefix;
+ Name << std::setw(2) << std::setfill('0') << std::hex << Val;
+ AsanSetShadowFunc[Val] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ Name.str(), IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
+ }
+
AsanAllocaPoisonFunc = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
kAsanAllocaPoison, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
AsanAllocasUnpoisonFunc =
@@ -1920,31 +2264,93 @@ void FunctionStackPoisoner::initializeCallbacks(Module &M) {
kAsanAllocasUnpoison, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
}
-void FunctionStackPoisoner::poisonRedZones(ArrayRef<uint8_t> ShadowBytes,
- IRBuilder<> &IRB, Value *ShadowBase,
- bool DoPoison) {
- size_t n = ShadowBytes.size();
- size_t i = 0;
- // We need to (un)poison n bytes of stack shadow. Poison as many as we can
- // using 64-bit stores (if we are on 64-bit arch), then poison the rest
- // with 32-bit stores, then with 16-byte stores, then with 8-byte stores.
- for (size_t LargeStoreSizeInBytes = ASan.LongSize / 8;
- LargeStoreSizeInBytes != 0; LargeStoreSizeInBytes /= 2) {
- for (; i + LargeStoreSizeInBytes - 1 < n; i += LargeStoreSizeInBytes) {
- uint64_t Val = 0;
- for (size_t j = 0; j < LargeStoreSizeInBytes; j++) {
- if (F.getParent()->getDataLayout().isLittleEndian())
- Val |= (uint64_t)ShadowBytes[i + j] << (8 * j);
- else
- Val = (Val << 8) | ShadowBytes[i + j];
- }
- if (!Val) continue;
- Value *Ptr = IRB.CreateAdd(ShadowBase, ConstantInt::get(IntptrTy, i));
- Type *StoreTy = Type::getIntNTy(*C, LargeStoreSizeInBytes * 8);
- Value *Poison = ConstantInt::get(StoreTy, DoPoison ? Val : 0);
- IRB.CreateStore(Poison, IRB.CreateIntToPtr(Ptr, StoreTy->getPointerTo()));
+void FunctionStackPoisoner::copyToShadowInline(ArrayRef<uint8_t> ShadowMask,
+ ArrayRef<uint8_t> ShadowBytes,
+ size_t Begin, size_t End,
+ IRBuilder<> &IRB,
+ Value *ShadowBase) {
+ if (Begin >= End)
+ return;
+
+ const size_t LargestStoreSizeInBytes =
+ std::min<size_t>(sizeof(uint64_t), ASan.LongSize / 8);
+
+ const bool IsLittleEndian = F.getParent()->getDataLayout().isLittleEndian();
+
+ // Poison the given range in shadow using the largest store size, skipping
+ // leading and trailing zeros in ShadowMask. Zeros never change, so they need
+ // neither poisoning nor unpoisoning. Still, we don't mind if some of them
+ // end up in the middle of a store.
+ for (size_t i = Begin; i < End;) {
+ if (!ShadowMask[i]) {
+ assert(!ShadowBytes[i]);
+ ++i;
+ continue;
+ }
+
+ size_t StoreSizeInBytes = LargestStoreSizeInBytes;
+ // Fit store size into the range.
+ while (StoreSizeInBytes > End - i)
+ StoreSizeInBytes /= 2;
+
+ // Minimize store size by trimming trailing zeros.
+ for (size_t j = StoreSizeInBytes - 1; j && !ShadowMask[i + j]; --j) {
+ while (j <= StoreSizeInBytes / 2)
+ StoreSizeInBytes /= 2;
+ }
+
+ uint64_t Val = 0;
+ for (size_t j = 0; j < StoreSizeInBytes; j++) {
+ if (IsLittleEndian)
+ Val |= (uint64_t)ShadowBytes[i + j] << (8 * j);
+ else
+ Val = (Val << 8) | ShadowBytes[i + j];
+ }
+
+ Value *Ptr = IRB.CreateAdd(ShadowBase, ConstantInt::get(IntptrTy, i));
+ Value *Poison = IRB.getIntN(StoreSizeInBytes * 8, Val);
+ IRB.CreateAlignedStore(
+ Poison, IRB.CreateIntToPtr(Ptr, Poison->getType()->getPointerTo()), 1);
+
+ i += StoreSizeInBytes;
+ }
+}
+
+void FunctionStackPoisoner::copyToShadow(ArrayRef<uint8_t> ShadowMask,
+ ArrayRef<uint8_t> ShadowBytes,
+ IRBuilder<> &IRB, Value *ShadowBase) {
+ copyToShadow(ShadowMask, ShadowBytes, 0, ShadowMask.size(), IRB, ShadowBase);
+}
+
+void FunctionStackPoisoner::copyToShadow(ArrayRef<uint8_t> ShadowMask,
+ ArrayRef<uint8_t> ShadowBytes,
+ size_t Begin, size_t End,
+ IRBuilder<> &IRB, Value *ShadowBase) {
+ assert(ShadowMask.size() == ShadowBytes.size());
+ size_t Done = Begin;
+ for (size_t i = Begin, j = Begin + 1; i < End; i = j++) {
+ if (!ShadowMask[i]) {
+ assert(!ShadowBytes[i]);
+ continue;
+ }
+ uint8_t Val = ShadowBytes[i];
+ if (!AsanSetShadowFunc[Val])
+ continue;
+
+ // Skip same values.
+ for (; j < End && ShadowMask[j] && Val == ShadowBytes[j]; ++j) {
+ }
+
+ if (j - i >= ClMaxInlinePoisoningSize) {
+ copyToShadowInline(ShadowMask, ShadowBytes, Done, i, IRB, ShadowBase);
+ IRB.CreateCall(AsanSetShadowFunc[Val],
+ {IRB.CreateAdd(ShadowBase, ConstantInt::get(IntptrTy, i)),
+ ConstantInt::get(IntptrTy, j - i)});
+ Done = j;
}
}
+
+ copyToShadowInline(ShadowMask, ShadowBytes, Done, End, IRB, ShadowBase);
}
// Fake stack allocator (asan_fake_stack.h) has 11 size classes
@@ -1957,26 +2363,6 @@ static int StackMallocSizeClass(uint64_t LocalStackSize) {
llvm_unreachable("impossible LocalStackSize");
}
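
For the later hunk that computes ClassSize as kMinStackMallocSize << StackMallocIdx: the fake-stack class sizes double per class, and StackMallocSizeClass (whose body is elided above) picks the first class big enough for the frame. A sketch of that selection, with the 64-byte minimum class size taken as an assumption rather than from this diff:

    #include <cstdint>

    // Assumed minimum fake-stack frame size (64 bytes); not stated in this diff.
    static const uint64_t kMinStackMallocSizeSketch = 1 << 6;

    // Pick the smallest of the 11 size classes that can hold LocalStackSize.
    int stackMallocSizeClassSketch(uint64_t LocalStackSize) {
      for (int i = 0; i <= 10; i++)
        if (LocalStackSize <= (kMinStackMallocSizeSketch << i))
          return i;
      return -1; // frame too large for the fake stack
    }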
-// Set Size bytes starting from ShadowBase to kAsanStackAfterReturnMagic.
-// We can not use MemSet intrinsic because it may end up calling the actual
-// memset. Size is a multiple of 8.
-// Currently this generates 8-byte stores on x86_64; it may be better to
-// generate wider stores.
-void FunctionStackPoisoner::SetShadowToStackAfterReturnInlined(
- IRBuilder<> &IRB, Value *ShadowBase, int Size) {
- assert(!(Size % 8));
-
- // kAsanStackAfterReturnMagic is 0xf5.
- const uint64_t kAsanStackAfterReturnMagic64 = 0xf5f5f5f5f5f5f5f5ULL;
-
- for (int i = 0; i < Size; i += 8) {
- Value *p = IRB.CreateAdd(ShadowBase, ConstantInt::get(IntptrTy, i));
- IRB.CreateStore(
- ConstantInt::get(IRB.getInt64Ty(), kAsanStackAfterReturnMagic64),
- IRB.CreateIntToPtr(p, IRB.getInt64Ty()->getPointerTo()));
- }
-}
-
PHINode *FunctionStackPoisoner::createPHI(IRBuilder<> &IRB, Value *Cond,
Value *ValueIfTrue,
Instruction *ThenTerm,
@@ -2015,37 +2401,39 @@ void FunctionStackPoisoner::createDynamicAllocasInitStorage() {
DynamicAllocaLayout->setAlignment(32);
}
-void FunctionStackPoisoner::poisonStack() {
- assert(AllocaVec.size() > 0 || DynamicAllocaVec.size() > 0);
+void FunctionStackPoisoner::processDynamicAllocas() {
+ if (!ClInstrumentDynamicAllocas || DynamicAllocaVec.empty()) {
+ assert(DynamicAllocaPoisonCallVec.empty());
+ return;
+ }
- // Insert poison calls for lifetime intrinsics for alloca.
- bool HavePoisonedStaticAllocas = false;
- for (const auto &APC : AllocaPoisonCallVec) {
+ // Insert poison calls for lifetime intrinsics for dynamic allocas.
+ for (const auto &APC : DynamicAllocaPoisonCallVec) {
assert(APC.InsBefore);
assert(APC.AI);
assert(ASan.isInterestingAlloca(*APC.AI));
- bool IsDynamicAlloca = !(*APC.AI).isStaticAlloca();
- if (!ClInstrumentAllocas && IsDynamicAlloca)
- continue;
+ assert(!APC.AI->isStaticAlloca());
IRBuilder<> IRB(APC.InsBefore);
poisonAlloca(APC.AI, APC.Size, IRB, APC.DoPoison);
// Dynamic allocas will be unpoisoned unconditionally below in
// unpoisonDynamicAllocas.
// Flag that we need unpoison static allocas.
- HavePoisonedStaticAllocas |= (APC.DoPoison && !IsDynamicAlloca);
}
- if (ClInstrumentAllocas && DynamicAllocaVec.size() > 0) {
- // Handle dynamic allocas.
- createDynamicAllocasInitStorage();
- for (auto &AI : DynamicAllocaVec) handleDynamicAllocaCall(AI);
+ // Handle dynamic allocas.
+ createDynamicAllocasInitStorage();
+ for (auto &AI : DynamicAllocaVec)
+ handleDynamicAllocaCall(AI);
+ unpoisonDynamicAllocas();
+}
- unpoisonDynamicAllocas();
+void FunctionStackPoisoner::processStaticAllocas() {
+ if (AllocaVec.empty()) {
+ assert(StaticAllocaPoisonCallVec.empty());
+ return;
}
- if (AllocaVec.empty()) return;
-
int StackMallocIdx = -1;
DebugLoc EntryDebugLocation;
if (auto SP = F.getSubprogram())
@@ -2060,10 +2448,9 @@ void FunctionStackPoisoner::poisonStack() {
// regular stack slots.
auto InsBeforeB = InsBefore->getParent();
assert(InsBeforeB == &F.getEntryBlock());
- for (BasicBlock::iterator I(InsBefore); I != InsBeforeB->end(); ++I)
- if (auto *AI = dyn_cast<AllocaInst>(I))
- if (NonInstrumentedStaticAllocaVec.count(AI) > 0)
- AI->moveBefore(InsBefore);
+ for (auto *AI : StaticAllocasToMoveUp)
+ if (AI->getParent() == InsBeforeB)
+ AI->moveBefore(InsBefore);
// If we have a call to llvm.localescape, keep it in the entry block.
if (LocalEscapeCall) LocalEscapeCall->moveBefore(InsBefore);
@@ -2072,16 +2459,46 @@ void FunctionStackPoisoner::poisonStack() {
SVD.reserve(AllocaVec.size());
for (AllocaInst *AI : AllocaVec) {
ASanStackVariableDescription D = {AI->getName().data(),
- ASan.getAllocaSizeInBytes(AI),
- AI->getAlignment(), AI, 0};
+ ASan.getAllocaSizeInBytes(*AI),
+ 0,
+ AI->getAlignment(),
+ AI,
+ 0,
+ 0};
SVD.push_back(D);
}
+
// Minimal header size (left redzone) is 4 pointers,
// i.e. 32 bytes on 64-bit platforms and 16 bytes in 32-bit platforms.
size_t MinHeaderSize = ASan.LongSize / 2;
- ASanStackFrameLayout L;
- ComputeASanStackFrameLayout(SVD, 1ULL << Mapping.Scale, MinHeaderSize, &L);
- DEBUG(dbgs() << L.DescriptionString << " --- " << L.FrameSize << "\n");
+ const ASanStackFrameLayout &L =
+ ComputeASanStackFrameLayout(SVD, 1ULL << Mapping.Scale, MinHeaderSize);
+
+ // Build AllocaToSVDMap for ASanStackVariableDescription lookup.
+ DenseMap<const AllocaInst *, ASanStackVariableDescription *> AllocaToSVDMap;
+ for (auto &Desc : SVD)
+ AllocaToSVDMap[Desc.AI] = &Desc;
+
+ // Update SVD with information from lifetime intrinsics.
+ for (const auto &APC : StaticAllocaPoisonCallVec) {
+ assert(APC.InsBefore);
+ assert(APC.AI);
+ assert(ASan.isInterestingAlloca(*APC.AI));
+ assert(APC.AI->isStaticAlloca());
+
+ ASanStackVariableDescription &Desc = *AllocaToSVDMap[APC.AI];
+ Desc.LifetimeSize = Desc.Size;
+ if (const DILocation *FnLoc = EntryDebugLocation.get()) {
+ if (const DILocation *LifetimeLoc = APC.InsBefore->getDebugLoc().get()) {
+ if (LifetimeLoc->getFile() == FnLoc->getFile())
+ if (unsigned Line = LifetimeLoc->getLine())
+ Desc.Line = std::min(Desc.Line ? Desc.Line : Line, Line);
+ }
+ }
+ }
+
+ auto DescriptionString = ComputeASanStackFrameDescription(SVD);
+ DEBUG(dbgs() << DescriptionString << " --- " << L.FrameSize << "\n");
uint64_t LocalStackSize = L.FrameSize;
bool DoStackMalloc = ClUseAfterReturn && !ASan.CompileKernel &&
LocalStackSize <= kMaxStackMallocSize;
@@ -2164,7 +2581,7 @@ void FunctionStackPoisoner::poisonStack() {
ConstantInt::get(IntptrTy, ASan.LongSize / 8)),
IntptrPtrTy);
GlobalVariable *StackDescriptionGlobal =
- createPrivateGlobalForString(*F.getParent(), L.DescriptionString,
+ createPrivateGlobalForString(*F.getParent(), DescriptionString,
/*AllowMerging*/ true);
Value *Description = IRB.CreatePointerCast(StackDescriptionGlobal, IntptrTy);
IRB.CreateStore(Description, BasePlus1);
@@ -2175,19 +2592,33 @@ void FunctionStackPoisoner::poisonStack() {
IntptrPtrTy);
IRB.CreateStore(IRB.CreatePointerCast(&F, IntptrTy), BasePlus2);
- // Poison the stack redzones at the entry.
- Value *ShadowBase = ASan.memToShadow(LocalStackBase, IRB);
- poisonRedZones(L.ShadowBytes, IRB, ShadowBase, true);
+ const auto &ShadowAfterScope = GetShadowBytesAfterScope(SVD, L);
- auto UnpoisonStack = [&](IRBuilder<> &IRB) {
- if (HavePoisonedStaticAllocas) {
- // If we poisoned some allocas in llvm.lifetime analysis,
- // unpoison whole stack frame now.
- poisonAlloca(LocalStackBase, LocalStackSize, IRB, false);
- } else {
- poisonRedZones(L.ShadowBytes, IRB, ShadowBase, false);
+ // Poison the stack red zones at the entry.
+ Value *ShadowBase = ASan.memToShadow(LocalStackBase, IRB);
+ // As the mask we must use the most poisoned case: red zones and after-scope.
+ // As the bytes we can use either the same or just the red zones.
+ copyToShadow(ShadowAfterScope, ShadowAfterScope, IRB, ShadowBase);
+
+ if (!StaticAllocaPoisonCallVec.empty()) {
+ const auto &ShadowInScope = GetShadowBytes(SVD, L);
+
+ // Poison static allocas near lifetime intrinsics.
+ for (const auto &APC : StaticAllocaPoisonCallVec) {
+ const ASanStackVariableDescription &Desc = *AllocaToSVDMap[APC.AI];
+ assert(Desc.Offset % L.Granularity == 0);
+ size_t Begin = Desc.Offset / L.Granularity;
+ size_t End = Begin + (APC.Size + L.Granularity - 1) / L.Granularity;
+
+ IRBuilder<> IRB(APC.InsBefore);
+ copyToShadow(ShadowAfterScope,
+ APC.DoPoison ? ShadowAfterScope : ShadowInScope, Begin, End,
+ IRB, ShadowBase);
}
- };
+ }
+
+ SmallVector<uint8_t, 64> ShadowClean(ShadowAfterScope.size(), 0);
+ SmallVector<uint8_t, 64> ShadowAfterReturn;
// (Un)poison the stack before all ret instructions.
for (auto Ret : RetVec) {
@@ -2215,8 +2646,10 @@ void FunctionStackPoisoner::poisonStack() {
IRBuilder<> IRBPoison(ThenTerm);
if (StackMallocIdx <= 4) {
int ClassSize = kMinStackMallocSize << StackMallocIdx;
- SetShadowToStackAfterReturnInlined(IRBPoison, ShadowBase,
- ClassSize >> Mapping.Scale);
+ ShadowAfterReturn.resize(ClassSize / L.Granularity,
+ kAsanStackUseAfterReturnMagic);
+ copyToShadow(ShadowAfterReturn, ShadowAfterReturn, IRBPoison,
+ ShadowBase);
Value *SavedFlagPtrPtr = IRBPoison.CreateAdd(
FakeStack,
ConstantInt::get(IntptrTy, ClassSize - ASan.LongSize / 8));
@@ -2233,9 +2666,9 @@ void FunctionStackPoisoner::poisonStack() {
}
IRBuilder<> IRBElse(ElseTerm);
- UnpoisonStack(IRBElse);
+ copyToShadow(ShadowAfterScope, ShadowClean, IRBElse, ShadowBase);
} else {
- UnpoisonStack(IRBRet);
+ copyToShadow(ShadowAfterScope, ShadowClean, IRBRet, ShadowBase);
}
}
@@ -2264,7 +2697,7 @@ void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size,
AllocaInst *FunctionStackPoisoner::findAllocaForValue(Value *V) {
if (AllocaInst *AI = dyn_cast<AllocaInst>(V))
- // We're intested only in allocas we can handle.
+ // We're interested only in allocas we can handle.
return ASan.isInterestingAlloca(*AI) ? AI : nullptr;
// See if we've already calculated (or started to calculate) alloca for a
// given value.
@@ -2286,6 +2719,10 @@ AllocaInst *FunctionStackPoisoner::findAllocaForValue(Value *V) {
return nullptr;
Res = IncValueAI;
}
+ } else if (GetElementPtrInst *EP = dyn_cast<GetElementPtrInst>(V)) {
+ Res = findAllocaForValue(EP->getPointerOperand());
+ } else {
+ DEBUG(dbgs() << "Alloca search canceled on unknown instruction: " << *V << "\n");
}
if (Res) AllocaForValue[V] = Res;
return Res;
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h b/contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h
index 3cd7351..3802f9f 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h
+++ b/contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h
@@ -78,6 +78,14 @@ public:
return *It->second.get();
}
+ // Given a BB, return the auxiliary information if it's available.
+ BBInfo *findBBInfo(const BasicBlock *BB) const {
+ auto It = BBInfos.find(BB);
+ if (It == BBInfos.end())
+ return nullptr;
+ return It->second.get();
+ }
+
// Traverse the CFG using a stack. Find all the edges and assign the weight.
// Edges with large weight will be put into MST first so they are less likely
// to be instrumented.
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
index fb80f87..05eba6c 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
@@ -99,12 +99,23 @@ static const char *const EsanWhichToolName = "__esan_which_tool";
// FIXME: Try to place these shadow constants, the names of the __esan_*
// interface functions, and the ToolType enum into a header shared between
// llvm and compiler-rt.
-static const uint64_t ShadowMask = 0x00000fffffffffffull;
-static const uint64_t ShadowOffs[3] = { // Indexed by scale
- 0x0000130000000000ull,
- 0x0000220000000000ull,
- 0x0000440000000000ull,
+struct ShadowMemoryParams {
+ uint64_t ShadowMask;
+ uint64_t ShadowOffs[3];
};
+
+static const ShadowMemoryParams ShadowParams47 = {
+ 0x00000fffffffffffull,
+ {
+ 0x0000130000000000ull, 0x0000220000000000ull, 0x0000440000000000ull,
+ }};
+
+static const ShadowMemoryParams ShadowParams40 = {
+ 0x0fffffffffull,
+ {
+ 0x1300000000ull, 0x2200000000ull, 0x4400000000ull,
+ }};
+
// This array is indexed by the ToolType enum.
static const int ShadowScale[] = {
0, // ESAN_None.
@@ -154,7 +165,7 @@ public:
EfficiencySanitizer(
const EfficiencySanitizerOptions &Opts = EfficiencySanitizerOptions())
: ModulePass(ID), Options(OverrideOptionsFromCL(Opts)) {}
- const char *getPassName() const override;
+ StringRef getPassName() const override;
void getAnalysisUsage(AnalysisUsage &AU) const override;
bool runOnModule(Module &M) override;
static char ID;
@@ -219,6 +230,7 @@ private:
// Remember the counter variable for each struct type to avoid
// recomputing the variable name later during instrumentation.
std::map<Type *, GlobalVariable *> StructTyMap;
+ ShadowMemoryParams ShadowParams;
};
} // namespace
@@ -231,7 +243,7 @@ INITIALIZE_PASS_END(
EfficiencySanitizer, "esan",
"EfficiencySanitizer: finds performance issues.", false, false)
-const char *EfficiencySanitizer::getPassName() const {
+StringRef EfficiencySanitizer::getPassName() const {
return "EfficiencySanitizer";
}
@@ -301,21 +313,21 @@ void EfficiencySanitizer::createStructCounterName(
else
NameStr += "struct.anon";
// We allow the actual size of the StructCounterName to be larger than
- // MaxStructCounterNameSize and append #NumFields and at least one
+ // MaxStructCounterNameSize and append $NumFields and at least one
// field type id.
- // Append #NumFields.
- NameStr += "#";
+ // Append $NumFields.
+ NameStr += "$";
Twine(StructTy->getNumElements()).toVector(NameStr);
// Append struct field type ids in the reverse order.
for (int i = StructTy->getNumElements() - 1; i >= 0; --i) {
- NameStr += "#";
+ NameStr += "$";
Twine(StructTy->getElementType(i)->getTypeID()).toVector(NameStr);
if (NameStr.size() >= MaxStructCounterNameSize)
break;
}
if (StructTy->isLiteral()) {
- // End with # for literal struct.
- NameStr += "#";
+ // End with $ for literal struct.
+ NameStr += "$";
}
}
@@ -528,6 +540,13 @@ void EfficiencySanitizer::createDestructor(Module &M, Constant *ToolInfoArg) {
}
bool EfficiencySanitizer::initOnModule(Module &M) {
+
+ Triple TargetTriple(M.getTargetTriple());
+ if (TargetTriple.getArch() == Triple::mips64 ||
+     TargetTriple.getArch() == Triple::mips64el)
+ ShadowParams = ShadowParams40;
+ else
+ ShadowParams = ShadowParams47;
+
Ctx = &M.getContext();
const DataLayout &DL = M.getDataLayout();
IRBuilder<> IRB(M.getContext());
@@ -559,13 +578,13 @@ bool EfficiencySanitizer::initOnModule(Module &M) {
Value *EfficiencySanitizer::appToShadow(Value *Shadow, IRBuilder<> &IRB) {
// Shadow = ((App & Mask) + Offs) >> Scale
- Shadow = IRB.CreateAnd(Shadow, ConstantInt::get(IntptrTy, ShadowMask));
+ Shadow = IRB.CreateAnd(Shadow, ConstantInt::get(IntptrTy, ShadowParams.ShadowMask));
uint64_t Offs;
int Scale = ShadowScale[Options.ToolType];
if (Scale <= 2)
- Offs = ShadowOffs[Scale];
+ Offs = ShadowParams.ShadowOffs[Scale];
else
- Offs = ShadowOffs[0] << Scale;
+ Offs = ShadowParams.ShadowOffs[0] << Scale;
Shadow = IRB.CreateAdd(Shadow, ConstantInt::get(IntptrTy, Offs));
if (Scale > 0)
Shadow = IRB.CreateLShr(Shadow, Scale);
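
With the 47-bit parameters above, the commented formula Shadow = ((App & Mask) + Offs) >> Scale is plain integer arithmetic. A worked example in freestanding C++; the application address and the scale of 6 are made-up inputs, and the Scale > 2 branch mirrors the offset selection in appToShadow:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint64_t ShadowMask = 0x00000fffffffffffull;   // ShadowParams47
      const uint64_t ShadowOffs0 = 0x0000130000000000ull;
      const uint64_t App = 0x00007f1234567890ull;          // example address
      const unsigned Scale = 6;                            // example scale

      // Scale > 2, so the pass would use ShadowOffs[0] << Scale as the offset.
      uint64_t Offs = ShadowOffs0 << Scale;
      uint64_t Shadow = ((App & ShadowMask) + Offs) >> Scale;
      std::printf("app    0x%016llx\nshadow 0x%016llx\n",
                  (unsigned long long)App, (unsigned long long)Shadow);
      return 0;
    }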
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index b4070b6..56d0f5e 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -118,7 +118,8 @@ private:
Function *insertFlush(ArrayRef<std::pair<GlobalVariable *, MDNode *>>);
void insertIndirectCounterIncrement();
- std::string mangleName(const DICompileUnit *CU, const char *NewStem);
+ enum class GCovFileType { GCNO, GCDA };
+ std::string mangleName(const DICompileUnit *CU, GCovFileType FileType);
GCOVOptions Options;
@@ -141,7 +142,7 @@ public:
: ModulePass(ID), Profiler(Opts) {
initializeGCOVProfilerLegacyPassPass(*PassRegistry::getPassRegistry());
}
- const char *getPassName() const override { return "GCOV Profiler"; }
+ StringRef getPassName() const override { return "GCOV Profiler"; }
bool runOnModule(Module &M) override { return Profiler.runOnModule(M); }
@@ -251,11 +252,7 @@ namespace {
class GCOVBlock : public GCOVRecord {
public:
GCOVLines &getFile(StringRef Filename) {
- GCOVLines *&Lines = LinesByFile[Filename];
- if (!Lines) {
- Lines = new GCOVLines(Filename, os);
- }
- return *Lines;
+ return LinesByFile.try_emplace(Filename, Filename, os).first->second;
}
void addEdge(GCOVBlock &Successor) {
@@ -264,9 +261,9 @@ namespace {
void writeOut() {
uint32_t Len = 3;
- SmallVector<StringMapEntry<GCOVLines *> *, 32> SortedLinesByFile;
+ SmallVector<StringMapEntry<GCOVLines> *, 32> SortedLinesByFile;
for (auto &I : LinesByFile) {
- Len += I.second->length();
+ Len += I.second.length();
SortedLinesByFile.push_back(&I);
}
@@ -274,21 +271,17 @@ namespace {
write(Len);
write(Number);
- std::sort(SortedLinesByFile.begin(), SortedLinesByFile.end(),
- [](StringMapEntry<GCOVLines *> *LHS,
- StringMapEntry<GCOVLines *> *RHS) {
- return LHS->getKey() < RHS->getKey();
- });
+ std::sort(
+ SortedLinesByFile.begin(), SortedLinesByFile.end(),
+ [](StringMapEntry<GCOVLines> *LHS, StringMapEntry<GCOVLines> *RHS) {
+ return LHS->getKey() < RHS->getKey();
+ });
for (auto &I : SortedLinesByFile)
- I->getValue()->writeOut();
+ I->getValue().writeOut();
write(0);
write(0);
}
- ~GCOVBlock() {
- DeleteContainerSeconds(LinesByFile);
- }
-
GCOVBlock(const GCOVBlock &RHS) : GCOVRecord(RHS), Number(RHS.Number) {
// Only allow copy before edges and lines have been added. After that,
// there are inter-block pointers (eg: edges) that won't take kindly to
@@ -306,7 +299,7 @@ namespace {
}
uint32_t Number;
- StringMap<GCOVLines *> LinesByFile;
+ StringMap<GCOVLines> LinesByFile;
SmallVector<GCOVBlock *, 4> OutEdges;
};
@@ -426,24 +419,40 @@ namespace {
}
std::string GCOVProfiler::mangleName(const DICompileUnit *CU,
- const char *NewStem) {
+ GCovFileType OutputType) {
+ bool Notes = OutputType == GCovFileType::GCNO;
+
if (NamedMDNode *GCov = M->getNamedMetadata("llvm.gcov")) {
for (int i = 0, e = GCov->getNumOperands(); i != e; ++i) {
MDNode *N = GCov->getOperand(i);
- if (N->getNumOperands() != 2) continue;
- MDString *GCovFile = dyn_cast<MDString>(N->getOperand(0));
- MDNode *CompileUnit = dyn_cast<MDNode>(N->getOperand(1));
- if (!GCovFile || !CompileUnit) continue;
- if (CompileUnit == CU) {
- SmallString<128> Filename = GCovFile->getString();
- sys::path::replace_extension(Filename, NewStem);
- return Filename.str();
+ bool ThreeElement = N->getNumOperands() == 3;
+ if (!ThreeElement && N->getNumOperands() != 2)
+ continue;
+ if (dyn_cast<MDNode>(N->getOperand(ThreeElement ? 2 : 1)) != CU)
+ continue;
+
+ if (ThreeElement) {
+ // These nodes have no mangling to apply; the file names are stored
+ // already mangled in the bitcode.
+ MDString *NotesFile = dyn_cast<MDString>(N->getOperand(0));
+ MDString *DataFile = dyn_cast<MDString>(N->getOperand(1));
+ if (!NotesFile || !DataFile)
+ continue;
+ return Notes ? NotesFile->getString() : DataFile->getString();
}
+
+ MDString *GCovFile = dyn_cast<MDString>(N->getOperand(0));
+ if (!GCovFile)
+ continue;
+
+ SmallString<128> Filename = GCovFile->getString();
+ sys::path::replace_extension(Filename, Notes ? "gcno" : "gcda");
+ return Filename.str();
}
}
SmallString<128> Filename = CU->getFilename();
- sys::path::replace_extension(Filename, NewStem);
+ sys::path::replace_extension(Filename, Notes ? "gcno" : "gcda");
StringRef FName = sys::path::filename(Filename);
SmallString<128> CurPath;
if (sys::fs::current_path(CurPath)) return FName;
@@ -461,7 +470,7 @@ bool GCOVProfiler::runOnModule(Module &M) {
}
PreservedAnalyses GCOVProfilerPass::run(Module &M,
- AnalysisManager<Module> &AM) {
+ ModuleAnalysisManager &AM) {
GCOVProfiler Profiler(GCOVOpts);
@@ -509,7 +518,7 @@ void GCOVProfiler::emitProfileNotes() {
continue;
std::error_code EC;
- raw_fd_ostream out(mangleName(CU, "gcno"), EC, sys::fs::F_None);
+ raw_fd_ostream out(mangleName(CU, GCovFileType::GCNO), EC, sys::fs::F_None);
std::string EdgeDestinations;
unsigned FunctionIdent = 0;
@@ -857,7 +866,7 @@ Function *GCOVProfiler::insertCounterWriteout(
if (CU->getDWOId())
continue;
- std::string FilenameGcda = mangleName(CU, "gcda");
+ std::string FilenameGcda = mangleName(CU, GCovFileType::GCDA);
uint32_t CfgChecksum = FileChecksums.empty() ? 0 : FileChecksums[i];
Builder.CreateCall(StartFile,
{Builder.CreateGlobalStringPtr(FilenameGcda),
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/contrib/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index 202b94b..1ba13bd 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -13,29 +13,38 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/Analysis/CFG.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IndirectCallPromotionAnalysis.h"
#include "llvm/Analysis/IndirectCallSiteVisitor.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
-#include "llvm/ProfileData/InstrProfReader.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/PassSupport.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/PGOInstrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include <string>
-#include <utility>
+#include <cassert>
+#include <cstdint>
#include <vector>
using namespace llvm;
@@ -102,9 +111,7 @@ public:
*PassRegistry::getPassRegistry());
}
- const char *getPassName() const override {
- return "PGOIndirectCallPromotion";
- }
+ StringRef getPassName() const override { return "PGOIndirectCallPromotion"; }
private:
bool runOnModule(Module &M) override;
@@ -208,6 +215,7 @@ public:
ICallPromotionFunc(Function &Func, Module *Modu, InstrProfSymtab *Symtab)
: F(Func), M(Modu), Symtab(Symtab) {
}
+
bool processFunction();
};
} // end anonymous namespace
@@ -474,7 +482,7 @@ static Instruction *createDirectCallInst(const Instruction *Inst,
NewInst);
// Clear the value profile data.
- NewInst->setMetadata(LLVMContext::MD_prof, 0);
+ NewInst->setMetadata(LLVMContext::MD_prof, nullptr);
CallSite NewCS(NewInst);
FunctionType *DirectCalleeType = DirectCallee->getFunctionType();
unsigned ParamNum = DirectCalleeType->getFunctionNumParams();
@@ -610,7 +618,7 @@ bool ICallPromotionFunc::processFunction() {
Changed = true;
// Adjust the MD.prof metadata. First delete the old one.
- I->setMetadata(LLVMContext::MD_prof, 0);
+ I->setMetadata(LLVMContext::MD_prof, nullptr);
// If all promoted, we don't need the MD.prof metadata.
if (TotalCount == 0 || NumPromoted == NumVals)
continue;
@@ -653,7 +661,7 @@ bool PGOIndirectCallPromotionLegacyPass::runOnModule(Module &M) {
return promoteIndirectCalls(M, InLTO | ICPLTOMode);
}
-PreservedAnalyses PGOIndirectCallPromotion::run(Module &M, AnalysisManager<Module> &AM) {
+PreservedAnalyses PGOIndirectCallPromotion::run(Module &M, ModuleAnalysisManager &AM) {
if (!promoteIndirectCalls(M, InLTO | ICPLTOMode))
return PreservedAnalyses::all();
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index b11c6be..adea7e7 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -15,6 +15,7 @@
#include "llvm/Transforms/InstrProfiling.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
@@ -31,6 +32,11 @@ cl::opt<bool> DoNameCompression("enable-name-compression",
cl::desc("Enable name string compression"),
cl::init(true));
+cl::opt<bool> DoHashBasedCounterSplit(
+ "hash-based-counter-split",
+ cl::desc("Rename counter variable of a comdat function based on cfg hash"),
+ cl::init(true));
+
cl::opt<bool> ValueProfileStaticAlloc(
"vp-static-alloc",
cl::desc("Do static counter allocation for value profiler"),
@@ -53,30 +59,38 @@ public:
InstrProfilingLegacyPass() : ModulePass(ID), InstrProf() {}
InstrProfilingLegacyPass(const InstrProfOptions &Options)
: ModulePass(ID), InstrProf(Options) {}
- const char *getPassName() const override {
+ StringRef getPassName() const override {
return "Frontend instrumentation-based coverage lowering";
}
- bool runOnModule(Module &M) override { return InstrProf.run(M); }
+ bool runOnModule(Module &M) override {
+ return InstrProf.run(M, getAnalysis<TargetLibraryInfoWrapperPass>().getTLI());
+ }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
}
};
} // anonymous namespace
-PreservedAnalyses InstrProfiling::run(Module &M, AnalysisManager<Module> &AM) {
- if (!run(M))
+PreservedAnalyses InstrProfiling::run(Module &M, ModuleAnalysisManager &AM) {
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(M);
+ if (!run(M, TLI))
return PreservedAnalyses::all();
return PreservedAnalyses::none();
}
char InstrProfilingLegacyPass::ID = 0;
-INITIALIZE_PASS(InstrProfilingLegacyPass, "instrprof",
- "Frontend instrumentation-based coverage lowering.", false,
- false)
+INITIALIZE_PASS_BEGIN(
+ InstrProfilingLegacyPass, "instrprof",
+ "Frontend instrumentation-based coverage lowering.", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(
+ InstrProfilingLegacyPass, "instrprof",
+ "Frontend instrumentation-based coverage lowering.", false, false)
ModulePass *
llvm::createInstrProfilingLegacyPass(const InstrProfOptions &Options) {
@@ -107,10 +121,18 @@ StringRef InstrProfiling::getCoverageSection() const {
return getInstrProfCoverageSectionName(isMachO());
}
-bool InstrProfiling::run(Module &M) {
+static InstrProfIncrementInst *castToIncrementInst(Instruction *Instr) {
+ InstrProfIncrementInst *Inc = dyn_cast<InstrProfIncrementInstStep>(Instr);
+ if (Inc)
+ return Inc;
+ return dyn_cast<InstrProfIncrementInst>(Instr);
+}
+
+bool InstrProfiling::run(Module &M, const TargetLibraryInfo &TLI) {
bool MadeChange = false;
this->M = &M;
+ this->TLI = &TLI;
NamesVar = nullptr;
NamesSize = 0;
ProfileDataMap.clear();
@@ -138,7 +160,8 @@ bool InstrProfiling::run(Module &M) {
for (BasicBlock &BB : F)
for (auto I = BB.begin(), E = BB.end(); I != E;) {
auto Instr = I++;
- if (auto *Inc = dyn_cast<InstrProfIncrementInst>(Instr)) {
+ InstrProfIncrementInst *Inc = castToIncrementInst(&*Instr);
+ if (Inc) {
lowerIncrement(Inc);
MadeChange = true;
} else if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(Instr)) {
@@ -165,7 +188,8 @@ bool InstrProfiling::run(Module &M) {
return true;
}
-static Constant *getOrInsertValueProfilingCall(Module &M) {
+static Constant *getOrInsertValueProfilingCall(Module &M,
+ const TargetLibraryInfo &TLI) {
LLVMContext &Ctx = M.getContext();
auto *ReturnTy = Type::getVoidTy(M.getContext());
Type *ParamTypes[] = {
@@ -174,8 +198,13 @@ static Constant *getOrInsertValueProfilingCall(Module &M) {
};
auto *ValueProfilingCallTy =
FunctionType::get(ReturnTy, makeArrayRef(ParamTypes), false);
- return M.getOrInsertFunction(getInstrProfValueProfFuncName(),
- ValueProfilingCallTy);
+ Constant *Res = M.getOrInsertFunction(getInstrProfValueProfFuncName(),
+ ValueProfilingCallTy);
+ if (Function *FunRes = dyn_cast<Function>(Res)) {
+ if (auto AK = TLI.getExtAttrForI32Param(false))
+ FunRes->addAttribute(3, AK);
+ }
+ return Res;
}
void InstrProfiling::computeNumValueSiteCounts(InstrProfValueProfileInst *Ind) {
@@ -209,8 +238,11 @@ void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
Value *Args[3] = {Ind->getTargetValue(),
Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()),
Builder.getInt32(Index)};
- Ind->replaceAllUsesWith(
- Builder.CreateCall(getOrInsertValueProfilingCall(*M), Args));
+ CallInst *Call = Builder.CreateCall(getOrInsertValueProfilingCall(*M, *TLI),
+ Args);
+ if (auto AK = TLI->getExtAttrForI32Param(false))
+ Call->addAttribute(3, AK);
+ Ind->replaceAllUsesWith(Call);
Ind->eraseFromParent();
}
@@ -221,7 +253,7 @@ void InstrProfiling::lowerIncrement(InstrProfIncrementInst *Inc) {
uint64_t Index = Inc->getIndex()->getZExtValue();
Value *Addr = Builder.CreateConstInBoundsGEP2_64(Counters, 0, Index);
Value *Count = Builder.CreateLoad(Addr, "pgocount");
- Count = Builder.CreateAdd(Count, Builder.getInt64(1));
+ Count = Builder.CreateAdd(Count, Inc->getStep());
Inc->replaceAllUsesWith(Builder.CreateStore(Count, Addr));
Inc->eraseFromParent();
}
@@ -245,7 +277,16 @@ void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageNamesVar) {
static std::string getVarName(InstrProfIncrementInst *Inc, StringRef Prefix) {
StringRef NamePrefix = getInstrProfNameVarPrefix();
StringRef Name = Inc->getName()->getName().substr(NamePrefix.size());
- return (Prefix + Name).str();
+ Function *F = Inc->getParent()->getParent();
+ Module *M = F->getParent();
+ if (!DoHashBasedCounterSplit || !isIRPGOFlagSet(M) ||
+ !canRenameComdatFunc(*F))
+ return (Prefix + Name).str();
+ uint64_t FuncHash = Inc->getHash()->getZExtValue();
+ SmallVector<char, 24> HashPostfix;
+ if (Name.endswith((Twine(".") + Twine(FuncHash)).toStringRef(HashPostfix)))
+ return (Prefix + Name).str();
+ return (Prefix + Name + "." + Twine(FuncHash)).str();
}
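
The renaming only applies to IR-level PGO counters of comdat functions that are safe to rename, and it is idempotent: a name that already ends in "." plus the hash is left alone. A minimal string-level sketch of the naming rule; the "__profc_" prefix follows the usual counter-variable convention and the hash value is made up:

    #include <cstdint>
    #include <string>

    // Append ".<hash>" to a counter name unless it is already there.
    std::string hashSplitName(const std::string &Prefix, const std::string &Name,
                              uint64_t FuncHash) {
      std::string Suffix = "." + std::to_string(FuncHash);
      if (Name.size() >= Suffix.size() &&
          Name.compare(Name.size() - Suffix.size(), Suffix.size(), Suffix) == 0)
        return Prefix + Name;      // already carries the hash suffix
      return Prefix + Name + Suffix;
    }

    // e.g. hashSplitName("__profc_", "foo", 123456789u)
    //      -> "__profc_foo.123456789"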
static inline bool shouldRecordFunctionAddr(Function *F) {
@@ -268,33 +309,6 @@ static inline bool shouldRecordFunctionAddr(Function *F) {
return F->hasAddressTaken() || F->hasLinkOnceLinkage();
}
-static inline bool needsComdatForCounter(Function &F, Module &M) {
-
- if (F.hasComdat())
- return true;
-
- Triple TT(M.getTargetTriple());
- if (!TT.isOSBinFormatELF())
- return false;
-
- // See createPGOFuncNameVar for more details. To avoid link errors, profile
- // counters for function with available_externally linkage needs to be changed
- // to linkonce linkage. On ELF based systems, this leads to weak symbols to be
- // created. Without using comdat, duplicate entries won't be removed by the
- // linker leading to increased data segement size and raw profile size. Even
- // worse, since the referenced counter from profile per-function data object
- // will be resolved to the common strong definition, the profile counts for
- // available_externally functions will end up being duplicated in raw profile
- // data. This can result in distorted profile as the counts of those dups
- // will be accumulated by the profile merger.
- GlobalValue::LinkageTypes Linkage = F.getLinkage();
- if (Linkage != GlobalValue::ExternalWeakLinkage &&
- Linkage != GlobalValue::AvailableExternallyLinkage)
- return false;
-
- return true;
-}
-
static inline Comdat *getOrCreateProfileComdat(Module &M, Function &F,
InstrProfIncrementInst *Inc) {
if (!needsComdatForCounter(F, M))
@@ -572,38 +586,30 @@ void InstrProfiling::emitRuntimeHook() {
}
void InstrProfiling::emitUses() {
- if (UsedVars.empty())
- return;
-
- GlobalVariable *LLVMUsed = M->getGlobalVariable("llvm.used");
- std::vector<Constant *> MergedVars;
- if (LLVMUsed) {
- // Collect the existing members of llvm.used.
- ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer());
- for (unsigned I = 0, E = Inits->getNumOperands(); I != E; ++I)
- MergedVars.push_back(Inits->getOperand(I));
- LLVMUsed->eraseFromParent();
- }
-
- Type *i8PTy = Type::getInt8PtrTy(M->getContext());
- // Add uses for our data.
- for (auto *Value : UsedVars)
- MergedVars.push_back(
- ConstantExpr::getBitCast(cast<Constant>(Value), i8PTy));
-
- // Recreate llvm.used.
- ArrayType *ATy = ArrayType::get(i8PTy, MergedVars.size());
- LLVMUsed =
- new GlobalVariable(*M, ATy, false, GlobalValue::AppendingLinkage,
- ConstantArray::get(ATy, MergedVars), "llvm.used");
- LLVMUsed->setSection("llvm.metadata");
+ if (!UsedVars.empty())
+ appendToUsed(*M, UsedVars);
}
void InstrProfiling::emitInitialization() {
- std::string InstrProfileOutput = Options.InstrProfileOutput;
+ StringRef InstrProfileOutput = Options.InstrProfileOutput;
+
+ if (!InstrProfileOutput.empty()) {
+ // Create variable for profile name.
+ Constant *ProfileNameConst =
+ ConstantDataArray::getString(M->getContext(), InstrProfileOutput, true);
+ GlobalVariable *ProfileNameVar = new GlobalVariable(
+ *M, ProfileNameConst->getType(), true, GlobalValue::WeakAnyLinkage,
+ ProfileNameConst, INSTR_PROF_QUOTE(INSTR_PROF_PROFILE_NAME_VAR));
+ Triple TT(M->getTargetTriple());
+ if (TT.supportsCOMDAT()) {
+ ProfileNameVar->setLinkage(GlobalValue::ExternalLinkage);
+ ProfileNameVar->setComdat(M->getOrInsertComdat(
+ StringRef(INSTR_PROF_QUOTE(INSTR_PROF_PROFILE_NAME_VAR))));
+ }
+ }
Constant *RegisterF = M->getFunction(getInstrProfRegFuncsName());
- if (!RegisterF && InstrProfileOutput.empty())
+ if (!RegisterF)
return;
// Create the initialization function.
@@ -620,21 +626,6 @@ void InstrProfiling::emitInitialization() {
IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", F));
if (RegisterF)
IRB.CreateCall(RegisterF, {});
- if (!InstrProfileOutput.empty()) {
- auto *Int8PtrTy = Type::getInt8PtrTy(M->getContext());
- auto *SetNameTy = FunctionType::get(VoidTy, Int8PtrTy, false);
- auto *SetNameF = Function::Create(SetNameTy, GlobalValue::ExternalLinkage,
- getInstrProfFileOverriderFuncName(), M);
-
- // Create variable for profile name.
- Constant *ProfileNameConst =
- ConstantDataArray::getString(M->getContext(), InstrProfileOutput, true);
- GlobalVariable *ProfileName =
- new GlobalVariable(*M, ProfileNameConst->getType(), true,
- GlobalValue::PrivateLinkage, ProfileNameConst);
-
- IRB.CreateCall(SetNameF, IRB.CreatePointerCast(ProfileName, Int8PtrTy));
- }
IRB.CreateRetVoid();
appendToGlobalCtors(*M, F, 0);
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 970f9ab..fafb0fc 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -242,8 +242,8 @@ static const MemoryMapParams Linux_X86_64_MemoryMapParams = {
// mips64 Linux
static const MemoryMapParams Linux_MIPS64_MemoryMapParams = {
- 0x004000000000, // AndMask
- 0, // XorMask (not used)
+ 0, // AndMask (not used)
+ 0x008000000000, // XorMask
0, // ShadowBase (not used)
0x002000000000, // OriginBase
};
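
This swaps the mips64 mapping from an and-based scheme to an xor-based one. Assuming MSan's usual convention, in which the and-mask clears a bit of the application address and the xor-mask flips one, the two mappings compare as in this small sketch, where the constants are the mips64 values from the hunk and the address is a made-up example:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint64_t OldAndMask = 0x004000000000ull;  // previous scheme
      const uint64_t NewXorMask = 0x008000000000ull;  // new scheme
      const uint64_t App = 0x00aabbccddeeull;         // example address

      uint64_t ShadowAnd = App & ~OldAndMask;  // clear the masked bit
      uint64_t ShadowXor = App ^ NewXorMask;   // flip the masked bit
      std::printf("and-based shadow 0x%012llx\nxor-based shadow 0x%012llx\n",
                  (unsigned long long)ShadowAnd, (unsigned long long)ShadowXor);
      return 0;
    }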
@@ -312,11 +312,12 @@ static const PlatformMemoryMapParams FreeBSD_X86_MemoryMapParams = {
/// uninitialized reads.
class MemorySanitizer : public FunctionPass {
public:
- MemorySanitizer(int TrackOrigins = 0)
+ MemorySanitizer(int TrackOrigins = 0, bool Recover = false)
: FunctionPass(ID),
TrackOrigins(std::max(TrackOrigins, (int)ClTrackOrigins)),
+ Recover(Recover || ClKeepGoing),
WarningFn(nullptr) {}
- const char *getPassName() const override { return "MemorySanitizer"; }
+ StringRef getPassName() const override { return "MemorySanitizer"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetLibraryInfoWrapperPass>();
}
@@ -329,6 +330,7 @@ class MemorySanitizer : public FunctionPass {
/// \brief Track origins (allocation points) of uninitialized values.
int TrackOrigins;
+ bool Recover;
LLVMContext *C;
Type *IntptrTy;
@@ -395,8 +397,8 @@ INITIALIZE_PASS_END(
MemorySanitizer, "msan",
"MemorySanitizer: detects uninitialized reads.", false, false)
-FunctionPass *llvm::createMemorySanitizerPass(int TrackOrigins) {
- return new MemorySanitizer(TrackOrigins);
+FunctionPass *llvm::createMemorySanitizerPass(int TrackOrigins, bool Recover) {
+ return new MemorySanitizer(TrackOrigins, Recover);
}
/// \brief Create a non-const global initialized with the given string.
@@ -421,8 +423,8 @@ void MemorySanitizer::initializeCallbacks(Module &M) {
// Create the callback.
// FIXME: this function should have "Cold" calling conv,
// which is not yet implemented.
- StringRef WarningFnName = ClKeepGoing ? "__msan_warning"
- : "__msan_warning_noreturn";
+ StringRef WarningFnName = Recover ? "__msan_warning"
+ : "__msan_warning_noreturn";
WarningFn = M.getOrInsertFunction(WarningFnName, IRB.getVoidTy(), nullptr);
for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
@@ -566,9 +568,9 @@ bool MemorySanitizer::doInitialization(Module &M) {
new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage,
IRB.getInt32(TrackOrigins), "__msan_track_origins");
- if (ClKeepGoing)
+ if (Recover)
new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage,
- IRB.getInt32(ClKeepGoing), "__msan_keep_going");
+ IRB.getInt32(Recover), "__msan_keep_going");
return true;
}
@@ -792,7 +794,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
IRB.CreateCall(MS.WarningFn, {});
IRB.CreateCall(MS.EmptyAsm, {});
- // FIXME: Insert UnreachableInst if !ClKeepGoing?
+ // FIXME: Insert UnreachableInst if !MS.Recover?
// This may invalidate some of the following checks and needs to be done
// at the very end.
}
@@ -815,7 +817,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
getCleanShadow(ConvertedShadow), "_mscmp");
Instruction *CheckTerm = SplitBlockAndInsertIfThen(
Cmp, OrigIns,
- /* Unreachable */ !ClKeepGoing, MS.ColdCallWeights);
+ /* Unreachable */ !MS.Recover, MS.ColdCallWeights);
IRB.SetInsertPoint(CheckTerm);
if (MS.TrackOrigins) {
@@ -2360,6 +2362,29 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
case llvm::Intrinsic::x86_sse_cvttps2pi:
handleVectorConvertIntrinsic(I, 2);
break;
+
+ case llvm::Intrinsic::x86_avx512_psll_w_512:
+ case llvm::Intrinsic::x86_avx512_psll_d_512:
+ case llvm::Intrinsic::x86_avx512_psll_q_512:
+ case llvm::Intrinsic::x86_avx512_pslli_w_512:
+ case llvm::Intrinsic::x86_avx512_pslli_d_512:
+ case llvm::Intrinsic::x86_avx512_pslli_q_512:
+ case llvm::Intrinsic::x86_avx512_psrl_w_512:
+ case llvm::Intrinsic::x86_avx512_psrl_d_512:
+ case llvm::Intrinsic::x86_avx512_psrl_q_512:
+ case llvm::Intrinsic::x86_avx512_psra_w_512:
+ case llvm::Intrinsic::x86_avx512_psra_d_512:
+ case llvm::Intrinsic::x86_avx512_psra_q_512:
+ case llvm::Intrinsic::x86_avx512_psrli_w_512:
+ case llvm::Intrinsic::x86_avx512_psrli_d_512:
+ case llvm::Intrinsic::x86_avx512_psrli_q_512:
+ case llvm::Intrinsic::x86_avx512_psrai_w_512:
+ case llvm::Intrinsic::x86_avx512_psrai_d_512:
+ case llvm::Intrinsic::x86_avx512_psrai_q_512:
+ case llvm::Intrinsic::x86_avx512_psra_q_256:
+ case llvm::Intrinsic::x86_avx512_psra_q_128:
+ case llvm::Intrinsic::x86_avx512_psrai_q_256:
+ case llvm::Intrinsic::x86_avx512_psrai_q_128:
case llvm::Intrinsic::x86_avx2_psll_w:
case llvm::Intrinsic::x86_avx2_psll_d:
case llvm::Intrinsic::x86_avx2_psll_q:
@@ -2412,14 +2437,22 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
break;
case llvm::Intrinsic::x86_avx2_psllv_d:
case llvm::Intrinsic::x86_avx2_psllv_d_256:
+ case llvm::Intrinsic::x86_avx512_psllv_d_512:
case llvm::Intrinsic::x86_avx2_psllv_q:
case llvm::Intrinsic::x86_avx2_psllv_q_256:
+ case llvm::Intrinsic::x86_avx512_psllv_q_512:
case llvm::Intrinsic::x86_avx2_psrlv_d:
case llvm::Intrinsic::x86_avx2_psrlv_d_256:
+ case llvm::Intrinsic::x86_avx512_psrlv_d_512:
case llvm::Intrinsic::x86_avx2_psrlv_q:
case llvm::Intrinsic::x86_avx2_psrlv_q_256:
+ case llvm::Intrinsic::x86_avx512_psrlv_q_512:
case llvm::Intrinsic::x86_avx2_psrav_d:
case llvm::Intrinsic::x86_avx2_psrav_d_256:
+ case llvm::Intrinsic::x86_avx512_psrav_d_512:
+ case llvm::Intrinsic::x86_avx512_psrav_q_128:
+ case llvm::Intrinsic::x86_avx512_psrav_q_256:
+ case llvm::Intrinsic::x86_avx512_psrav_q_512:
handleVectorShiftIntrinsic(I, /* Variable */ true);
break;
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index f54d8ad..04f9a64 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -51,6 +51,7 @@
#include "llvm/Transforms/PGOInstrumentation.h"
#include "CFGMST.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
@@ -59,6 +60,7 @@
#include "llvm/Analysis/IndirectCallSiteVisitor.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
@@ -75,6 +77,7 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include <algorithm>
#include <string>
+#include <unordered_map>
#include <utility>
#include <vector>
@@ -83,6 +86,7 @@ using namespace llvm;
#define DEBUG_TYPE "pgo-instrumentation"
STATISTIC(NumOfPGOInstrument, "Number of edges instrumented.");
+STATISTIC(NumOfPGOSelectInsts, "Number of select instructions instrumented.");
STATISTIC(NumOfPGOEdge, "Number of edges.");
STATISTIC(NumOfPGOBB, "Number of basic-blocks.");
STATISTIC(NumOfPGOSplit, "Number of critical edge splits.");
@@ -112,17 +116,89 @@ static cl::opt<unsigned> MaxNumAnnotations(
cl::desc("Max number of annotations for a single indirect "
"call callsite"));
+// Command line option to control appending FunctionHash to the name of a COMDAT
+// function. This is to avoid the hash mismatch caused by the preinliner.
+static cl::opt<bool> DoComdatRenaming(
+ "do-comdat-renaming", cl::init(false), cl::Hidden,
+ cl::desc("Append function hash to the name of COMDAT function to avoid "
+ "function hash mismatch due to the preinliner"));
+
// Command line option to enable/disable the warning about missing profile
// information.
-static cl::opt<bool> NoPGOWarnMissing("no-pgo-warn-missing", cl::init(false),
- cl::Hidden);
+static cl::opt<bool> PGOWarnMissing("pgo-warn-missing-function",
+ cl::init(false),
+ cl::Hidden);
// Command line option to enable/disable the warning about a hash mismatch in
// the profile data.
static cl::opt<bool> NoPGOWarnMismatch("no-pgo-warn-mismatch", cl::init(false),
cl::Hidden);
+// Command line option to enable/disable the warning about a hash mismatch in
+// the profile data for Comdat functions, which often turns out to be a false
+// positive due to pre-instrumentation inlining.
+static cl::opt<bool> NoPGOWarnMismatchComdat("no-pgo-warn-mismatch-comdat",
+ cl::init(true), cl::Hidden);
+
+// Command line option to enable/disable select instruction instrumentation.
+static cl::opt<bool> PGOInstrSelect("pgo-instr-select", cl::init(true),
+ cl::Hidden);
namespace {
+
+/// The select instruction visitor plays three roles specified
+/// by the mode. In \c VM_counting mode, it simply counts the number of
+/// select instructions. In \c VM_instrument mode, it inserts code to count
+/// the number of times the TrueValue of a select is taken. In \c VM_annotate
+/// mode, it reads the profile data and annotates the select instructions with
+/// metadata.
+enum VisitMode { VM_counting, VM_instrument, VM_annotate };
+class PGOUseFunc;
+
+/// Instruction Visitor class to visit select instructions.
+struct SelectInstVisitor : public InstVisitor<SelectInstVisitor> {
+ Function &F;
+ unsigned NSIs = 0; // Number of select instructions instrumented.
+ VisitMode Mode = VM_counting; // Visiting mode.
+ unsigned *CurCtrIdx = nullptr; // Pointer to current counter index.
+ unsigned TotalNumCtrs = 0; // Total number of counters
+ GlobalVariable *FuncNameVar = nullptr;
+ uint64_t FuncHash = 0;
+ PGOUseFunc *UseFunc = nullptr;
+
+ SelectInstVisitor(Function &Func) : F(Func) {}
+
+ void countSelects(Function &Func) {
+ Mode = VM_counting;
+ visit(Func);
+ }
+  // Visit the IR stream and instrument all select instructions. \p Ind is a
+  // pointer to the counter index variable; \p TotalNC is the total number of
+  // counters; \p FNV is the pointer to the PGO function name var; \p FHash is
+  // the function hash.
+ void instrumentSelects(Function &Func, unsigned *Ind, unsigned TotalNC,
+ GlobalVariable *FNV, uint64_t FHash) {
+ Mode = VM_instrument;
+ CurCtrIdx = Ind;
+ TotalNumCtrs = TotalNC;
+ FuncHash = FHash;
+ FuncNameVar = FNV;
+ visit(Func);
+ }
+
+ // Visit the IR stream and annotate all select instructions.
+ void annotateSelects(Function &Func, PGOUseFunc *UF, unsigned *Ind) {
+ Mode = VM_annotate;
+ UseFunc = UF;
+ CurCtrIdx = Ind;
+ visit(Func);
+ }
+
+ void instrumentOneSelectInst(SelectInst &SI);
+ void annotateOneSelectInst(SelectInst &SI);
+ // Visit \p SI instruction and perform tasks according to visit mode.
+ void visitSelectInst(SelectInst &SI);
+ unsigned getNumOfSelectInsts() const { return NSIs; }
+};
+
class PGOInstrumentationGenLegacyPass : public ModulePass {
public:
static char ID;
@@ -132,9 +208,7 @@ public:
*PassRegistry::getPassRegistry());
}
- const char *getPassName() const override {
- return "PGOInstrumentationGenPass";
- }
+ StringRef getPassName() const override { return "PGOInstrumentationGenPass"; }
private:
bool runOnModule(Module &M) override;
@@ -157,9 +231,7 @@ public:
*PassRegistry::getPassRegistry());
}
- const char *getPassName() const override {
- return "PGOInstrumentationUsePass";
- }
+ StringRef getPassName() const override { return "PGOInstrumentationUsePass"; }
private:
std::string ProfileFileName;
@@ -169,6 +241,7 @@ private:
AU.addRequired<BlockFrequencyInfoWrapperPass>();
}
};
+
} // end anonymous namespace
char PGOInstrumentationGenLegacyPass::ID = 0;
@@ -238,8 +311,13 @@ template <class Edge, class BBInfo> class FuncPGOInstrumentation {
private:
Function &F;
void computeCFGHash();
+ void renameComdatFunction();
+  // A map from each Comdat group in the module to its member GlobalValues.
+ std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers;
public:
+ std::vector<Instruction *> IndirectCallSites;
+ SelectInstVisitor SIVisitor;
std::string FuncName;
GlobalVariable *FuncNameVar;
// CFG hash value for this function.
@@ -255,18 +333,32 @@ public:
// Return the auxiliary BB information.
BBInfo &getBBInfo(const BasicBlock *BB) const { return MST.getBBInfo(BB); }
+ // Return the auxiliary BB information if available.
+ BBInfo *findBBInfo(const BasicBlock *BB) const { return MST.findBBInfo(BB); }
+
// Dump edges and BB information.
void dumpInfo(std::string Str = "") const {
MST.dumpEdges(dbgs(), Twine("Dump Function ") + FuncName + " Hash: " +
Twine(FunctionHash) + "\t" + Str);
}
- FuncPGOInstrumentation(Function &Func, bool CreateGlobalVar = false,
- BranchProbabilityInfo *BPI = nullptr,
- BlockFrequencyInfo *BFI = nullptr)
- : F(Func), FunctionHash(0), MST(F, BPI, BFI) {
+ FuncPGOInstrumentation(
+ Function &Func,
+ std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
+ bool CreateGlobalVar = false, BranchProbabilityInfo *BPI = nullptr,
+ BlockFrequencyInfo *BFI = nullptr)
+ : F(Func), ComdatMembers(ComdatMembers), SIVisitor(Func), FunctionHash(0),
+ MST(F, BPI, BFI) {
+
+ // This should be done before CFG hash computation.
+ SIVisitor.countSelects(Func);
+ NumOfPGOSelectInsts += SIVisitor.getNumOfSelectInsts();
+ IndirectCallSites = findIndirectCallSites(Func);
+
FuncName = getPGOFuncName(F);
computeCFGHash();
+ if (ComdatMembers.size())
+ renameComdatFunction();
DEBUG(dumpInfo("after CFGMST"));
NumOfPGOBB += MST.BBInfos.size();
@@ -281,6 +373,16 @@ public:
if (CreateGlobalVar)
FuncNameVar = createPGOFuncNameVar(F, FuncName);
}
+
+ // Return the number of profile counters needed for the function.
+ unsigned getNumCounters() {
+ unsigned NumCounters = 0;
+ for (auto &E : this->MST.AllEdges) {
+ if (!E->InMST && !E->Removed)
+ NumCounters++;
+ }
+ return NumCounters + SIVisitor.getNumOfSelectInsts();
+ }
};
// Compute Hash value for the CFG: the lower 32 bits are CRC32 of the index
@@ -293,13 +395,90 @@ void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() {
const TerminatorInst *TI = BB.getTerminator();
for (unsigned I = 0, E = TI->getNumSuccessors(); I != E; ++I) {
BasicBlock *Succ = TI->getSuccessor(I);
- uint32_t Index = getBBInfo(Succ).Index;
+ auto BI = findBBInfo(Succ);
+ if (BI == nullptr)
+ continue;
+ uint32_t Index = BI->Index;
for (int J = 0; J < 4; J++)
Indexes.push_back((char)(Index >> (J * 8)));
}
}
JC.update(Indexes);
- FunctionHash = (uint64_t)MST.AllEdges.size() << 32 | JC.getCRC();
+ FunctionHash = (uint64_t)SIVisitor.getNumOfSelectInsts() << 56 |
+ (uint64_t)IndirectCallSites.size() << 48 |
+ (uint64_t)MST.AllEdges.size() << 32 | JC.getCRC();
+}
+
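With this change the function hash is no longer just the edge count plus the CFG CRC32; the select-instruction and indirect-call-site counts are packed into the upper bits as well. A minimal sketch of the packing as written above (the fields are not masked, so very large counts can bleed into neighboring fields, exactly as in the code):

    // Hash layout used by computeCFGHash above.
    #include <cstdint>
    uint64_t makeFunctionHash(uint64_t NumSelects, uint64_t NumIndirectCalls,
                              uint64_t NumEdges, uint32_t CfgCrc32) {
      return (NumSelects << 56) | (NumIndirectCalls << 48) |
             (NumEdges << 32) | CfgCrc32;
    }
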
+// Check if we can safely rename this Comdat function.
+static bool canRenameComdat(
+ Function &F,
+ std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers) {
+ if (!DoComdatRenaming || !canRenameComdatFunc(F, true))
+ return false;
+
+  // FIXME: Currently we only handle Comdat groups that contain a single
+  // function plus function aliases.
+  // (1) For a Comdat group containing multiple functions, we would need a
+  // unique postfix based on the hash of each function; doing this efficiently
+  // requires non-trivial refactoring.
+  // (2) Variables cannot be renamed, so we cannot rename a Comdat function in
+  // a group that includes global variables.
+ Comdat *C = F.getComdat();
+ for (auto &&CM : make_range(ComdatMembers.equal_range(C))) {
+ if (dyn_cast<GlobalAlias>(CM.second))
+ continue;
+ Function *FM = dyn_cast<Function>(CM.second);
+ if (FM != &F)
+ return false;
+ }
+ return true;
+}
+
+// Append the function hash to the Comdat function name.
+template <class Edge, class BBInfo>
+void FuncPGOInstrumentation<Edge, BBInfo>::renameComdatFunction() {
+ if (!canRenameComdat(F, ComdatMembers))
+ return;
+ std::string OrigName = F.getName().str();
+ std::string NewFuncName =
+ Twine(F.getName() + "." + Twine(FunctionHash)).str();
+ F.setName(Twine(NewFuncName));
+ GlobalAlias::create(GlobalValue::WeakAnyLinkage, OrigName, &F);
+ FuncName = Twine(FuncName + "." + Twine(FunctionHash)).str();
+ Comdat *NewComdat;
+ Module *M = F.getParent();
+ // For AvailableExternallyLinkage functions, change the linkage to
+ // LinkOnceODR and put them into comdat. This is because after renaming, there
+ // is no backup external copy available for the function.
+ if (!F.hasComdat()) {
+ assert(F.getLinkage() == GlobalValue::AvailableExternallyLinkage);
+ NewComdat = M->getOrInsertComdat(StringRef(NewFuncName));
+ F.setLinkage(GlobalValue::LinkOnceODRLinkage);
+ F.setComdat(NewComdat);
+ return;
+ }
+
+ // This function belongs to a single function Comdat group.
+ Comdat *OrigComdat = F.getComdat();
+ std::string NewComdatName =
+ Twine(OrigComdat->getName() + "." + Twine(FunctionHash)).str();
+ NewComdat = M->getOrInsertComdat(StringRef(NewComdatName));
+ NewComdat->setSelectionKind(OrigComdat->getSelectionKind());
+
+ for (auto &&CM : make_range(ComdatMembers.equal_range(OrigComdat))) {
+ if (GlobalAlias *GA = dyn_cast<GlobalAlias>(CM.second)) {
+ // For aliases, change the name directly.
+ assert(dyn_cast<Function>(GA->getAliasee()->stripPointerCasts()) == &F);
+ std::string OrigGAName = GA->getName().str();
+ GA->setName(Twine(GA->getName() + "." + Twine(FunctionHash)));
+ GlobalAlias::create(GlobalValue::WeakAnyLinkage, OrigGAName, GA);
+ continue;
+ }
+ // Must be a function.
+ Function *CF = dyn_cast<Function>(CM.second);
+ assert(CF);
+ CF->setComdat(NewComdat);
+ }
}
// Given a CFG E to be instrumented, find which BB to place the instrumented
@@ -340,15 +519,12 @@ BasicBlock *FuncPGOInstrumentation<Edge, BBInfo>::getInstrBB(Edge *E) {
// Visit all edge and instrument the edges not in MST, and do value profiling.
// Critical edges will be split.
-static void instrumentOneFunc(Function &F, Module *M,
- BranchProbabilityInfo *BPI,
- BlockFrequencyInfo *BFI) {
- unsigned NumCounters = 0;
- FuncPGOInstrumentation<PGOEdge, BBInfo> FuncInfo(F, true, BPI, BFI);
- for (auto &E : FuncInfo.MST.AllEdges) {
- if (!E->InMST && !E->Removed)
- NumCounters++;
- }
+static void instrumentOneFunc(
+ Function &F, Module *M, BranchProbabilityInfo *BPI, BlockFrequencyInfo *BFI,
+ std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers) {
+ FuncPGOInstrumentation<PGOEdge, BBInfo> FuncInfo(F, ComdatMembers, true, BPI,
+ BFI);
+ unsigned NumCounters = FuncInfo.getNumCounters();
uint32_t I = 0;
Type *I8PtrTy = Type::getInt8PtrTy(M->getContext());
@@ -367,11 +543,16 @@ static void instrumentOneFunc(Function &F, Module *M,
Builder.getInt32(I++)});
}
+ // Now instrument select instructions:
+ FuncInfo.SIVisitor.instrumentSelects(F, &I, NumCounters, FuncInfo.FuncNameVar,
+ FuncInfo.FunctionHash);
+ assert(I == NumCounters);
+
if (DisableValueProfiling)
return;
unsigned NumIndirectCallSites = 0;
- for (auto &I : findIndirectCallSites(F)) {
+ for (auto &I : FuncInfo.IndirectCallSites) {
CallSite CS(I);
Value *Callee = CS.getCalledValue();
DEBUG(dbgs() << "Instrument one indirect call: CallSite Index = "
@@ -456,10 +637,12 @@ static uint64_t sumEdgeCount(const ArrayRef<PGOUseEdge *> Edges) {
class PGOUseFunc {
public:
- PGOUseFunc(Function &Func, Module *Modu, BranchProbabilityInfo *BPI = nullptr,
+ PGOUseFunc(Function &Func, Module *Modu,
+ std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
+ BranchProbabilityInfo *BPI = nullptr,
BlockFrequencyInfo *BFI = nullptr)
- : F(Func), M(Modu), FuncInfo(Func, false, BPI, BFI),
- FreqAttr(FFA_Normal) {}
+ : F(Func), M(Modu), FuncInfo(Func, ComdatMembers, false, BPI, BFI),
+ CountPosition(0), ProfileCountSize(0), FreqAttr(FFA_Normal) {}
// Read counts for the instrumented BB from profile.
bool readCounters(IndexedInstrProfReader *PGOReader);
@@ -479,24 +662,37 @@ public:
// Return the function hotness from the profile.
FuncFreqAttr getFuncFreqAttr() const { return FreqAttr; }
+ // Return the function hash.
+ uint64_t getFuncHash() const { return FuncInfo.FunctionHash; }
// Return the profile record for this function;
InstrProfRecord &getProfileRecord() { return ProfileRecord; }
+ // Return the auxiliary BB information.
+ UseBBInfo &getBBInfo(const BasicBlock *BB) const {
+ return FuncInfo.getBBInfo(BB);
+ }
+
+ // Return the auxiliary BB information if available.
+ UseBBInfo *findBBInfo(const BasicBlock *BB) const {
+ return FuncInfo.findBBInfo(BB);
+ }
+
private:
Function &F;
Module *M;
// This member stores the shared information with class PGOGenFunc.
FuncPGOInstrumentation<PGOUseEdge, UseBBInfo> FuncInfo;
- // Return the auxiliary BB information.
- UseBBInfo &getBBInfo(const BasicBlock *BB) const {
- return FuncInfo.getBBInfo(BB);
- }
-
// The maximum count value in the profile. This is only used in PGO use
// compilation.
uint64_t ProgramMaxCount;
+ // Position of counter that remains to be read.
+ uint32_t CountPosition;
+
+ // Total size of the profile count for this function.
+ uint32_t ProfileCountSize;
+
// ProfileRecord for this function.
InstrProfRecord ProfileRecord;
@@ -535,6 +731,7 @@ private:
void PGOUseFunc::setInstrumentedCounts(
const std::vector<uint64_t> &CountFromProfile) {
+ assert(FuncInfo.getNumCounters() == CountFromProfile.size());
// Use a worklist as we will update the vector during the iteration.
std::vector<PGOUseEdge *> WorkList;
for (auto &E : FuncInfo.MST.AllEdges)
@@ -564,6 +761,8 @@ void PGOUseFunc::setInstrumentedCounts(
NewEdge1.InMST = true;
getBBInfo(InstrBB).setBBInfoCount(CountValue);
}
+ ProfileCountSize = CountFromProfile.size();
+ CountPosition = I;
}
// Set the count value for the unknown edge. There should be one and only one
@@ -594,11 +793,15 @@ bool PGOUseFunc::readCounters(IndexedInstrProfReader *PGOReader) {
bool SkipWarning = false;
if (Err == instrprof_error::unknown_function) {
NumOfPGOMissing++;
- SkipWarning = NoPGOWarnMissing;
+ SkipWarning = !PGOWarnMissing;
} else if (Err == instrprof_error::hash_mismatch ||
Err == instrprof_error::malformed) {
NumOfPGOMismatch++;
- SkipWarning = NoPGOWarnMismatch;
+ SkipWarning =
+ NoPGOWarnMismatch ||
+ (NoPGOWarnMismatchComdat &&
+ (F.hasComdat() ||
+ F.getLinkage() == GlobalValue::AvailableExternallyLinkage));
}
if (SkipWarning)
@@ -663,27 +866,38 @@ void PGOUseFunc::populateCounters() {
// For efficient traversal, it's better to start from the end as most
// of the instrumented edges are at the end.
for (auto &BB : reverse(F)) {
- UseBBInfo &Count = getBBInfo(&BB);
- if (!Count.CountValid) {
- if (Count.UnknownCountOutEdge == 0) {
- Count.CountValue = sumEdgeCount(Count.OutEdges);
- Count.CountValid = true;
+ UseBBInfo *Count = findBBInfo(&BB);
+ if (Count == nullptr)
+ continue;
+ if (!Count->CountValid) {
+ if (Count->UnknownCountOutEdge == 0) {
+ Count->CountValue = sumEdgeCount(Count->OutEdges);
+ Count->CountValid = true;
Changes = true;
- } else if (Count.UnknownCountInEdge == 0) {
- Count.CountValue = sumEdgeCount(Count.InEdges);
- Count.CountValid = true;
+ } else if (Count->UnknownCountInEdge == 0) {
+ Count->CountValue = sumEdgeCount(Count->InEdges);
+ Count->CountValid = true;
Changes = true;
}
}
- if (Count.CountValid) {
- if (Count.UnknownCountOutEdge == 1) {
- uint64_t Total = Count.CountValue - sumEdgeCount(Count.OutEdges);
- setEdgeCount(Count.OutEdges, Total);
+ if (Count->CountValid) {
+ if (Count->UnknownCountOutEdge == 1) {
+ uint64_t Total = 0;
+ uint64_t OutSum = sumEdgeCount(Count->OutEdges);
+      // If one of the successor blocks can terminate early (no-return), we
+      // can end up with a situation where the out-edge sum count is larger,
+      // as the source BB's count is collected by a post-dominated block.
+ if (Count->CountValue > OutSum)
+ Total = Count->CountValue - OutSum;
+ setEdgeCount(Count->OutEdges, Total);
Changes = true;
}
- if (Count.UnknownCountInEdge == 1) {
- uint64_t Total = Count.CountValue - sumEdgeCount(Count.InEdges);
- setEdgeCount(Count.InEdges, Total);
+ if (Count->UnknownCountInEdge == 1) {
+ uint64_t Total = 0;
+ uint64_t InSum = sumEdgeCount(Count->InEdges);
+ if (Count->CountValue > InSum)
+ Total = Count->CountValue - InSum;
+ setEdgeCount(Count->InEdges, Total);
Changes = true;
}
}
@@ -693,24 +907,50 @@ void PGOUseFunc::populateCounters() {
DEBUG(dbgs() << "Populate counts in " << NumPasses << " passes.\n");
#ifndef NDEBUG
// Assert every BB has a valid counter.
- for (auto &BB : F)
- assert(getBBInfo(&BB).CountValid && "BB count is not valid");
+ for (auto &BB : F) {
+ auto BI = findBBInfo(&BB);
+ if (BI == nullptr)
+ continue;
+ assert(BI->CountValid && "BB count is not valid");
+ }
#endif
uint64_t FuncEntryCount = getBBInfo(&*F.begin()).CountValue;
F.setEntryCount(FuncEntryCount);
uint64_t FuncMaxCount = FuncEntryCount;
- for (auto &BB : F)
- FuncMaxCount = std::max(FuncMaxCount, getBBInfo(&BB).CountValue);
+ for (auto &BB : F) {
+ auto BI = findBBInfo(&BB);
+ if (BI == nullptr)
+ continue;
+ FuncMaxCount = std::max(FuncMaxCount, BI->CountValue);
+ }
markFunctionAttributes(FuncEntryCount, FuncMaxCount);
+ // Now annotate select instructions
+ FuncInfo.SIVisitor.annotateSelects(F, this, &CountPosition);
+ assert(CountPosition == ProfileCountSize);
+
DEBUG(FuncInfo.dumpInfo("after reading profile."));
}
+static void setProfMetadata(Module *M, Instruction *TI,
+ ArrayRef<uint64_t> EdgeCounts, uint64_t MaxCount) {
+ MDBuilder MDB(M->getContext());
+ assert(MaxCount > 0 && "Bad max count");
+ uint64_t Scale = calculateCountScale(MaxCount);
+ SmallVector<unsigned, 4> Weights;
+ for (const auto &ECI : EdgeCounts)
+ Weights.push_back(scaleBranchCount(ECI, Scale));
+
+ DEBUG(dbgs() << "Weight is: ";
+ for (const auto &W : Weights) { dbgs() << W << " "; }
+ dbgs() << "\n";);
+ TI->setMetadata(llvm::LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
+}
+
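setProfMetadata centralizes the branch-weight emission that used to live inline in setBranchWeights, and is now reused for select instructions as well. The helpers calculateCountScale and scaleBranchCount are defined elsewhere in this file; a plausible reading (an assumption, not the verbatim implementation) is that they scale the 64-bit profile counts down so the largest one fits in the 32-bit weights that MD_prof expects:

    // Assumed behaviour of the scaling helpers (sketch only).
    #include <cstdint>
    #include <limits>
    uint64_t calculateCountScaleSketch(uint64_t MaxCount) {
      return MaxCount < std::numeric_limits<uint32_t>::max()
                 ? 1
                 : MaxCount / std::numeric_limits<uint32_t>::max() + 1;
    }
    unsigned scaleBranchCountSketch(uint64_t Count, uint64_t Scale) {
      return static_cast<unsigned>(Count / Scale); // fits in 32 bits by construction
    }
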
// Assign the scaled count values to the BB with multiple out edges.
void PGOUseFunc::setBranchWeights() {
// Generate MD_prof metadata for every branch instruction.
DEBUG(dbgs() << "\nSetting branch weights.\n");
- MDBuilder MDB(M->getContext());
for (auto &BB : F) {
TerminatorInst *TI = BB.getTerminator();
if (TI->getNumSuccessors() < 2)
@@ -723,7 +963,7 @@ void PGOUseFunc::setBranchWeights() {
// We have a non-zero Branch BB.
const UseBBInfo &BBCountInfo = getBBInfo(&BB);
unsigned Size = BBCountInfo.OutEdges.size();
- SmallVector<unsigned, 2> EdgeCounts(Size, 0);
+ SmallVector<uint64_t, 2> EdgeCounts(Size, 0);
uint64_t MaxCount = 0;
for (unsigned s = 0; s < Size; s++) {
const PGOUseEdge *E = BBCountInfo.OutEdges[s];
@@ -737,20 +977,64 @@ void PGOUseFunc::setBranchWeights() {
MaxCount = EdgeCount;
EdgeCounts[SuccNum] = EdgeCount;
}
- assert(MaxCount > 0 && "Bad max count");
- uint64_t Scale = calculateCountScale(MaxCount);
- SmallVector<unsigned, 4> Weights;
- for (const auto &ECI : EdgeCounts)
- Weights.push_back(scaleBranchCount(ECI, Scale));
-
- TI->setMetadata(llvm::LLVMContext::MD_prof,
- MDB.createBranchWeights(Weights));
- DEBUG(dbgs() << "Weight is: ";
- for (const auto &W : Weights) { dbgs() << W << " "; }
- dbgs() << "\n";);
+ setProfMetadata(M, TI, EdgeCounts, MaxCount);
}
}
+void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) {
+ Module *M = F.getParent();
+ IRBuilder<> Builder(&SI);
+ Type *Int64Ty = Builder.getInt64Ty();
+ Type *I8PtrTy = Builder.getInt8PtrTy();
+ auto *Step = Builder.CreateZExt(SI.getCondition(), Int64Ty);
+ Builder.CreateCall(
+ Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment_step),
+ {llvm::ConstantExpr::getBitCast(FuncNameVar, I8PtrTy),
+ Builder.getInt64(FuncHash),
+ Builder.getInt32(TotalNumCtrs), Builder.getInt32(*CurCtrIdx), Step});
+ ++(*CurCtrIdx);
+}
+
+void SelectInstVisitor::annotateOneSelectInst(SelectInst &SI) {
+ std::vector<uint64_t> &CountFromProfile = UseFunc->getProfileRecord().Counts;
+ assert(*CurCtrIdx < CountFromProfile.size() &&
+ "Out of bound access of counters");
+ uint64_t SCounts[2];
+ SCounts[0] = CountFromProfile[*CurCtrIdx]; // True count
+ ++(*CurCtrIdx);
+ uint64_t TotalCount = 0;
+ auto BI = UseFunc->findBBInfo(SI.getParent());
+ if (BI != nullptr)
+ TotalCount = BI->CountValue;
+ // False Count
+ SCounts[1] = (TotalCount > SCounts[0] ? TotalCount - SCounts[0] : 0);
+ uint64_t MaxCount = std::max(SCounts[0], SCounts[1]);
+ if (MaxCount)
+ setProfMetadata(F.getParent(), &SI, SCounts, MaxCount);
+}
+
+void SelectInstVisitor::visitSelectInst(SelectInst &SI) {
+ if (!PGOInstrSelect)
+ return;
+  // FIXME: vector-typed select conditions are not handled yet.
+ if (SI.getCondition()->getType()->isVectorTy())
+ return;
+
+ NSIs++;
+ switch (Mode) {
+ case VM_counting:
+ return;
+ case VM_instrument:
+ instrumentOneSelectInst(SI);
+ return;
+ case VM_annotate:
+ annotateOneSelectInst(SI);
+ return;
+ }
+
+ llvm_unreachable("Unknown visiting mode");
+}
+
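Taken together, the three visitor hooks above give each select one profile counter: instrumentation bumps that counter by the zero-extended condition (so it records how often the true arm was taken), and annotation later recovers the false count by subtracting from the enclosing block's count. A rough C++ model of the bookkeeping (illustrative names only; the real work goes through the instrprof_increment_step intrinsic and MD_prof metadata):

    // Sketch of what one instrumented select contributes and how the profile
    // is turned back into branch weights (assumed simplification).
    #include <cstdint>
    void countSelect(uint64_t *Counters, unsigned Idx, bool Cond) {
      Counters[Idx] += Cond ? 1 : 0;   // increment-step with Step = zext(Cond)
    }
    void selectWeights(uint64_t CounterValue, uint64_t BlockCount,
                       uint64_t &TrueCount, uint64_t &FalseCount) {
      TrueCount = CounterValue;
      FalseCount = BlockCount > TrueCount ? BlockCount - TrueCount : 0;
    }
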
// Traverse all the indirect callsites and annotate the instructions.
void PGOUseFunc::annotateIndirectCallSites() {
if (DisableValueProfiling)
@@ -760,7 +1044,7 @@ void PGOUseFunc::annotateIndirectCallSites() {
createPGOFuncNameMetadata(F, FuncInfo.FuncName);
unsigned IndirectCallSiteIndex = 0;
- auto IndirectCallSites = findIndirectCallSites(F);
+ auto &IndirectCallSites = FuncInfo.IndirectCallSites;
unsigned NumValueSites =
ProfileRecord.getNumValueSites(IPVK_IndirectCallTarget);
if (NumValueSites != IndirectCallSites.size()) {
@@ -784,7 +1068,7 @@ void PGOUseFunc::annotateIndirectCallSites() {
}
} // end anonymous namespace
-// Create a COMDAT variable IR_LEVEL_PROF_VARNAME to make the runtime
+// Create a COMDAT variable INSTR_PROF_RAW_VERSION_VAR to make the runtime
// aware this is an ir_level profile so it can set the version flag.
static void createIRLevelProfileFlagVariable(Module &M) {
Type *IntTy64 = Type::getInt64Ty(M.getContext());
@@ -792,26 +1076,47 @@ static void createIRLevelProfileFlagVariable(Module &M) {
auto IRLevelVersionVariable = new GlobalVariable(
M, IntTy64, true, GlobalVariable::ExternalLinkage,
Constant::getIntegerValue(IntTy64, APInt(64, ProfileVersion)),
- INSTR_PROF_QUOTE(IR_LEVEL_PROF_VERSION_VAR));
+ INSTR_PROF_QUOTE(INSTR_PROF_RAW_VERSION_VAR));
IRLevelVersionVariable->setVisibility(GlobalValue::DefaultVisibility);
Triple TT(M.getTargetTriple());
if (!TT.supportsCOMDAT())
IRLevelVersionVariable->setLinkage(GlobalValue::WeakAnyLinkage);
else
IRLevelVersionVariable->setComdat(M.getOrInsertComdat(
- StringRef(INSTR_PROF_QUOTE(IR_LEVEL_PROF_VERSION_VAR))));
+ StringRef(INSTR_PROF_QUOTE(INSTR_PROF_RAW_VERSION_VAR))));
+}
+
+// Collect the set of members for each Comdat in module M and store
+// in ComdatMembers.
+static void collectComdatMembers(
+ Module &M,
+ std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers) {
+ if (!DoComdatRenaming)
+ return;
+ for (Function &F : M)
+ if (Comdat *C = F.getComdat())
+ ComdatMembers.insert(std::make_pair(C, &F));
+ for (GlobalVariable &GV : M.globals())
+ if (Comdat *C = GV.getComdat())
+ ComdatMembers.insert(std::make_pair(C, &GV));
+ for (GlobalAlias &GA : M.aliases())
+ if (Comdat *C = GA.getComdat())
+ ComdatMembers.insert(std::make_pair(C, &GA));
}
static bool InstrumentAllFunctions(
Module &M, function_ref<BranchProbabilityInfo *(Function &)> LookupBPI,
function_ref<BlockFrequencyInfo *(Function &)> LookupBFI) {
createIRLevelProfileFlagVariable(M);
+ std::unordered_multimap<Comdat *, GlobalValue *> ComdatMembers;
+ collectComdatMembers(M, ComdatMembers);
+
for (auto &F : M) {
if (F.isDeclaration())
continue;
auto *BPI = LookupBPI(F);
auto *BFI = LookupBFI(F);
- instrumentOneFunc(F, &M, BPI, BFI);
+ instrumentOneFunc(F, &M, BPI, BFI, ComdatMembers);
}
return true;
}
@@ -830,7 +1135,7 @@ bool PGOInstrumentationGenLegacyPass::runOnModule(Module &M) {
}
PreservedAnalyses PGOInstrumentationGen::run(Module &M,
- AnalysisManager<Module> &AM) {
+ ModuleAnalysisManager &AM) {
auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
auto LookupBPI = [&FAM](Function &F) {
@@ -877,6 +1182,8 @@ static bool annotateAllFunctions(
return false;
}
+ std::unordered_multimap<Comdat *, GlobalValue *> ComdatMembers;
+ collectComdatMembers(M, ComdatMembers);
std::vector<Function *> HotFunctions;
std::vector<Function *> ColdFunctions;
for (auto &F : M) {
@@ -884,7 +1191,7 @@ static bool annotateAllFunctions(
continue;
auto *BPI = LookupBPI(F);
auto *BFI = LookupBFI(F);
- PGOUseFunc Func(F, &M, BPI, BFI);
+ PGOUseFunc Func(F, &M, ComdatMembers, BPI, BFI);
if (!Func.readCounters(PGOReader.get()))
continue;
Func.populateCounters();
@@ -910,7 +1217,6 @@ static bool annotateAllFunctions(
F->addFnAttr(llvm::Attribute::Cold);
DEBUG(dbgs() << "Set cold attribute to function: " << F->getName() << "\n");
}
-
return true;
}
@@ -921,7 +1227,7 @@ PGOInstrumentationUse::PGOInstrumentationUse(std::string Filename)
}
PreservedAnalyses PGOInstrumentationUse::run(Module &M,
- AnalysisManager<Module> &AM) {
+ ModuleAnalysisManager &AM) {
auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
auto LookupBPI = [&FAM](Function &F) {
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index 7d40447..5b4b1fb 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -67,11 +67,23 @@ static const char *const SanCovTraceEnterName =
static const char *const SanCovTraceBBName =
"__sanitizer_cov_trace_basic_block";
static const char *const SanCovTracePCName = "__sanitizer_cov_trace_pc";
-static const char *const SanCovTraceCmpName = "__sanitizer_cov_trace_cmp";
+static const char *const SanCovTraceCmp1 = "__sanitizer_cov_trace_cmp1";
+static const char *const SanCovTraceCmp2 = "__sanitizer_cov_trace_cmp2";
+static const char *const SanCovTraceCmp4 = "__sanitizer_cov_trace_cmp4";
+static const char *const SanCovTraceCmp8 = "__sanitizer_cov_trace_cmp8";
+static const char *const SanCovTraceDiv4 = "__sanitizer_cov_trace_div4";
+static const char *const SanCovTraceDiv8 = "__sanitizer_cov_trace_div8";
+static const char *const SanCovTraceGep = "__sanitizer_cov_trace_gep";
static const char *const SanCovTraceSwitchName = "__sanitizer_cov_trace_switch";
static const char *const SanCovModuleCtorName = "sancov.module_ctor";
static const uint64_t SanCtorAndDtorPriority = 2;
+static const char *const SanCovTracePCGuardSection = "__sancov_guards";
+static const char *const SanCovTracePCGuardName =
+ "__sanitizer_cov_trace_pc_guard";
+static const char *const SanCovTracePCGuardInitName =
+ "__sanitizer_cov_trace_pc_guard_init";
+
static cl::opt<int> ClCoverageLevel(
"sanitizer-coverage-level",
cl::desc("Sanitizer Coverage. 0: none, 1: entry block, 2: all blocks, "
@@ -95,11 +107,22 @@ static cl::opt<bool> ClExperimentalTracePC("sanitizer-coverage-trace-pc",
cl::desc("Experimental pc tracing"),
cl::Hidden, cl::init(false));
+static cl::opt<bool> ClTracePCGuard("sanitizer-coverage-trace-pc-guard",
+ cl::desc("pc tracing with a guard"),
+ cl::Hidden, cl::init(false));
+
static cl::opt<bool>
- ClExperimentalCMPTracing("sanitizer-coverage-experimental-trace-compares",
- cl::desc("Experimental tracing of CMP and similar "
- "instructions"),
- cl::Hidden, cl::init(false));
+ ClCMPTracing("sanitizer-coverage-trace-compares",
+ cl::desc("Tracing of CMP and similar instructions"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClDIVTracing("sanitizer-coverage-trace-divs",
+ cl::desc("Tracing of DIV instructions"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClGEPTracing("sanitizer-coverage-trace-geps",
+ cl::desc("Tracing of GEP instructions"),
+ cl::Hidden, cl::init(false));
static cl::opt<bool>
ClPruneBlocks("sanitizer-coverage-prune-blocks",
@@ -147,9 +170,12 @@ SanitizerCoverageOptions OverrideFromCL(SanitizerCoverageOptions Options) {
Options.CoverageType = std::max(Options.CoverageType, CLOpts.CoverageType);
Options.IndirectCalls |= CLOpts.IndirectCalls;
Options.TraceBB |= ClExperimentalTracing;
- Options.TraceCmp |= ClExperimentalCMPTracing;
+ Options.TraceCmp |= ClCMPTracing;
+ Options.TraceDiv |= ClDIVTracing;
+ Options.TraceGep |= ClGEPTracing;
Options.Use8bitCounters |= ClUse8bitCounters;
Options.TracePC |= ClExperimentalTracePC;
+ Options.TracePCGuard |= ClTracePCGuard;
return Options;
}
@@ -163,7 +189,7 @@ public:
bool runOnModule(Module &M) override;
bool runOnFunction(Function &F);
static char ID; // Pass identification, replacement for typeid
- const char *getPassName() const override { return "SanitizerCoverageModule"; }
+ StringRef getPassName() const override { return "SanitizerCoverageModule"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<DominatorTreeWrapperPass>();
@@ -174,11 +200,17 @@ private:
void InjectCoverageForIndirectCalls(Function &F,
ArrayRef<Instruction *> IndirCalls);
void InjectTraceForCmp(Function &F, ArrayRef<Instruction *> CmpTraceTargets);
+ void InjectTraceForDiv(Function &F,
+ ArrayRef<BinaryOperator *> DivTraceTargets);
+ void InjectTraceForGep(Function &F,
+ ArrayRef<GetElementPtrInst *> GepTraceTargets);
void InjectTraceForSwitch(Function &F,
ArrayRef<Instruction *> SwitchTraceTargets);
bool InjectCoverage(Function &F, ArrayRef<BasicBlock *> AllBlocks);
+ void CreateFunctionGuardArray(size_t NumGuards, Function &F);
void SetNoSanitizeMetadata(Instruction *I);
- void InjectCoverageAtBlock(Function &F, BasicBlock &BB, bool UseCalls);
+ void InjectCoverageAtBlock(Function &F, BasicBlock &BB, size_t Idx,
+ bool UseCalls);
unsigned NumberOfInstrumentedBlocks() {
return SanCovFunction->getNumUses() +
SanCovWithCheckFunction->getNumUses() + SanCovTraceBB->getNumUses() +
@@ -187,17 +219,21 @@ private:
Function *SanCovFunction;
Function *SanCovWithCheckFunction;
Function *SanCovIndirCallFunction, *SanCovTracePCIndir;
- Function *SanCovTraceEnter, *SanCovTraceBB, *SanCovTracePC;
- Function *SanCovTraceCmpFunction;
+ Function *SanCovTraceEnter, *SanCovTraceBB, *SanCovTracePC, *SanCovTracePCGuard;
+ Function *SanCovTraceCmpFunction[4];
+ Function *SanCovTraceDivFunction[2];
+ Function *SanCovTraceGepFunction;
Function *SanCovTraceSwitchFunction;
InlineAsm *EmptyAsm;
- Type *IntptrTy, *Int64Ty, *Int64PtrTy;
+ Type *IntptrTy, *IntptrPtrTy, *Int64Ty, *Int64PtrTy, *Int32Ty, *Int32PtrTy;
Module *CurModule;
LLVMContext *C;
const DataLayout *DL;
GlobalVariable *GuardArray;
+ GlobalVariable *FunctionGuardArray; // for trace-pc-guard.
GlobalVariable *EightBitCounterArray;
+ bool HasSancovGuardsSection;
SanitizerCoverageOptions Options;
};
@@ -210,13 +246,16 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
C = &(M.getContext());
DL = &M.getDataLayout();
CurModule = &M;
+ HasSancovGuardsSection = false;
IntptrTy = Type::getIntNTy(*C, DL->getPointerSizeInBits());
+ IntptrPtrTy = PointerType::getUnqual(IntptrTy);
Type *VoidTy = Type::getVoidTy(*C);
IRBuilder<> IRB(*C);
Type *Int8PtrTy = PointerType::getUnqual(IRB.getInt8Ty());
- Type *Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty());
Int64PtrTy = PointerType::getUnqual(IRB.getInt64Ty());
+ Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty());
Int64Ty = IRB.getInt64Ty();
+ Int32Ty = IRB.getInt32Ty();
SanCovFunction = checkSanitizerInterfaceFunction(
M.getOrInsertFunction(SanCovName, VoidTy, Int32PtrTy, nullptr));
@@ -227,9 +266,28 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
SanCovIndirCallFunction =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
SanCovIndirCallName, VoidTy, IntptrTy, IntptrTy, nullptr));
- SanCovTraceCmpFunction =
+ SanCovTraceCmpFunction[0] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ SanCovTraceCmp1, VoidTy, IRB.getInt8Ty(), IRB.getInt8Ty(), nullptr));
+ SanCovTraceCmpFunction[1] = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction(SanCovTraceCmp2, VoidTy, IRB.getInt16Ty(),
+ IRB.getInt16Ty(), nullptr));
+ SanCovTraceCmpFunction[2] = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction(SanCovTraceCmp4, VoidTy, IRB.getInt32Ty(),
+ IRB.getInt32Ty(), nullptr));
+ SanCovTraceCmpFunction[3] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- SanCovTraceCmpName, VoidTy, Int64Ty, Int64Ty, Int64Ty, nullptr));
+ SanCovTraceCmp8, VoidTy, Int64Ty, Int64Ty, nullptr));
+
+ SanCovTraceDivFunction[0] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ SanCovTraceDiv4, VoidTy, IRB.getInt32Ty(), nullptr));
+ SanCovTraceDivFunction[1] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ SanCovTraceDiv8, VoidTy, Int64Ty, nullptr));
+ SanCovTraceGepFunction =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ SanCovTraceGep, VoidTy, IntptrTy, nullptr));
SanCovTraceSwitchFunction =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
SanCovTraceSwitchName, VoidTy, Int64Ty, Int64PtrTy, nullptr));
@@ -241,6 +299,8 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
SanCovTracePC = checkSanitizerInterfaceFunction(
M.getOrInsertFunction(SanCovTracePCName, VoidTy, nullptr));
+ SanCovTracePCGuard = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ SanCovTracePCGuardName, VoidTy, Int32PtrTy, nullptr));
SanCovTraceEnter = checkSanitizerInterfaceFunction(
M.getOrInsertFunction(SanCovTraceEnterName, VoidTy, Int32PtrTy, nullptr));
SanCovTraceBB = checkSanitizerInterfaceFunction(
@@ -251,9 +311,10 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
Type *Int32Ty = IRB.getInt32Ty();
Type *Int8Ty = IRB.getInt8Ty();
- GuardArray =
- new GlobalVariable(M, Int32Ty, false, GlobalValue::ExternalLinkage,
- nullptr, "__sancov_gen_cov_tmp");
+ if (!Options.TracePCGuard)
+ GuardArray =
+ new GlobalVariable(M, Int32Ty, false, GlobalValue::ExternalLinkage,
+ nullptr, "__sancov_gen_cov_tmp");
if (Options.Use8bitCounters)
EightBitCounterArray =
new GlobalVariable(M, Int8Ty, false, GlobalVariable::ExternalLinkage,
@@ -264,17 +325,20 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
auto N = NumberOfInstrumentedBlocks();
- // Now we know how many elements we need. Create an array of guards
- // with one extra element at the beginning for the size.
- Type *Int32ArrayNTy = ArrayType::get(Int32Ty, N + 1);
- GlobalVariable *RealGuardArray = new GlobalVariable(
- M, Int32ArrayNTy, false, GlobalValue::PrivateLinkage,
- Constant::getNullValue(Int32ArrayNTy), "__sancov_gen_cov");
-
- // Replace the dummy array with the real one.
- GuardArray->replaceAllUsesWith(
- IRB.CreatePointerCast(RealGuardArray, Int32PtrTy));
- GuardArray->eraseFromParent();
+ GlobalVariable *RealGuardArray = nullptr;
+ if (!Options.TracePCGuard) {
+ // Now we know how many elements we need. Create an array of guards
+ // with one extra element at the beginning for the size.
+ Type *Int32ArrayNTy = ArrayType::get(Int32Ty, N + 1);
+ RealGuardArray = new GlobalVariable(
+ M, Int32ArrayNTy, false, GlobalValue::PrivateLinkage,
+ Constant::getNullValue(Int32ArrayNTy), "__sancov_gen_cov");
+
+ // Replace the dummy array with the real one.
+ GuardArray->replaceAllUsesWith(
+ IRB.CreatePointerCast(RealGuardArray, Int32PtrTy));
+ GuardArray->eraseFromParent();
+ }
GlobalVariable *RealEightBitCounterArray;
if (Options.Use8bitCounters) {
@@ -293,11 +357,30 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
// Create variable for module (compilation unit) name
Constant *ModNameStrConst =
ConstantDataArray::getString(M.getContext(), M.getName(), true);
- GlobalVariable *ModuleName =
- new GlobalVariable(M, ModNameStrConst->getType(), true,
- GlobalValue::PrivateLinkage, ModNameStrConst);
+ GlobalVariable *ModuleName = new GlobalVariable(
+ M, ModNameStrConst->getType(), true, GlobalValue::PrivateLinkage,
+ ModNameStrConst, "__sancov_gen_modname");
+ if (Options.TracePCGuard) {
+ if (HasSancovGuardsSection) {
+ Function *CtorFunc;
+ std::string SectionName(SanCovTracePCGuardSection);
+ GlobalVariable *Bounds[2];
+ const char *Prefix[2] = {"__start_", "__stop_"};
+ for (int i = 0; i < 2; i++) {
+ Bounds[i] = new GlobalVariable(M, Int32PtrTy, false,
+ GlobalVariable::ExternalLinkage, nullptr,
+ Prefix[i] + SectionName);
+ Bounds[i]->setVisibility(GlobalValue::HiddenVisibility);
+ }
+ std::tie(CtorFunc, std::ignore) = createSanitizerCtorAndInitFunctions(
+ M, SanCovModuleCtorName, SanCovTracePCGuardInitName,
+ {Int32PtrTy, Int32PtrTy},
+ {IRB.CreatePointerCast(Bounds[0], Int32PtrTy),
+ IRB.CreatePointerCast(Bounds[1], Int32PtrTy)});
- if (!Options.TracePC) {
+ appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority);
+ }
+ } else if (!Options.TracePC) {
Function *CtorFunc;
std::tie(CtorFunc, std::ignore) = createSanitizerCtorAndInitFunctions(
M, SanCovModuleCtorName, SanCovModuleInitName,
@@ -344,6 +427,14 @@ static bool isFullPostDominator(const BasicBlock *BB,
static bool shouldInstrumentBlock(const Function& F, const BasicBlock *BB, const DominatorTree *DT,
const PostDominatorTree *PDT) {
+ // Don't insert coverage for unreachable blocks: we will never call
+ // __sanitizer_cov() for them, so counting them in
+ // NumberOfInstrumentedBlocks() might complicate calculation of code coverage
+ // percentage. Also, unreachable instructions frequently have no debug
+ // locations.
+ if (isa<UnreachableInst>(BB->getTerminator()))
+ return false;
+
if (!ClPruneBlocks || &F.getEntryBlock() == BB)
return true;
@@ -355,6 +446,13 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) {
return false;
if (F.getName().find(".module_ctor") != std::string::npos)
return false; // Should not instrument sanitizer init functions.
+ if (F.getName().startswith("__sanitizer_"))
+ return false; // Don't instrument __sanitizer_* callbacks.
+ // Don't instrument MSVC CRT configuration helpers. They may run before normal
+ // initialization.
+ if (F.getName() == "__local_stdio_printf_options" ||
+ F.getName() == "__local_stdio_scanf_options")
+ return false;
// Don't instrument functions using SEH for now. Splitting basic blocks like
// we do for coverage breaks WinEHPrepare.
// FIXME: Remove this when SEH no longer uses landingpad pattern matching.
@@ -367,6 +465,8 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) {
SmallVector<BasicBlock *, 16> BlocksToInstrument;
SmallVector<Instruction *, 8> CmpTraceTargets;
SmallVector<Instruction *, 8> SwitchTraceTargets;
+ SmallVector<BinaryOperator *, 8> DivTraceTargets;
+ SmallVector<GetElementPtrInst *, 8> GepTraceTargets;
const DominatorTree *DT =
&getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
@@ -388,28 +488,53 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) {
if (isa<SwitchInst>(&Inst))
SwitchTraceTargets.push_back(&Inst);
}
- }
+ if (Options.TraceDiv)
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&Inst))
+ if (BO->getOpcode() == Instruction::SDiv ||
+ BO->getOpcode() == Instruction::UDiv)
+ DivTraceTargets.push_back(BO);
+ if (Options.TraceGep)
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Inst))
+ GepTraceTargets.push_back(GEP);
+ }
}
InjectCoverage(F, BlocksToInstrument);
InjectCoverageForIndirectCalls(F, IndirCalls);
InjectTraceForCmp(F, CmpTraceTargets);
InjectTraceForSwitch(F, SwitchTraceTargets);
+ InjectTraceForDiv(F, DivTraceTargets);
+ InjectTraceForGep(F, GepTraceTargets);
return true;
}
+void SanitizerCoverageModule::CreateFunctionGuardArray(size_t NumGuards,
+ Function &F) {
+ if (!Options.TracePCGuard) return;
+ HasSancovGuardsSection = true;
+ ArrayType *ArrayOfInt32Ty = ArrayType::get(Int32Ty, NumGuards);
+ FunctionGuardArray = new GlobalVariable(
+ *CurModule, ArrayOfInt32Ty, false, GlobalVariable::PrivateLinkage,
+ Constant::getNullValue(ArrayOfInt32Ty), "__sancov_gen_");
+ if (auto Comdat = F.getComdat())
+ FunctionGuardArray->setComdat(Comdat);
+ FunctionGuardArray->setSection(SanCovTracePCGuardSection);
+}
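
CreateFunctionGuardArray gives every instrumented function its own zero-initialized 32-bit guard slots in the __sancov_guards section; the linker-provided __start_/__stop_ symbols created above then bound the whole section, and the module constructor hands them to the runtime. The callbacks the instrumented code relies on therefore look roughly like this (signatures inferred from the declarations in this patch; the runtime side itself is not part of it):

    // Runtime-side sketch of the trace-pc-guard interface (assumed).
    #include <cstdint>
    extern "C" void __sanitizer_cov_trace_pc_guard_init(uint32_t *start,
                                                        uint32_t *stop);
    extern "C" void __sanitizer_cov_trace_pc_guard(uint32_t *guard);
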
bool SanitizerCoverageModule::InjectCoverage(Function &F,
ArrayRef<BasicBlock *> AllBlocks) {
+ if (AllBlocks.empty()) return false;
switch (Options.CoverageType) {
case SanitizerCoverageOptions::SCK_None:
return false;
case SanitizerCoverageOptions::SCK_Function:
- InjectCoverageAtBlock(F, F.getEntryBlock(), false);
+ CreateFunctionGuardArray(1, F);
+ InjectCoverageAtBlock(F, F.getEntryBlock(), 0, false);
return true;
default: {
bool UseCalls = ClCoverageBlockThreshold < AllBlocks.size();
- for (auto BB : AllBlocks)
- InjectCoverageAtBlock(F, *BB, UseCalls);
+ CreateFunctionGuardArray(AllBlocks.size(), F);
+ for (size_t i = 0, N = AllBlocks.size(); i < N; i++)
+ InjectCoverageAtBlock(F, *AllBlocks[i], i, UseCalls);
return true;
}
}
@@ -439,7 +564,7 @@ void SanitizerCoverageModule::InjectCoverageForIndirectCalls(
*F.getParent(), Ty, false, GlobalValue::PrivateLinkage,
Constant::getNullValue(Ty), "__sancov_gen_callee_cache");
CalleeCache->setAlignment(CacheAlignment);
- if (Options.TracePC)
+ if (Options.TracePC || Options.TracePCGuard)
IRB.CreateCall(SanCovTracePCIndir,
IRB.CreatePointerCast(Callee, IntptrTy));
else
@@ -476,6 +601,11 @@ void SanitizerCoverageModule::InjectTraceForSwitch(
C = ConstantExpr::getCast(CastInst::ZExt, It.getCaseValue(), Int64Ty);
Initializers.push_back(C);
}
+ std::sort(Initializers.begin() + 2, Initializers.end(),
+ [](const Constant *A, const Constant *B) {
+ return cast<ConstantInt>(A)->getLimitedValue() <
+ cast<ConstantInt>(B)->getLimitedValue();
+ });
ArrayType *ArrayOfInt64Ty = ArrayType::get(Int64Ty, Initializers.size());
GlobalVariable *GV = new GlobalVariable(
*CurModule, ArrayOfInt64Ty, false, GlobalVariable::InternalLinkage,
@@ -487,6 +617,35 @@ void SanitizerCoverageModule::InjectTraceForSwitch(
}
}
+void SanitizerCoverageModule::InjectTraceForDiv(
+ Function &, ArrayRef<BinaryOperator *> DivTraceTargets) {
+ for (auto BO : DivTraceTargets) {
+ IRBuilder<> IRB(BO);
+ Value *A1 = BO->getOperand(1);
+ if (isa<ConstantInt>(A1)) continue;
+ if (!A1->getType()->isIntegerTy())
+ continue;
+ uint64_t TypeSize = DL->getTypeStoreSizeInBits(A1->getType());
+ int CallbackIdx = TypeSize == 32 ? 0 :
+ TypeSize == 64 ? 1 : -1;
+ if (CallbackIdx < 0) continue;
+ auto Ty = Type::getIntNTy(*C, TypeSize);
+ IRB.CreateCall(SanCovTraceDivFunction[CallbackIdx],
+ {IRB.CreateIntCast(A1, Ty, true)});
+ }
+}
+
+void SanitizerCoverageModule::InjectTraceForGep(
+ Function &, ArrayRef<GetElementPtrInst *> GepTraceTargets) {
+ for (auto GEP : GepTraceTargets) {
+ IRBuilder<> IRB(GEP);
+ for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I)
+ if (!isa<ConstantInt>(*I) && (*I)->getType()->isIntegerTy())
+ IRB.CreateCall(SanCovTraceGepFunction,
+ {IRB.CreateIntCast(*I, IntptrTy, true)});
+ }
+}
+
void SanitizerCoverageModule::InjectTraceForCmp(
Function &, ArrayRef<Instruction *> CmpTraceTargets) {
for (auto I : CmpTraceTargets) {
@@ -497,12 +656,16 @@ void SanitizerCoverageModule::InjectTraceForCmp(
if (!A0->getType()->isIntegerTy())
continue;
uint64_t TypeSize = DL->getTypeStoreSizeInBits(A0->getType());
+ int CallbackIdx = TypeSize == 8 ? 0 :
+ TypeSize == 16 ? 1 :
+ TypeSize == 32 ? 2 :
+ TypeSize == 64 ? 3 : -1;
+ if (CallbackIdx < 0) continue;
// __sanitizer_cov_trace_cmp((type_size << 32) | predicate, A0, A1);
+ auto Ty = Type::getIntNTy(*C, TypeSize);
IRB.CreateCall(
- SanCovTraceCmpFunction,
- {ConstantInt::get(Int64Ty, (TypeSize << 32) | ICMP->getPredicate()),
- IRB.CreateIntCast(A0, Int64Ty, true),
- IRB.CreateIntCast(A1, Int64Ty, true)});
+ SanCovTraceCmpFunction[CallbackIdx],
+ {IRB.CreateIntCast(A0, Ty, true), IRB.CreateIntCast(A1, Ty, true)});
}
}
}
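
The single generic __sanitizer_cov_trace_cmp hook is replaced by width-specific callbacks, and non-constant div/gep operands gain their own hooks; the instrumentation above picks the callback by operand width and passes the operands (or the index) directly. Based on the getOrInsertFunction calls earlier in this file, the expected runtime entry points are roughly:

    // Callback shapes implied by the declarations in runOnModule (sketch).
    #include <cstdint>
    extern "C" void __sanitizer_cov_trace_cmp1(uint8_t  A, uint8_t  B);
    extern "C" void __sanitizer_cov_trace_cmp2(uint16_t A, uint16_t B);
    extern "C" void __sanitizer_cov_trace_cmp4(uint32_t A, uint32_t B);
    extern "C" void __sanitizer_cov_trace_cmp8(uint64_t A, uint64_t B);
    extern "C" void __sanitizer_cov_trace_div4(uint32_t Divisor);
    extern "C" void __sanitizer_cov_trace_div8(uint64_t Divisor);
    extern "C" void __sanitizer_cov_trace_gep(uintptr_t Idx);
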
@@ -513,16 +676,8 @@ void SanitizerCoverageModule::SetNoSanitizeMetadata(Instruction *I) {
}
void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
- bool UseCalls) {
- // Don't insert coverage for unreachable blocks: we will never call
- // __sanitizer_cov() for them, so counting them in
- // NumberOfInstrumentedBlocks() might complicate calculation of code coverage
- // percentage. Also, unreachable instructions frequently have no debug
- // locations.
- if (isa<UnreachableInst>(BB.getTerminator()))
- return;
+ size_t Idx, bool UseCalls) {
BasicBlock::iterator IP = BB.getFirstInsertionPt();
-
bool IsEntryBB = &BB == &F.getEntryBlock();
DebugLoc EntryLoc;
if (IsEntryBB) {
@@ -538,32 +693,52 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
IRBuilder<> IRB(&*IP);
IRB.SetCurrentDebugLocation(EntryLoc);
- Value *GuardP = IRB.CreateAdd(
- IRB.CreatePointerCast(GuardArray, IntptrTy),
- ConstantInt::get(IntptrTy, (1 + NumberOfInstrumentedBlocks()) * 4));
- Type *Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty());
- GuardP = IRB.CreateIntToPtr(GuardP, Int32PtrTy);
if (Options.TracePC) {
IRB.CreateCall(SanCovTracePC); // gets the PC using GET_CALLER_PC.
IRB.CreateCall(EmptyAsm, {}); // Avoids callback merge.
- } else if (Options.TraceBB) {
- IRB.CreateCall(IsEntryBB ? SanCovTraceEnter : SanCovTraceBB, GuardP);
- } else if (UseCalls) {
- IRB.CreateCall(SanCovWithCheckFunction, GuardP);
- } else {
- LoadInst *Load = IRB.CreateLoad(GuardP);
- Load->setAtomic(AtomicOrdering::Monotonic);
- Load->setAlignment(4);
- SetNoSanitizeMetadata(Load);
- Value *Cmp =
- IRB.CreateICmpSGE(Constant::getNullValue(Load->getType()), Load);
- Instruction *Ins = SplitBlockAndInsertIfThen(
- Cmp, &*IP, false, MDBuilder(*C).createBranchWeights(1, 100000));
- IRB.SetInsertPoint(Ins);
- IRB.SetCurrentDebugLocation(EntryLoc);
- // __sanitizer_cov gets the PC of the instruction using GET_CALLER_PC.
- IRB.CreateCall(SanCovFunction, GuardP);
+ } else if (Options.TracePCGuard) {
+ auto GuardPtr = IRB.CreateIntToPtr(
+ IRB.CreateAdd(IRB.CreatePointerCast(FunctionGuardArray, IntptrTy),
+ ConstantInt::get(IntptrTy, Idx * 4)),
+ Int32PtrTy);
+ if (!UseCalls) {
+ auto GuardLoad = IRB.CreateLoad(GuardPtr);
+ GuardLoad->setAtomic(AtomicOrdering::Monotonic);
+ GuardLoad->setAlignment(8);
+ SetNoSanitizeMetadata(GuardLoad); // Don't instrument with e.g. asan.
+ auto Cmp = IRB.CreateICmpNE(
+ GuardLoad, Constant::getNullValue(GuardLoad->getType()));
+ auto Ins = SplitBlockAndInsertIfThen(
+ Cmp, &*IP, false, MDBuilder(*C).createBranchWeights(1, 100000));
+ IRB.SetInsertPoint(Ins);
+ IRB.SetCurrentDebugLocation(EntryLoc);
+ }
+ IRB.CreateCall(SanCovTracePCGuard, GuardPtr);
IRB.CreateCall(EmptyAsm, {}); // Avoids callback merge.
+ } else {
+ Value *GuardP = IRB.CreateAdd(
+ IRB.CreatePointerCast(GuardArray, IntptrTy),
+ ConstantInt::get(IntptrTy, (1 + NumberOfInstrumentedBlocks()) * 4));
+ GuardP = IRB.CreateIntToPtr(GuardP, Int32PtrTy);
+ if (Options.TraceBB) {
+ IRB.CreateCall(IsEntryBB ? SanCovTraceEnter : SanCovTraceBB, GuardP);
+ } else if (UseCalls) {
+ IRB.CreateCall(SanCovWithCheckFunction, GuardP);
+ } else {
+ LoadInst *Load = IRB.CreateLoad(GuardP);
+ Load->setAtomic(AtomicOrdering::Monotonic);
+ Load->setAlignment(4);
+ SetNoSanitizeMetadata(Load);
+ Value *Cmp =
+ IRB.CreateICmpSGE(Constant::getNullValue(Load->getType()), Load);
+ Instruction *Ins = SplitBlockAndInsertIfThen(
+ Cmp, &*IP, false, MDBuilder(*C).createBranchWeights(1, 100000));
+ IRB.SetInsertPoint(Ins);
+ IRB.SetCurrentDebugLocation(EntryLoc);
+ // __sanitizer_cov gets the PC of the instruction using GET_CALLER_PC.
+ IRB.CreateCall(SanCovFunction, GuardP);
+ IRB.CreateCall(EmptyAsm, {}); // Avoids callback merge.
+ }
}
if (Options.Use8bitCounters) {
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index 41041c7..52035c7 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -43,6 +43,7 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/EscapeEnumerator.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
@@ -56,6 +57,10 @@ static cl::opt<bool> ClInstrumentMemoryAccesses(
static cl::opt<bool> ClInstrumentFuncEntryExit(
"tsan-instrument-func-entry-exit", cl::init(true),
cl::desc("Instrument function entry and exit"), cl::Hidden);
+static cl::opt<bool> ClHandleCxxExceptions(
+ "tsan-handle-cxx-exceptions", cl::init(true),
+ cl::desc("Handle C++ exceptions (insert cleanup blocks for unwinding)"),
+ cl::Hidden);
static cl::opt<bool> ClInstrumentAtomics(
"tsan-instrument-atomics", cl::init(true),
cl::desc("Instrument atomics"), cl::Hidden);
@@ -83,7 +88,7 @@ namespace {
/// ThreadSanitizer: instrument the code in module to find races.
struct ThreadSanitizer : public FunctionPass {
ThreadSanitizer() : FunctionPass(ID) {}
- const char *getPassName() const override;
+ StringRef getPassName() const override;
void getAnalysisUsage(AnalysisUsage &AU) const override;
bool runOnFunction(Function &F) override;
bool doInitialization(Module &M) override;
@@ -99,12 +104,15 @@ struct ThreadSanitizer : public FunctionPass {
const DataLayout &DL);
bool addrPointsToConstantData(Value *Addr);
int getMemoryAccessFuncIndex(Value *Addr, const DataLayout &DL);
+ void InsertRuntimeIgnores(Function &F);
Type *IntptrTy;
IntegerType *OrdTy;
// Callbacks to run-time library are computed in doInitialization.
Function *TsanFuncEntry;
Function *TsanFuncExit;
+ Function *TsanIgnoreBegin;
+ Function *TsanIgnoreEnd;
// Accesses sizes are powers of two: 1, 2, 4, 8, 16.
static const size_t kNumberOfAccessSizes = 5;
Function *TsanRead[kNumberOfAccessSizes];
@@ -135,9 +143,7 @@ INITIALIZE_PASS_END(
"ThreadSanitizer: detects data races.",
false, false)
-const char *ThreadSanitizer::getPassName() const {
- return "ThreadSanitizer";
-}
+StringRef ThreadSanitizer::getPassName() const { return "ThreadSanitizer"; }
void ThreadSanitizer::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<TargetLibraryInfoWrapperPass>();
@@ -149,11 +155,17 @@ FunctionPass *llvm::createThreadSanitizerPass() {
void ThreadSanitizer::initializeCallbacks(Module &M) {
IRBuilder<> IRB(M.getContext());
+ AttributeSet Attr;
+ Attr = Attr.addAttribute(M.getContext(), AttributeSet::FunctionIndex, Attribute::NoUnwind);
// Initialize the callbacks.
TsanFuncEntry = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- "__tsan_func_entry", IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+ "__tsan_func_entry", Attr, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
TsanFuncExit = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction("__tsan_func_exit", IRB.getVoidTy(), nullptr));
+ M.getOrInsertFunction("__tsan_func_exit", Attr, IRB.getVoidTy(), nullptr));
+ TsanIgnoreBegin = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ "__tsan_ignore_thread_begin", Attr, IRB.getVoidTy(), nullptr));
+ TsanIgnoreEnd = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ "__tsan_ignore_thread_end", Attr, IRB.getVoidTy(), nullptr));
OrdTy = IRB.getInt32Ty();
for (size_t i = 0; i < kNumberOfAccessSizes; ++i) {
const unsigned ByteSize = 1U << i;
@@ -162,31 +174,31 @@ void ThreadSanitizer::initializeCallbacks(Module &M) {
std::string BitSizeStr = utostr(BitSize);
SmallString<32> ReadName("__tsan_read" + ByteSizeStr);
TsanRead[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- ReadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+ ReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
SmallString<32> WriteName("__tsan_write" + ByteSizeStr);
TsanWrite[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- WriteName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+ WriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
SmallString<64> UnalignedReadName("__tsan_unaligned_read" + ByteSizeStr);
TsanUnalignedRead[i] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- UnalignedReadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+ UnalignedReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
SmallString<64> UnalignedWriteName("__tsan_unaligned_write" + ByteSizeStr);
TsanUnalignedWrite[i] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- UnalignedWriteName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+ UnalignedWriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
Type *Ty = Type::getIntNTy(M.getContext(), BitSize);
Type *PtrTy = Ty->getPointerTo();
SmallString<32> AtomicLoadName("__tsan_atomic" + BitSizeStr + "_load");
TsanAtomicLoad[i] = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction(AtomicLoadName, Ty, PtrTy, OrdTy, nullptr));
+ M.getOrInsertFunction(AtomicLoadName, Attr, Ty, PtrTy, OrdTy, nullptr));
SmallString<32> AtomicStoreName("__tsan_atomic" + BitSizeStr + "_store");
TsanAtomicStore[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- AtomicStoreName, IRB.getVoidTy(), PtrTy, Ty, OrdTy, nullptr));
+ AtomicStoreName, Attr, IRB.getVoidTy(), PtrTy, Ty, OrdTy, nullptr));
for (int op = AtomicRMWInst::FIRST_BINOP;
op <= AtomicRMWInst::LAST_BINOP; ++op) {
@@ -210,32 +222,32 @@ void ThreadSanitizer::initializeCallbacks(Module &M) {
continue;
SmallString<32> RMWName("__tsan_atomic" + itostr(BitSize) + NamePart);
TsanAtomicRMW[op][i] = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction(RMWName, Ty, PtrTy, Ty, OrdTy, nullptr));
+ M.getOrInsertFunction(RMWName, Attr, Ty, PtrTy, Ty, OrdTy, nullptr));
}
SmallString<32> AtomicCASName("__tsan_atomic" + BitSizeStr +
"_compare_exchange_val");
TsanAtomicCAS[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- AtomicCASName, Ty, PtrTy, Ty, Ty, OrdTy, OrdTy, nullptr));
+ AtomicCASName, Attr, Ty, PtrTy, Ty, Ty, OrdTy, OrdTy, nullptr));
}
TsanVptrUpdate = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction("__tsan_vptr_update", IRB.getVoidTy(),
+ M.getOrInsertFunction("__tsan_vptr_update", Attr, IRB.getVoidTy(),
IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), nullptr));
TsanVptrLoad = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- "__tsan_vptr_read", IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+ "__tsan_vptr_read", Attr, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
TsanAtomicThreadFence = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- "__tsan_atomic_thread_fence", IRB.getVoidTy(), OrdTy, nullptr));
+ "__tsan_atomic_thread_fence", Attr, IRB.getVoidTy(), OrdTy, nullptr));
TsanAtomicSignalFence = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- "__tsan_atomic_signal_fence", IRB.getVoidTy(), OrdTy, nullptr));
+ "__tsan_atomic_signal_fence", Attr, IRB.getVoidTy(), OrdTy, nullptr));
MemmoveFn = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction("memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ M.getOrInsertFunction("memmove", Attr, IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
IRB.getInt8PtrTy(), IntptrTy, nullptr));
MemcpyFn = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction("memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ M.getOrInsertFunction("memcpy", Attr, IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
IRB.getInt8PtrTy(), IntptrTy, nullptr));
MemsetFn = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction("memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ M.getOrInsertFunction("memset", Attr, IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
IRB.getInt32Ty(), IntptrTy, nullptr));
}
@@ -378,13 +390,21 @@ static bool isAtomic(Instruction *I) {
return false;
}
+void ThreadSanitizer::InsertRuntimeIgnores(Function &F) {
+ IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
+ IRB.CreateCall(TsanIgnoreBegin);
+ EscapeEnumerator EE(F, "tsan_ignore_cleanup", ClHandleCxxExceptions);
+ while (IRBuilder<> *AtExit = EE.Next()) {
+ AtExit->CreateCall(TsanIgnoreEnd);
+ }
+}
+
bool ThreadSanitizer::runOnFunction(Function &F) {
// This is required to prevent instrumenting call to __tsan_init from within
// the module constructor.
if (&F == TsanCtorFunction)
return false;
initializeCallbacks(*F.getParent());
- SmallVector<Instruction*, 8> RetVec;
SmallVector<Instruction*, 8> AllLoadsAndStores;
SmallVector<Instruction*, 8> LocalLoadsAndStores;
SmallVector<Instruction*, 8> AtomicAccesses;
@@ -403,8 +423,6 @@ bool ThreadSanitizer::runOnFunction(Function &F) {
AtomicAccesses.push_back(&Inst);
else if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst))
LocalLoadsAndStores.push_back(&Inst);
- else if (isa<ReturnInst>(Inst))
- RetVec.push_back(&Inst);
else if (isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) {
if (CallInst *CI = dyn_cast<CallInst>(&Inst))
maybeMarkSanitizerLibraryCallNoBuiltin(CI, TLI);
@@ -440,6 +458,12 @@ bool ThreadSanitizer::runOnFunction(Function &F) {
Res |= instrumentMemIntrinsic(Inst);
}
+ if (F.hasFnAttribute("sanitize_thread_no_checking_at_run_time")) {
+ assert(!F.hasFnAttribute(Attribute::SanitizeThread));
+ if (HasCalls)
+ InsertRuntimeIgnores(F);
+ }
+
// Instrument function entry/exit points if there were instrumented accesses.
if ((Res || HasCalls) && ClInstrumentFuncEntryExit) {
IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
@@ -447,9 +471,10 @@ bool ThreadSanitizer::runOnFunction(Function &F) {
Intrinsic::getDeclaration(F.getParent(), Intrinsic::returnaddress),
IRB.getInt32(0));
IRB.CreateCall(TsanFuncEntry, ReturnAddress);
- for (auto RetInst : RetVec) {
- IRBuilder<> IRBRet(RetInst);
- IRBRet.CreateCall(TsanFuncExit, {});
+
+ EscapeEnumerator EE(F, "tsan_cleanup", ClHandleCxxExceptions);
+ while (IRBuilder<> *AtExit = EE.Next()) {
+ AtExit->CreateCall(TsanFuncExit, {});
}
Res = true;
}
@@ -463,6 +488,13 @@ bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I,
Value *Addr = IsWrite
? cast<StoreInst>(I)->getPointerOperand()
: cast<LoadInst>(I)->getPointerOperand();
+
+ // swifterror memory addresses are mem2reg promoted by instruction selection.
+ // As such they cannot have regular uses (such as being passed to an
+ // instrumentation function), and it makes no sense to track them as memory.
+ if (Addr->isSwiftError())
+ return false;
+
int Idx = getMemoryAccessFuncIndex(Addr, DL);
if (Idx < 0)
return false;
@@ -511,7 +543,7 @@ static ConstantInt *createOrdering(IRBuilder<> *IRB, AtomicOrdering ord) {
switch (ord) {
case AtomicOrdering::NotAtomic:
llvm_unreachable("unexpected atomic ordering!");
- case AtomicOrdering::Unordered: // Fall-through.
+ case AtomicOrdering::Unordered: LLVM_FALLTHROUGH;
case AtomicOrdering::Monotonic: v = 0; break;
// Not specified yet:
// case AtomicOrdering::Consume: v = 1; break;
@@ -551,11 +583,6 @@ bool ThreadSanitizer::instrumentMemIntrinsic(Instruction *I) {
return false;
}
-static Value *createIntOrPtrToIntCast(Value *V, Type* Ty, IRBuilder<> &IRB) {
- return isa<PointerType>(V->getType()) ?
- IRB.CreatePtrToInt(V, Ty) : IRB.CreateIntCast(V, Ty, false);
-}
-
// Both llvm and ThreadSanitizer atomic operations are based on C++11/C1x
// standards. For background see C++11 standard. A slightly older, publicly
// available draft of the standard (not entirely up-to-date, but close enough
@@ -578,15 +605,9 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) {
Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
createOrdering(&IRB, LI->getOrdering())};
Type *OrigTy = cast<PointerType>(Addr->getType())->getElementType();
- if (Ty == OrigTy) {
- Instruction *C = CallInst::Create(TsanAtomicLoad[Idx], Args);
- ReplaceInstWithInst(I, C);
- } else {
- // We are loading a pointer, so we need to cast the return value.
- Value *C = IRB.CreateCall(TsanAtomicLoad[Idx], Args);
- Instruction *Cast = CastInst::Create(Instruction::IntToPtr, C, OrigTy);
- ReplaceInstWithInst(I, Cast);
- }
+ Value *C = IRB.CreateCall(TsanAtomicLoad[Idx], Args);
+ Value *Cast = IRB.CreateBitOrPointerCast(C, OrigTy);
+ I->replaceAllUsesWith(Cast);
} else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
Value *Addr = SI->getPointerOperand();
int Idx = getMemoryAccessFuncIndex(Addr, DL);
@@ -597,7 +618,7 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) {
Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
Type *PtrTy = Ty->getPointerTo();
Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
- createIntOrPtrToIntCast(SI->getValueOperand(), Ty, IRB),
+ IRB.CreateBitOrPointerCast(SI->getValueOperand(), Ty),
createOrdering(&IRB, SI->getOrdering())};
CallInst *C = CallInst::Create(TsanAtomicStore[Idx], Args);
ReplaceInstWithInst(I, C);
@@ -628,9 +649,9 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) {
Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
Type *PtrTy = Ty->getPointerTo();
Value *CmpOperand =
- createIntOrPtrToIntCast(CASI->getCompareOperand(), Ty, IRB);
+ IRB.CreateBitOrPointerCast(CASI->getCompareOperand(), Ty);
Value *NewOperand =
- createIntOrPtrToIntCast(CASI->getNewValOperand(), Ty, IRB);
+ IRB.CreateBitOrPointerCast(CASI->getNewValOperand(), Ty);
Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
CmpOperand,
NewOperand,
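The removed createIntOrPtrToIntCast helper chose between ptrtoint and an integer cast by hand; the hunks above replace it with IRBuilder::CreateBitOrPointerCast, which also covers the reverse direction used when rewriting atomic loads. A small sketch of the substitution, under the assumption (which holds here, since Ty is picked from the access size) that when both types are integers they already have the same width; the wrapper name is hypothetical:

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Returns V reinterpreted as the integer type Ty. Assumes V is either a
    // pointer or an integer whose width already matches Ty (true for the
    // access-sized values TSan passes to its runtime callbacks).
    static Value *castToRuntimeInt(IRBuilder<> &IRB, Value *V, Type *Ty) {
      // Old helper, spelled out:
      //   isa<PointerType>(V->getType()) ? IRB.CreatePtrToInt(V, Ty)
      //                                  : IRB.CreateIntCast(V, Ty, false);
      // New form: IRBuilder picks ptrtoint / inttoptr / bitcast / no-op.
      return IRB.CreateBitOrPointerCast(V, Ty);
    }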
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h b/contrib/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
index d4fef10..c748272 100644
--- a/contrib/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
@@ -121,8 +121,7 @@ private:
/// Declaration for objc_retainAutoreleaseReturnValue().
Constant *RetainAutoreleaseRV;
- Constant *getVoidRetI8XEntryPoint(Constant *&Decl,
- const char *Name) {
+ Constant *getVoidRetI8XEntryPoint(Constant *&Decl, StringRef Name) {
if (Decl)
return Decl;
@@ -136,8 +135,7 @@ private:
return Decl = TheModule->getOrInsertFunction(Name, Fty, Attr);
}
- Constant *getI8XRetI8XEntryPoint(Constant *& Decl,
- const char *Name,
+ Constant *getI8XRetI8XEntryPoint(Constant *&Decl, StringRef Name,
bool NoUnwind = false) {
if (Decl)
return Decl;
@@ -155,8 +153,7 @@ private:
return Decl = TheModule->getOrInsertFunction(Name, Fty, Attr);
}
- Constant *getI8XRetI8XXI8XEntryPoint(Constant *&Decl,
- const char *Name) {
+ Constant *getI8XRetI8XXI8XEntryPoint(Constant *&Decl, StringRef Name) {
if (Decl)
return Decl;
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp
index 11e2d03e..23c1f59 100644
--- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp
@@ -423,7 +423,7 @@ bool ObjCARCContract::tryToPeepholeInstruction(
if (!optimizeRetainCall(F, Inst))
return false;
// If we succeed in our optimization, fall through.
- // FALLTHROUGH
+ LLVM_FALLTHROUGH;
case ARCInstKind::RetainRV:
case ARCInstKind::ClaimRV: {
// If we're compiling for a target which needs a special inline-asm
@@ -547,13 +547,13 @@ bool ObjCARCContract::runOnFunction(Function &F) {
// Don't use GetArgRCIdentityRoot because we don't want to look through bitcasts
// and such; to do the replacement, the argument must have type i8*.
- Value *Arg = cast<CallInst>(Inst)->getArgOperand(0);
- // TODO: Change this to a do-while.
- for (;;) {
+ // Function for replacing uses of Arg dominated by Inst.
+ auto ReplaceArgUses = [Inst, this](Value *Arg) {
// If we're compiling bugpointed code, don't get in trouble.
if (!isa<Instruction>(Arg) && !isa<Argument>(Arg))
- break;
+ return;
+
// Look through the uses of the pointer.
for (Value::use_iterator UI = Arg->use_begin(), UE = Arg->use_end();
UI != UE; ) {
@@ -598,6 +598,15 @@ bool ObjCARCContract::runOnFunction(Function &F) {
}
}
}
+ };
+
+ Value *Arg = cast<CallInst>(Inst)->getArgOperand(0);
+ Value *OrigArg = Arg;
+
+ // TODO: Change this to a do-while.
+ for (;;) {
+ ReplaceArgUses(Arg);
// If Arg is a no-op casted pointer, strip one level of casts and iterate.
if (const BitCastInst *BI = dyn_cast<BitCastInst>(Arg))
@@ -611,6 +620,24 @@ bool ObjCARCContract::runOnFunction(Function &F) {
else
break;
}
+
+ // Replace bitcast users of Arg that are dominated by Inst.
+ SmallVector<BitCastInst *, 2> BitCastUsers;
+
+ // Add all bitcast users of the function argument first.
+ for (User *U : OrigArg->users())
+ if (auto *BC = dyn_cast<BitCastInst>(U))
+ BitCastUsers.push_back(BC);
+
+ // Replace the bitcasts with the call return. Iterate until list is empty.
+ while (!BitCastUsers.empty()) {
+ auto *BC = BitCastUsers.pop_back_val();
+ for (User *U : BC->users())
+ if (auto *B = dyn_cast<BitCastInst>(U))
+ BitCastUsers.push_back(B);
+
+ ReplaceArgUses(BC);
+ }
}
// If this function has no escaping allocas or suspicious vararg usage,
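The hunks above pull the use-rewriting loop into a ReplaceArgUses lambda and then also apply it to every bitcast user of the original argument, walking bitcasts of bitcasts with a small worklist. A standalone sketch of that transitive walk (the visitor and function name are illustrative):

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Instructions.h"
    #include <functional>
    using namespace llvm;

    // Apply Visit to every BitCastInst reachable from Root through chains of
    // bitcast users. Each bitcast has a single operand, so it is discovered
    // and pushed at most once; no visited set is needed.
    static void visitTransitiveBitCastUsers(
        Value *Root, const std::function<void(BitCastInst *)> &Visit) {
      SmallVector<BitCastInst *, 4> Worklist;
      for (User *U : Root->users())
        if (auto *BC = dyn_cast<BitCastInst>(U))
          Worklist.push_back(BC);
      while (!Worklist.empty()) {
        BitCastInst *BC = Worklist.pop_back_val();
        for (User *U : BC->users())
          if (auto *Inner = dyn_cast<BitCastInst>(U))
            Worklist.push_back(Inner);
        Visit(BC);
      }
    }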
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
index a6907b5..136d54a 100644
--- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
@@ -53,6 +53,11 @@ using namespace llvm::objcarc;
/// \brief This is similar to GetRCIdentityRoot but it stops as soon
/// as it finds a value with multiple uses.
static const Value *FindSingleUseIdentifiedObject(const Value *Arg) {
+ // ConstantData (like ConstantPointerNull and UndefValue) is used across
+ // modules. It's never a single-use value.
+ if (isa<ConstantData>(Arg))
+ return nullptr;
+
if (Arg->hasOneUse()) {
if (const BitCastInst *BC = dyn_cast<BitCastInst>(Arg))
return FindSingleUseIdentifiedObject(BC->getOperand(0));
@@ -644,6 +649,12 @@ void ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F,
ARCInstKind &Class) {
// Check for a return of the pointer value.
const Value *Ptr = GetArgRCIdentityRoot(AutoreleaseRV);
+
+ // If the argument is ConstantPointerNull or UndefValue, its other users
+ // aren't actually interesting to look at.
+ if (isa<ConstantData>(Ptr))
+ return;
+
SmallVector<const Value *, 2> Users;
Users.push_back(Ptr);
do {
@@ -2075,12 +2086,11 @@ void ObjCARCOpt::OptimizeReturns(Function &F) {
SmallPtrSet<const BasicBlock *, 4> Visited;
for (BasicBlock &BB: F) {
ReturnInst *Ret = dyn_cast<ReturnInst>(&BB.back());
-
- DEBUG(dbgs() << "Visiting: " << *Ret << "\n");
-
if (!Ret)
continue;
+ DEBUG(dbgs() << "Visiting: " << *Ret << "\n");
+
const Value *Arg = GetRCIdentityRoot(Ret->getOperand(0));
// Look for an ``autorelease'' instruction that is a predecessor of Ret and
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp
index df64fa3..a5afc8a 100644
--- a/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp
+++ b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp
@@ -201,7 +201,7 @@ bool BottomUpPtrState::MatchWithRetain() {
// imprecise release, clear our reverse insertion points.
if (OldSeq != S_Use || IsTrackingImpreciseReleases())
ClearReverseInsertPts();
- // FALL THROUGH
+ LLVM_FALLTHROUGH;
case S_CanRelease:
return true;
case S_None:
@@ -332,7 +332,7 @@ bool TopDownPtrState::MatchWithRelease(ARCMDKindCache &Cache,
case S_CanRelease:
if (OldSeq == S_Retain || ReleaseMetadata != nullptr)
ClearReverseInsertPts();
- // FALL THROUGH
+ LLVM_FALLTHROUGH;
case S_Use:
SetReleaseMetadata(ReleaseMetadata);
SetTailCallRelease(cast<CallInst>(Release)->isTailCall());
diff --git a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
index 0eed024..adc903c 100644
--- a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
@@ -15,14 +15,19 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/ADCE.h"
+
#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/IteratedDominanceFrontier.h"
+#include "llvm/Analysis/PostDominators.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -34,9 +39,372 @@ using namespace llvm;
#define DEBUG_TYPE "adce"
STATISTIC(NumRemoved, "Number of instructions removed");
+STATISTIC(NumBranchesRemoved, "Number of branch instructions removed");
+
+// This is a temporary option until we change the interface
+// to this pass based on optimization level.
+static cl::opt<bool> RemoveControlFlowFlag("adce-remove-control-flow",
+ cl::init(true), cl::Hidden);
+
+// This option enables removing of may-be-infinite loops which have no other
+// effect.
+static cl::opt<bool> RemoveLoops("adce-remove-loops", cl::init(false),
+ cl::Hidden);
+
+namespace {
+/// Information about Instructions
+struct InstInfoType {
+ /// True if the associated instruction is live.
+ bool Live = false;
+ /// Quick access to information for block containing associated Instruction.
+ struct BlockInfoType *Block = nullptr;
+};
+
+/// Information about basic blocks relevant to dead code elimination.
+struct BlockInfoType {
+ /// True when this block contains live instructions.
+ bool Live = false;
+ /// True when this block ends in an unconditional branch.
+ bool UnconditionalBranch = false;
+ /// True when this block is known to have live PHI nodes.
+ bool HasLivePhiNodes = false;
+ /// Control dependence sources need to be live for this block.
+ bool CFLive = false;
+
+ /// Quick access to the LiveInfo for the terminator,
+ /// holds the value &InstInfo[Terminator]
+ InstInfoType *TerminatorLiveInfo = nullptr;
+
+ bool terminatorIsLive() const { return TerminatorLiveInfo->Live; }
+
+ /// Corresponding BasicBlock.
+ BasicBlock *BB = nullptr;
+
+ /// Cache of BB->getTerminator().
+ TerminatorInst *Terminator = nullptr;
+
+ /// Post-order numbering of reverse control flow graph.
+ unsigned PostOrder;
+};
+
+class AggressiveDeadCodeElimination {
+ Function &F;
+ PostDominatorTree &PDT;
+
+ /// Mapping of blocks to associated information.
+ DenseMap<BasicBlock *, BlockInfoType> BlockInfo;
+ bool isLive(BasicBlock *BB) { return BlockInfo[BB].Live; }
+
+ /// Mapping of instructions to associated information.
+ DenseMap<Instruction *, InstInfoType> InstInfo;
+ bool isLive(Instruction *I) { return InstInfo[I].Live; }
+
+ /// Instructions known to be live where we need to mark
+ /// reaching definitions as live.
+ SmallVector<Instruction *, 128> Worklist;
+ /// Debug info scopes around a live instruction.
+ SmallPtrSet<const Metadata *, 32> AliveScopes;
+
+ /// Set of blocks not known to have live terminators.
+ SmallPtrSet<BasicBlock *, 16> BlocksWithDeadTerminators;
+
+ /// The set of blocks for which we have determined that the control
+ /// dependence sources must be live and which have not yet had
+ /// those dependences analyzed.
+ SmallPtrSet<BasicBlock *, 16> NewLiveBlocks;
+
+ /// Set up auxiliary data structures for Instructions and BasicBlocks and
+ /// initialize the Worklist to the set of must-be-live Instructions.
+ void initialize();
+ /// Return true for operations which are always treated as live.
+ bool isAlwaysLive(Instruction &I);
+ /// Return true for instrumentation instructions for value profiling.
+ bool isInstrumentsConstant(Instruction &I);
+
+ /// Propagate liveness to reaching definitions.
+ void markLiveInstructions();
+ /// Mark an instruction as live.
+ void markLive(Instruction *I);
+ /// Mark a block as live.
+ void markLive(BlockInfoType &BB);
+ void markLive(BasicBlock *BB) { markLive(BlockInfo[BB]); }
+
+ /// Mark terminators of control predecessors of a PHI node live.
+ void markPhiLive(PHINode *PN);
+
+ /// Record the Debug Scopes which surround live debug information.
+ void collectLiveScopes(const DILocalScope &LS);
+ void collectLiveScopes(const DILocation &DL);
+
+ /// Analyze dead branches to find those whose branches are the sources
+ /// of control dependences impacting a live block. Those branches are
+ /// marked live.
+ void markLiveBranchesFromControlDependences();
+
+ /// Remove instructions not marked live, return true if any instruction
+ /// was removed.
+ bool removeDeadInstructions();
+
+ /// Identify connected sections of the control flow graph which have
+ /// dead terminators and rewrite the control flow graph to remove them.
+ void updateDeadRegions();
+
+ /// Set the BlockInfo::PostOrder field based on a post-order
+ /// numbering of the reverse control flow graph.
+ void computeReversePostOrder();
+
+ /// Make the terminator of this block an unconditional branch to \p Target.
+ void makeUnconditional(BasicBlock *BB, BasicBlock *Target);
+
+public:
+ AggressiveDeadCodeElimination(Function &F, PostDominatorTree &PDT)
+ : F(F), PDT(PDT) {}
+ bool performDeadCodeElimination();
+};
+}
+
+bool AggressiveDeadCodeElimination::performDeadCodeElimination() {
+ initialize();
+ markLiveInstructions();
+ return removeDeadInstructions();
+}
+
+static bool isUnconditionalBranch(TerminatorInst *Term) {
+ auto *BR = dyn_cast<BranchInst>(Term);
+ return BR && BR->isUnconditional();
+}
+
+void AggressiveDeadCodeElimination::initialize() {
+
+ auto NumBlocks = F.size();
+
+ // We will have an entry in the map for each block, so we reserve that
+ // many entries up front to keep the load factor low in the hash table.
+ BlockInfo.reserve(NumBlocks);
+ size_t NumInsts = 0;
+
+ // Iterate over blocks and initialize BlockInfo entries, count
+ // instructions to size the InstInfo hash table.
+ for (auto &BB : F) {
+ NumInsts += BB.size();
+ auto &Info = BlockInfo[&BB];
+ Info.BB = &BB;
+ Info.Terminator = BB.getTerminator();
+ Info.UnconditionalBranch = isUnconditionalBranch(Info.Terminator);
+ }
+
+ // Initialize instruction map and set pointers to block info.
+ InstInfo.reserve(NumInsts);
+ for (auto &BBInfo : BlockInfo)
+ for (Instruction &I : *BBInfo.second.BB)
+ InstInfo[&I].Block = &BBInfo.second;
+
+ // Since BlockInfo holds pointers into InstInfo and vice-versa, we may not
+ // add any more elements to either after this point.
+ for (auto &BBInfo : BlockInfo)
+ BBInfo.second.TerminatorLiveInfo = &InstInfo[BBInfo.second.Terminator];
+
+ // Collect the set of "root" instructions that are known live.
+ for (Instruction &I : instructions(F))
+ if (isAlwaysLive(I))
+ markLive(&I);
+
+ if (!RemoveControlFlowFlag)
+ return;
+
+ if (!RemoveLoops) {
+ // This stores state for the depth-first iterator. In addition
+ // to recording which nodes have been visited we also record whether
+ // a node is currently on the "stack" of active ancestors of the current
+ // node.
+ typedef DenseMap<BasicBlock *, bool> StatusMap;
+ class DFState : public StatusMap {
+ public:
+ std::pair<StatusMap::iterator, bool> insert(BasicBlock *BB) {
+ return StatusMap::insert(std::make_pair(BB, true));
+ }
+
+ // Invoked after we have visited all children of a node.
+ void completed(BasicBlock *BB) { (*this)[BB] = false; }
+
+ // Return true if \p BB is currently on the active stack
+ // of ancestors.
+ bool onStack(BasicBlock *BB) {
+ auto Iter = find(BB);
+ return Iter != end() && Iter->second;
+ }
+ } State;
+
+ State.reserve(F.size());
+ // Iterate over blocks in depth-first pre-order and
+ // treat all edges to a block already seen as loop back edges
+ // and mark the branch live if there is a back edge.
+ for (auto *BB: depth_first_ext(&F.getEntryBlock(), State)) {
+ TerminatorInst *Term = BB->getTerminator();
+ if (isLive(Term))
+ continue;
+
+ for (auto *Succ : successors(BB))
+ if (State.onStack(Succ)) {
+ // back edge....
+ markLive(Term);
+ break;
+ }
+ }
+ }
+
+ // Mark blocks live if there is no path from the block to the
+ // return of the function or a successor for which this is true.
+ // This protects IDFCalculator which cannot handle such blocks.
+ for (auto &BBInfoPair : BlockInfo) {
+ auto &BBInfo = BBInfoPair.second;
+ if (BBInfo.terminatorIsLive())
+ continue;
+ auto *BB = BBInfo.BB;
+ if (!PDT.getNode(BB)) {
+ DEBUG(dbgs() << "Not post-dominated by return: " << BB->getName()
+ << '\n';);
+ markLive(BBInfo.Terminator);
+ continue;
+ }
+ for (auto *Succ : successors(BB))
+ if (!PDT.getNode(Succ)) {
+ DEBUG(dbgs() << "Successor not post-dominated by return: "
+ << BB->getName() << '\n';);
+ markLive(BBInfo.Terminator);
+ break;
+ }
+ }
+
+ // Treat the entry block as always live
+ auto *BB = &F.getEntryBlock();
+ auto &EntryInfo = BlockInfo[BB];
+ EntryInfo.Live = true;
+ if (EntryInfo.UnconditionalBranch)
+ markLive(EntryInfo.Terminator);
+
+ // Build initial collection of blocks with dead terminators
+ for (auto &BBInfo : BlockInfo)
+ if (!BBInfo.second.terminatorIsLive())
+ BlocksWithDeadTerminators.insert(BBInfo.second.BB);
+}
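The RemoveLoops handling in initialize() finds back edges with depth_first_ext and an external state map that also records whether a block is still on the active DFS stack. The same iterator pattern in isolation, reduced to a single query (the helper name is illustrative):

    #include "llvm/ADT/DenseMap.h"
    #include "llvm/ADT/DepthFirstIterator.h"
    #include "llvm/IR/CFG.h"
    #include "llvm/IR/Function.h"
    #include <utility>
    using namespace llvm;

    // External DFS state: maps a block to "currently on the active DFS stack".
    // depth_first_ext calls insert() when a node is first reached and
    // completed() once all of its children have been visited.
    class DFState : public DenseMap<BasicBlock *, bool> {
    public:
      std::pair<iterator, bool> insert(BasicBlock *BB) {
        return DenseMap<BasicBlock *, bool>::insert(std::make_pair(BB, true));
      }
      void completed(BasicBlock *BB) { (*this)[BB] = false; }
      bool onStack(BasicBlock *BB) {
        auto It = find(BB);
        return It != end() && It->second;
      }
    };

    // Returns true if F contains a CFG cycle reachable from its entry block:
    // any edge to a block still on the DFS stack is a back edge.
    static bool hasBackEdge(Function &F) {
      DFState State;
      for (BasicBlock *BB : depth_first_ext(&F.getEntryBlock(), State))
        for (BasicBlock *Succ : successors(BB))
          if (State.onStack(Succ))
            return true;
      return false;
    }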
+
+bool AggressiveDeadCodeElimination::isAlwaysLive(Instruction &I) {
+ // TODO -- use llvm::isInstructionTriviallyDead
+ if (I.isEHPad() || I.mayHaveSideEffects()) {
+ // Skip any value profile instrumentation calls if they are
+ // instrumenting constants.
+ if (isInstrumentsConstant(I))
+ return false;
+ return true;
+ }
+ if (!isa<TerminatorInst>(I))
+ return false;
+ if (RemoveControlFlowFlag && (isa<BranchInst>(I) || isa<SwitchInst>(I)))
+ return false;
+ return true;
+}
+
+// Check if this instruction is a runtime call for value profiling and
+// if it's instrumenting a constant.
+bool AggressiveDeadCodeElimination::isInstrumentsConstant(Instruction &I) {
+ // TODO -- move this test into llvm::isInstructionTriviallyDead
+ if (CallInst *CI = dyn_cast<CallInst>(&I))
+ if (Function *Callee = CI->getCalledFunction())
+ if (Callee->getName().equals(getInstrProfValueProfFuncName()))
+ if (isa<Constant>(CI->getArgOperand(0)))
+ return true;
+ return false;
+}
+
+void AggressiveDeadCodeElimination::markLiveInstructions() {
+
+ // Propagate liveness backwards to operands.
+ do {
+ // Worklist holds newly discovered live instructions
+ // where we need to mark the inputs as live.
+ while (!Worklist.empty()) {
+ Instruction *LiveInst = Worklist.pop_back_val();
+ DEBUG(dbgs() << "work live: "; LiveInst->dump(););
+
+ for (Use &OI : LiveInst->operands())
+ if (Instruction *Inst = dyn_cast<Instruction>(OI))
+ markLive(Inst);
+
+ if (auto *PN = dyn_cast<PHINode>(LiveInst))
+ markPhiLive(PN);
+ }
-static void collectLiveScopes(const DILocalScope &LS,
- SmallPtrSetImpl<const Metadata *> &AliveScopes) {
+ // After data flow liveness has been identified, examine which branch
+ // decisions are required to ensure that live instructions are executed.
+ markLiveBranchesFromControlDependences();
+
+ } while (!Worklist.empty());
+}
+
+void AggressiveDeadCodeElimination::markLive(Instruction *I) {
+
+ auto &Info = InstInfo[I];
+ if (Info.Live)
+ return;
+
+ DEBUG(dbgs() << "mark live: "; I->dump());
+ Info.Live = true;
+ Worklist.push_back(I);
+
+ // Collect the live debug info scopes attached to this instruction.
+ if (const DILocation *DL = I->getDebugLoc())
+ collectLiveScopes(*DL);
+
+ // Mark the containing block live
+ auto &BBInfo = *Info.Block;
+ if (BBInfo.Terminator == I) {
+ BlocksWithDeadTerminators.erase(BBInfo.BB);
+ // For live terminators, mark destination blocks
+ // live to preserve these control flow edges.
+ if (!BBInfo.UnconditionalBranch)
+ for (auto *BB : successors(I->getParent()))
+ markLive(BB);
+ }
+ markLive(BBInfo);
+}
+
+void AggressiveDeadCodeElimination::markLive(BlockInfoType &BBInfo) {
+ if (BBInfo.Live)
+ return;
+ DEBUG(dbgs() << "mark block live: " << BBInfo.BB->getName() << '\n');
+ BBInfo.Live = true;
+ if (!BBInfo.CFLive) {
+ BBInfo.CFLive = true;
+ NewLiveBlocks.insert(BBInfo.BB);
+ }
+
+ // Mark unconditional branches at the end of live
+ // blocks as live since there is no work to do for them later
+ if (BBInfo.UnconditionalBranch)
+ markLive(BBInfo.Terminator);
+}
+
+void AggressiveDeadCodeElimination::collectLiveScopes(const DILocalScope &LS) {
if (!AliveScopes.insert(&LS).second)
return;
@@ -44,75 +412,115 @@ static void collectLiveScopes(const DILocalScope &LS,
return;
// Tail-recurse through the scope chain.
- collectLiveScopes(cast<DILocalScope>(*LS.getScope()), AliveScopes);
+ collectLiveScopes(cast<DILocalScope>(*LS.getScope()));
}
-static void collectLiveScopes(const DILocation &DL,
- SmallPtrSetImpl<const Metadata *> &AliveScopes) {
+void AggressiveDeadCodeElimination::collectLiveScopes(const DILocation &DL) {
// Even though DILocations are not scopes, shove them into AliveScopes so we
// don't revisit them.
if (!AliveScopes.insert(&DL).second)
return;
// Collect live scopes from the scope chain.
- collectLiveScopes(*DL.getScope(), AliveScopes);
+ collectLiveScopes(*DL.getScope());
// Tail-recurse through the inlined-at chain.
if (const DILocation *IA = DL.getInlinedAt())
- collectLiveScopes(*IA, AliveScopes);
+ collectLiveScopes(*IA);
}
-// Check if this instruction is a runtime call for value profiling and
-// if it's instrumenting a constant.
-static bool isInstrumentsConstant(Instruction &I) {
- if (CallInst *CI = dyn_cast<CallInst>(&I))
- if (Function *Callee = CI->getCalledFunction())
- if (Callee->getName().equals(getInstrProfValueProfFuncName()))
- if (isa<Constant>(CI->getArgOperand(0)))
- return true;
- return false;
+void AggressiveDeadCodeElimination::markPhiLive(PHINode *PN) {
+ auto &Info = BlockInfo[PN->getParent()];
+ // Only need to check this once per block.
+ if (Info.HasLivePhiNodes)
+ return;
+ Info.HasLivePhiNodes = true;
+
+ // If a predecessor block is not live, mark it as control-flow live
+ // which will trigger marking live branches upon which
+ // that block is control dependent.
+ for (auto *PredBB : predecessors(Info.BB)) {
+ auto &Info = BlockInfo[PredBB];
+ if (!Info.CFLive) {
+ Info.CFLive = true;
+ NewLiveBlocks.insert(PredBB);
+ }
+ }
}
-static bool aggressiveDCE(Function& F) {
- SmallPtrSet<Instruction*, 32> Alive;
- SmallVector<Instruction*, 128> Worklist;
+void AggressiveDeadCodeElimination::markLiveBranchesFromControlDependences() {
- // Collect the set of "root" instructions that are known live.
- for (Instruction &I : instructions(F)) {
- if (isa<TerminatorInst>(I) || I.isEHPad() || I.mayHaveSideEffects()) {
- // Skip any value profile instrumentation calls if they are
- // instrumenting constants.
- if (isInstrumentsConstant(I))
- continue;
- Alive.insert(&I);
- Worklist.push_back(&I);
- }
+ if (BlocksWithDeadTerminators.empty())
+ return;
+
+ DEBUG({
+ dbgs() << "new live blocks:\n";
+ for (auto *BB : NewLiveBlocks)
+ dbgs() << "\t" << BB->getName() << '\n';
+ dbgs() << "dead terminator blocks:\n";
+ for (auto *BB : BlocksWithDeadTerminators)
+ dbgs() << "\t" << BB->getName() << '\n';
+ });
+
+ // The dominance frontier of a live block X in the reverse
+ // control flow graph is the set of blocks upon which X is control
+ // dependent. The following sequence computes the set of blocks
+ // which currently have dead terminators that are control
+ // dependence sources of a block which is in NewLiveBlocks.
+
+ SmallVector<BasicBlock *, 32> IDFBlocks;
+ ReverseIDFCalculator IDFs(PDT);
+ IDFs.setDefiningBlocks(NewLiveBlocks);
+ IDFs.setLiveInBlocks(BlocksWithDeadTerminators);
+ IDFs.calculate(IDFBlocks);
+ NewLiveBlocks.clear();
+
+ // Dead terminators which control live blocks are now marked live.
+ for (auto *BB : IDFBlocks) {
+ DEBUG(dbgs() << "live control in: " << BB->getName() << '\n');
+ markLive(BB->getTerminator());
}
+}
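In the reverse CFG, the dominance frontier of a live block is exactly the set of branches it is control dependent on, which is why a dead-looking branch can still be required. A tiny source-level illustration of what markLiveBranchesFromControlDependences preserves:

    // The store is live (it has a side effect) and executes only when the
    // branch takes its true edge. The branch computes nothing that is used
    // elsewhere, yet ADCE must keep it: the store is control dependent on it.
    void keepBranch(bool c, int *p) {
      if (c)
        *p = 1;
    }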
- // Propagate liveness backwards to operands. Keep track of live debug info
- // scopes.
- SmallPtrSet<const Metadata *, 32> AliveScopes;
- while (!Worklist.empty()) {
- Instruction *Curr = Worklist.pop_back_val();
+//===----------------------------------------------------------------------===//
+//
+// Routines to update the CFG and SSA information before removing dead code.
+//
+//===----------------------------------------------------------------------===//
+bool AggressiveDeadCodeElimination::removeDeadInstructions() {
- // Collect the live debug info scopes attached to this instruction.
- if (const DILocation *DL = Curr->getDebugLoc())
- collectLiveScopes(*DL, AliveScopes);
+ // Updates control and dataflow around dead blocks
+ updateDeadRegions();
- for (Use &OI : Curr->operands()) {
- if (Instruction *Inst = dyn_cast<Instruction>(OI))
- if (Alive.insert(Inst).second)
- Worklist.push_back(Inst);
+ DEBUG({
+ for (Instruction &I : instructions(F)) {
+ // Check if the instruction is alive.
+ if (isLive(&I))
+ continue;
+
+ if (auto *DII = dyn_cast<DbgInfoIntrinsic>(&I)) {
+ // Check if the scope of this variable location is alive.
+ if (AliveScopes.count(DII->getDebugLoc()->getScope()))
+ continue;
+
+ // If intrinsic is pointing at a live SSA value, there may be an
+ // earlier optimization bug: if we know the location of the variable,
+ // why isn't the scope of the location alive?
+ if (Value *V = DII->getVariableLocation())
+ if (Instruction *II = dyn_cast<Instruction>(V))
+ if (isLive(II))
+ dbgs() << "Dropping debug info for " << *DII << "\n";
+ }
}
- }
+ });
// The inverse of the live set is the dead set. These are those instructions
- // which have no side effects and do not influence the control flow or return
+ // that have no side effects and do not influence the control flow or return
// value of the function, and may therefore be deleted safely.
// NOTE: We reuse the Worklist vector here for memory efficiency.
for (Instruction &I : instructions(F)) {
// Check if the instruction is alive.
- if (Alive.count(&I))
+ if (isLive(&I))
continue;
if (auto *DII = dyn_cast<DbgInfoIntrinsic>(&I)) {
@@ -121,15 +529,6 @@ static bool aggressiveDCE(Function& F) {
continue;
// Fallthrough and drop the intrinsic.
- DEBUG({
- // If intrinsic is pointing at a live SSA value, there may be an
- // earlier optimization bug: if we know the location of the variable,
- // why isn't the scope of the location alive?
- if (Value *V = DII->getVariableLocation())
- if (Instruction *II = dyn_cast<Instruction>(V))
- if (Alive.count(II))
- dbgs() << "Dropping debug info for " << *DII << "\n";
- });
}
// Prepare to delete.
@@ -145,8 +544,104 @@ static bool aggressiveDCE(Function& F) {
return !Worklist.empty();
}
-PreservedAnalyses ADCEPass::run(Function &F, FunctionAnalysisManager &) {
- if (!aggressiveDCE(F))
+// A dead region is the set of dead blocks with a common live post-dominator.
+void AggressiveDeadCodeElimination::updateDeadRegions() {
+
+ DEBUG({
+ dbgs() << "final dead terminator blocks: " << '\n';
+ for (auto *BB : BlocksWithDeadTerminators)
+ dbgs() << '\t' << BB->getName()
+ << (BlockInfo[BB].Live ? " LIVE\n" : "\n");
+ });
+
+ // Don't compute the post ordering unless we needed it.
+ bool HavePostOrder = false;
+
+ for (auto *BB : BlocksWithDeadTerminators) {
+ auto &Info = BlockInfo[BB];
+ if (Info.UnconditionalBranch) {
+ InstInfo[Info.Terminator].Live = true;
+ continue;
+ }
+
+ if (!HavePostOrder) {
+ computeReversePostOrder();
+ HavePostOrder = true;
+ }
+
+ // Add an unconditional branch to the successor closest to the
+ // end of the function which ensures a path to the exit for each
+ // live edge.
+ BlockInfoType *PreferredSucc = nullptr;
+ for (auto *Succ : successors(BB)) {
+ auto *Info = &BlockInfo[Succ];
+ if (!PreferredSucc || PreferredSucc->PostOrder < Info->PostOrder)
+ PreferredSucc = Info;
+ }
+ assert((PreferredSucc && PreferredSucc->PostOrder > 0) &&
+ "Failed to find safe successor for dead branc");
+ bool First = true;
+ for (auto *Succ : successors(BB)) {
+ if (!First || Succ != PreferredSucc->BB)
+ Succ->removePredecessor(BB);
+ else
+ First = false;
+ }
+ makeUnconditional(BB, PreferredSucc->BB);
+ NumBranchesRemoved += 1;
+ }
+}
+
+// reverse top-sort order
+void AggressiveDeadCodeElimination::computeReversePostOrder() {
+
+ // This provides a post-order numbering of the reverse control flow graph.
+ // Note that it is incomplete in the presence of infinite loops but we don't
+ // need to number blocks which don't reach the end of the function since
+ // all branches in those blocks are forced live.
+
+ // For each block without successors, extend the DFS from the block
+ // backward through the graph.
+ SmallPtrSet<BasicBlock*, 16> Visited;
+ unsigned PostOrder = 0;
+ for (auto &BB : F) {
+ if (succ_begin(&BB) != succ_end(&BB))
+ continue;
+ for (BasicBlock *Block : inverse_post_order_ext(&BB, Visited))
+ BlockInfo[Block].PostOrder = PostOrder++;
+ }
+}
+
+void AggressiveDeadCodeElimination::makeUnconditional(BasicBlock *BB,
+ BasicBlock *Target) {
+ TerminatorInst *PredTerm = BB->getTerminator();
+ // Collect the live debug info scopes attached to this instruction.
+ if (const DILocation *DL = PredTerm->getDebugLoc())
+ collectLiveScopes(*DL);
+
+ // Just mark an existing unconditional branch as live.
+ if (isUnconditionalBranch(PredTerm)) {
+ PredTerm->setSuccessor(0, Target);
+ InstInfo[PredTerm].Live = true;
+ return;
+ }
+ DEBUG(dbgs() << "making unconditional " << BB->getName() << '\n');
+ NumBranchesRemoved += 1;
+ IRBuilder<> Builder(PredTerm);
+ auto *NewTerm = Builder.CreateBr(Target);
+ InstInfo[NewTerm].Live = true;
+ if (const DILocation *DL = PredTerm->getDebugLoc())
+ NewTerm->setDebugLoc(DL);
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Pass Manager integration code
+//
+//===----------------------------------------------------------------------===//
+PreservedAnalyses ADCEPass::run(Function &F, FunctionAnalysisManager &FAM) {
+ auto &PDT = FAM.getResult<PostDominatorTreeAnalysis>(F);
+ if (!AggressiveDeadCodeElimination(F, PDT).performDeadCodeElimination())
return PreservedAnalyses::all();
// FIXME: This should also 'preserve the CFG'.
@@ -162,21 +657,27 @@ struct ADCELegacyPass : public FunctionPass {
initializeADCELegacyPassPass(*PassRegistry::getPassRegistry());
}
- bool runOnFunction(Function& F) override {
+ bool runOnFunction(Function &F) override {
if (skipFunction(F))
return false;
- return aggressiveDCE(F);
+ auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+ return AggressiveDeadCodeElimination(F, PDT).performDeadCodeElimination();
}
- void getAnalysisUsage(AnalysisUsage& AU) const override {
- AU.setPreservesCFG();
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+ if (!RemoveControlFlowFlag)
+ AU.setPreservesCFG();
AU.addPreserved<GlobalsAAWrapperPass>();
}
};
}
char ADCELegacyPass::ID = 0;
-INITIALIZE_PASS(ADCELegacyPass, "adce", "Aggressive Dead Code Elimination",
- false, false)
+INITIALIZE_PASS_BEGIN(ADCELegacyPass, "adce",
+ "Aggressive Dead Code Elimination", false, false)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_END(ADCELegacyPass, "adce", "Aggressive Dead Code Elimination",
+ false, false)
FunctionPass *llvm::createAggressiveDCEPass() { return new ADCELegacyPass(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
index 7f8b8ce..c1df317 100644
--- a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
@@ -297,6 +297,11 @@ bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) {
if (!extractAlignmentInfo(ACall, AAPtr, AlignSCEV, OffSCEV))
return false;
+ // Skip ConstantPointerNull and UndefValue. Assumptions on these shouldn't
+ // affect other users.
+ if (isa<ConstantData>(AAPtr))
+ return false;
+
const SCEV *AASCEV = SE->getSCEV(AAPtr);
// Apply the assumption to all other users of the specified pointer.
@@ -434,6 +439,11 @@ AlignmentFromAssumptionsPass::run(Function &F, FunctionAnalysisManager &AM) {
ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
bool Changed = runImpl(F, AC, &SE, &DT);
+
+ // FIXME: We need to invalidate this to avoid PR28400. Is there a better
+ // solution?
+ AM.invalidate<ScalarEvolutionAnalysis>(F);
+
if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA;
diff --git a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp
index 4f6225f..251b387 100644
--- a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp
@@ -39,6 +39,12 @@ static bool bitTrackingDCE(Function &F, DemandedBits &DB) {
SmallVector<Instruction*, 128> Worklist;
bool Changed = false;
for (Instruction &I : instructions(F)) {
+ // If the instruction has side effects and no non-dbg uses,
+ // skip it. This way we avoid computing known bits on an instruction
+ // that will not help us.
+ if (I.mayHaveSideEffects() && I.use_empty())
+ continue;
+
if (I.getType()->isIntegerTy() &&
!DB.getDemandedBits(&I).getBoolValue()) {
// For live instructions that have all dead bits, first make them dead by
@@ -50,7 +56,7 @@ static bool bitTrackingDCE(Function &F, DemandedBits &DB) {
// undef, poison, etc.
Value *Zero = ConstantInt::get(I.getType(), 0);
++NumSimplified;
- I.replaceAllUsesWith(Zero);
+ I.replaceNonMetadataUsesWith(Zero);
Changed = true;
}
if (!DB.isInstructionDead(&I))
diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
index 913e939..3826251 100644
--- a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -64,7 +64,7 @@ public:
bool runOnFunction(Function &Fn) override;
- const char *getPassName() const override { return "Constant Hoisting"; }
+ StringRef getPassName() const override { return "Constant Hoisting"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
@@ -444,7 +444,7 @@ void ConstantHoistingPass::findBaseConstants() {
/// \brief Updates the operand at Idx in instruction Inst with the result of
/// instruction Mat. If the instruction is a PHI node then special
-/// handling for duplicate values form the same incomming basic block is
+/// handling for duplicate values from the same incoming basic block is
/// required.
/// \return The update will always succeed, but the return value indicated if
/// Mat was used for the update or not.
diff --git a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index c0fed05..84f9373 100644
--- a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -18,6 +18,7 @@
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LazyValueInfo.h"
#include "llvm/IR/CFG.h"
+#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
@@ -37,8 +38,11 @@ STATISTIC(NumCmps, "Number of comparisons propagated");
STATISTIC(NumReturns, "Number of return values propagated");
STATISTIC(NumDeadCases, "Number of switch cases removed");
STATISTIC(NumSDivs, "Number of sdiv converted to udiv");
+STATISTIC(NumAShrs, "Number of ashr converted to lshr");
STATISTIC(NumSRems, "Number of srem converted to urem");
+static cl::opt<bool> DontProcessAdds("cvp-dont-process-adds", cl::init(true));
+
namespace {
class CorrelatedValuePropagation : public FunctionPass {
public:
@@ -381,6 +385,81 @@ static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) {
return true;
}
+static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) {
+ if (SDI->getType()->isVectorTy() || hasLocalDefs(SDI))
+ return false;
+
+ Constant *Zero = ConstantInt::get(SDI->getType(), 0);
+ if (LVI->getPredicateAt(ICmpInst::ICMP_SGE, SDI->getOperand(0), Zero, SDI) !=
+ LazyValueInfo::True)
+ return false;
+
+ ++NumAShrs;
+ auto *BO = BinaryOperator::CreateLShr(SDI->getOperand(0), SDI->getOperand(1),
+ SDI->getName(), SDI);
+ BO->setIsExact(SDI->isExact());
+ SDI->replaceAllUsesWith(BO);
+ SDI->eraseFromParent();
+
+ return true;
+}
+
+static bool processAdd(BinaryOperator *AddOp, LazyValueInfo *LVI) {
+ typedef OverflowingBinaryOperator OBO;
+
+ if (DontProcessAdds)
+ return false;
+
+ if (AddOp->getType()->isVectorTy() || hasLocalDefs(AddOp))
+ return false;
+
+ bool NSW = AddOp->hasNoSignedWrap();
+ bool NUW = AddOp->hasNoUnsignedWrap();
+ if (NSW && NUW)
+ return false;
+
+ BasicBlock *BB = AddOp->getParent();
+
+ Value *LHS = AddOp->getOperand(0);
+ Value *RHS = AddOp->getOperand(1);
+
+ ConstantRange LRange = LVI->getConstantRange(LHS, BB, AddOp);
+
+ // Initialize RRange only if we need it. If we know that the guaranteed
+ // no-wrap region for the given LHS range is empty, don't spend time
+ // calculating the range for the RHS.
+ Optional<ConstantRange> RRange;
+ auto LazyRRange = [&] () {
+ if (!RRange)
+ RRange = LVI->getConstantRange(RHS, BB, AddOp);
+ return RRange.getValue();
+ };
+
+ bool Changed = false;
+ if (!NUW) {
+ ConstantRange NUWRange =
+ LRange.makeGuaranteedNoWrapRegion(BinaryOperator::Add, LRange,
+ OBO::NoUnsignedWrap);
+ if (!NUWRange.isEmptySet()) {
+ bool NewNUW = NUWRange.contains(LazyRRange());
+ AddOp->setHasNoUnsignedWrap(NewNUW);
+ Changed |= NewNUW;
+ }
+ }
+ if (!NSW) {
+ ConstantRange NSWRange =
+ LRange.makeGuaranteedNoWrapRegion(BinaryOperator::Add, LRange,
+ OBO::NoSignedWrap);
+ if (!NSWRange.isEmptySet()) {
+ bool NewNSW = NSWRange.contains(LazyRRange());
+ AddOp->setHasNoSignedWrap(NewNSW);
+ Changed |= NewNSW;
+ }
+ }
+
+ return Changed;
+}
+
static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) {
if (Constant *C = LVI->getConstant(V, At->getParent(), At))
return C;
@@ -407,9 +486,14 @@ static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) {
static bool runImpl(Function &F, LazyValueInfo *LVI) {
bool FnChanged = false;
- for (BasicBlock &BB : F) {
+ // Visiting in a pre-order depth-first traversal causes us to simplify early
+ // blocks before querying later blocks (which require us to analyze early
+ // blocks). Eagerly simplifying shallow blocks means there is strictly less
+ // work to do for deep blocks. This also means we don't visit unreachable
+ // blocks.
+ for (BasicBlock *BB : depth_first(&F.getEntryBlock())) {
bool BBChanged = false;
- for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
+ for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
Instruction *II = &*BI++;
switch (II->getOpcode()) {
case Instruction::Select:
@@ -436,10 +520,16 @@ static bool runImpl(Function &F, LazyValueInfo *LVI) {
case Instruction::SDiv:
BBChanged |= processSDiv(cast<BinaryOperator>(II), LVI);
break;
+ case Instruction::AShr:
+ BBChanged |= processAShr(cast<BinaryOperator>(II), LVI);
+ break;
+ case Instruction::Add:
+ BBChanged |= processAdd(cast<BinaryOperator>(II), LVI);
+ break;
}
}
- Instruction *Term = BB.getTerminator();
+ Instruction *Term = BB->getTerminator();
switch (Term->getOpcode()) {
case Instruction::Switch:
BBChanged |= processSwitch(cast<SwitchInst>(Term), LVI);
diff --git a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
index f73809d..cc2a3cf 100644
--- a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
@@ -123,7 +123,7 @@ static bool eliminateDeadCode(Function &F, TargetLibraryInfo *TLI) {
return MadeChange;
}
-PreservedAnalyses DCEPass::run(Function &F, AnalysisManager<Function> &AM) {
+PreservedAnalyses DCEPass::run(Function &F, FunctionAnalysisManager &AM) {
if (eliminateDeadCode(F, AM.getCachedResult<TargetLibraryAnalysis>(F)))
return PreservedAnalyses::none();
return PreservedAnalyses::all();
diff --git a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index ed58a87..4d4c3ba 100644
--- a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -59,6 +59,8 @@ EnablePartialOverwriteTracking("enable-dse-partial-overwrite-tracking",
//===----------------------------------------------------------------------===//
// Helper functions
//===----------------------------------------------------------------------===//
+typedef std::map<int64_t, int64_t> OverlapIntervalsTy;
+typedef DenseMap<Instruction *, OverlapIntervalsTy> InstOverlapIntervalsTy;
/// Delete this instruction. Before we do, go through and zero out all the
/// operands of this instruction. If any of them become dead, delete them and
@@ -67,6 +69,8 @@ EnablePartialOverwriteTracking("enable-dse-partial-overwrite-tracking",
static void
deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI,
MemoryDependenceResults &MD, const TargetLibraryInfo &TLI,
+ InstOverlapIntervalsTy &IOL,
+ DenseMap<Instruction*, size_t> *InstrOrdering,
SmallSetVector<Value *, 16> *ValueSet = nullptr) {
SmallVector<Instruction*, 32> NowDeadInsts;
@@ -99,13 +103,14 @@ deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI,
NowDeadInsts.push_back(OpI);
}
+ if (ValueSet) ValueSet->remove(DeadInst);
+ InstrOrdering->erase(DeadInst);
+ IOL.erase(DeadInst);
if (NewIter == DeadInst->getIterator())
NewIter = DeadInst->eraseFromParent();
else
DeadInst->eraseFromParent();
-
- if (ValueSet) ValueSet->remove(DeadInst);
} while (!NowDeadInsts.empty());
*BBI = NewIter;
}
@@ -290,9 +295,6 @@ enum OverwriteResult {
};
}
-typedef DenseMap<Instruction *,
- std::map<int64_t, int64_t>> InstOverlapIntervalsTy;
-
/// Return 'OverwriteComplete' if a store to the 'Later' location completely
/// overwrites a store to the 'Earlier' location, 'OverwriteEnd' if the end of
/// the 'Earlier' location is completely overwritten by 'Later',
@@ -438,9 +440,9 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
//
// In this case we may want to trim the size of earlier to avoid generating
// writes to addresses which will definitely be overwritten later
- if (LaterOff > EarlierOff &&
- LaterOff < int64_t(EarlierOff + Earlier.Size) &&
- int64_t(LaterOff + Later.Size) >= int64_t(EarlierOff + Earlier.Size))
+ if (!EnablePartialOverwriteTracking &&
+ (LaterOff > EarlierOff && LaterOff < int64_t(EarlierOff + Earlier.Size) &&
+ int64_t(LaterOff + Later.Size) >= int64_t(EarlierOff + Earlier.Size)))
return OverwriteEnd;
// Finally, we also need to check if the later store overwrites the beginning
@@ -452,9 +454,11 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
// In this case we may want to move the destination address and trim the size
// of earlier to avoid generating writes to addresses which will definitely
// be overwritten later.
- if (LaterOff <= EarlierOff && int64_t(LaterOff + Later.Size) > EarlierOff) {
- assert (int64_t(LaterOff + Later.Size) < int64_t(EarlierOff + Earlier.Size)
- && "Expect to be handled as OverwriteComplete" );
+ if (!EnablePartialOverwriteTracking &&
+ (LaterOff <= EarlierOff && int64_t(LaterOff + Later.Size) > EarlierOff)) {
+ assert(int64_t(LaterOff + Later.Size) <
+ int64_t(EarlierOff + Earlier.Size) &&
+ "Expect to be handled as OverwriteComplete");
return OverwriteBegin;
}
// Otherwise, they don't completely overlap.
@@ -505,7 +509,6 @@ static bool isPossibleSelfRead(Instruction *Inst,
return true;
}
-
/// Returns true if the memory which is accessed by the second instruction is not
/// modified between the first and the second instruction.
/// Precondition: Second instruction must be dominated by the first
@@ -585,7 +588,9 @@ static void findUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks,
/// to a field of that structure.
static bool handleFree(CallInst *F, AliasAnalysis *AA,
MemoryDependenceResults *MD, DominatorTree *DT,
- const TargetLibraryInfo *TLI) {
+ const TargetLibraryInfo *TLI,
+ InstOverlapIntervalsTy &IOL,
+ DenseMap<Instruction*, size_t> *InstrOrdering) {
bool MadeChange = false;
MemoryLocation Loc = MemoryLocation(F->getOperand(0));
@@ -612,9 +617,12 @@ static bool handleFree(CallInst *F, AliasAnalysis *AA,
if (!AA->isMustAlias(F->getArgOperand(0), DepPointer))
break;
+ DEBUG(dbgs() << "DSE: Dead Store to soon to be freed memory:\n DEAD: "
+ << *Dependency << '\n');
+
// DCE instructions only used to calculate that store.
BasicBlock::iterator BBI(Dependency);
- deleteDeadInstruction(Dependency, &BBI, *MD, *TLI);
+ deleteDeadInstruction(Dependency, &BBI, *MD, *TLI, IOL, InstrOrdering);
++NumFastStores;
MadeChange = true;
@@ -669,7 +677,9 @@ static void removeAccessedObjects(const MemoryLocation &LoadedLoc,
/// ret void
static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
MemoryDependenceResults *MD,
- const TargetLibraryInfo *TLI) {
+ const TargetLibraryInfo *TLI,
+ InstOverlapIntervalsTy &IOL,
+ DenseMap<Instruction*, size_t> *InstrOrdering) {
bool MadeChange = false;
// Keep track of all of the stack objects that are dead at the end of the
@@ -728,7 +738,7 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
dbgs() << '\n');
// DCE instructions only used to calculate that store.
- deleteDeadInstruction(Dead, &BBI, *MD, *TLI, &DeadStackObjects);
+ deleteDeadInstruction(Dead, &BBI, *MD, *TLI, IOL, InstrOrdering, &DeadStackObjects);
++NumFastStores;
MadeChange = true;
continue;
@@ -737,7 +747,9 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
// Remove any dead non-memory-mutating instructions.
if (isInstructionTriviallyDead(&*BBI, TLI)) {
- deleteDeadInstruction(&*BBI, &BBI, *MD, *TLI, &DeadStackObjects);
+ DEBUG(dbgs() << "DSE: Removing trivially dead instruction:\n DEAD: "
+ << *&*BBI << '\n');
+ deleteDeadInstruction(&*BBI, &BBI, *MD, *TLI, IOL, InstrOrdering, &DeadStackObjects);
++NumFastOther;
MadeChange = true;
continue;
@@ -819,10 +831,125 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
return MadeChange;
}
+static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierOffset,
+ int64_t &EarlierSize, int64_t LaterOffset,
+ int64_t LaterSize, bool IsOverwriteEnd) {
+ // TODO: base this on the target vector size so that if the earlier
+ // store was too small to get vector writes anyway then it's likely
+ // a good idea to shorten it.
+ // Power-of-2 vector writes are probably always a bad idea to optimize
+ // as any store/memset/memcpy is likely using vector instructions so
+ // shortening it to a non-vector size is likely to be slower.
+ MemIntrinsic *EarlierIntrinsic = cast<MemIntrinsic>(EarlierWrite);
+ unsigned EarlierWriteAlign = EarlierIntrinsic->getAlignment();
+ if (!IsOverwriteEnd)
+ LaterOffset = int64_t(LaterOffset + LaterSize);
+
+ if (!(llvm::isPowerOf2_64(LaterOffset) && EarlierWriteAlign <= LaterOffset) &&
+ !((EarlierWriteAlign != 0) && LaterOffset % EarlierWriteAlign == 0))
+ return false;
+
+ DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW "
+ << (IsOverwriteEnd ? "END" : "BEGIN") << ": " << *EarlierWrite
+ << "\n KILLER (offset " << LaterOffset << ", " << EarlierSize
+ << ")\n");
+
+ int64_t NewLength = IsOverwriteEnd
+ ? LaterOffset - EarlierOffset
+ : EarlierSize - (LaterOffset - EarlierOffset);
+
+ Value *EarlierWriteLength = EarlierIntrinsic->getLength();
+ Value *TrimmedLength =
+ ConstantInt::get(EarlierWriteLength->getType(), NewLength);
+ EarlierIntrinsic->setLength(TrimmedLength);
+
+ EarlierSize = NewLength;
+ if (!IsOverwriteEnd) {
+ int64_t OffsetMoved = (LaterOffset - EarlierOffset);
+ Value *Indices[1] = {
+ ConstantInt::get(EarlierWriteLength->getType(), OffsetMoved)};
+ GetElementPtrInst *NewDestGEP = GetElementPtrInst::CreateInBounds(
+ EarlierIntrinsic->getRawDest(), Indices, "", EarlierWrite);
+ EarlierIntrinsic->setDest(NewDestGEP);
+ EarlierOffset = EarlierOffset + OffsetMoved;
+ }
+ return true;
+}
+
+static bool tryToShortenEnd(Instruction *EarlierWrite,
+ OverlapIntervalsTy &IntervalMap,
+ int64_t &EarlierStart, int64_t &EarlierSize) {
+ if (IntervalMap.empty() || !isShortenableAtTheEnd(EarlierWrite))
+ return false;
+
+ OverlapIntervalsTy::iterator OII = --IntervalMap.end();
+ int64_t LaterStart = OII->second;
+ int64_t LaterSize = OII->first - LaterStart;
+
+ if (LaterStart > EarlierStart && LaterStart < EarlierStart + EarlierSize &&
+ LaterStart + LaterSize >= EarlierStart + EarlierSize) {
+ if (tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart,
+ LaterSize, true)) {
+ IntervalMap.erase(OII);
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool tryToShortenBegin(Instruction *EarlierWrite,
+ OverlapIntervalsTy &IntervalMap,
+ int64_t &EarlierStart, int64_t &EarlierSize) {
+ if (IntervalMap.empty() || !isShortenableAtTheBeginning(EarlierWrite))
+ return false;
+
+ OverlapIntervalsTy::iterator OII = IntervalMap.begin();
+ int64_t LaterStart = OII->second;
+ int64_t LaterSize = OII->first - LaterStart;
+
+ if (LaterStart <= EarlierStart && LaterStart + LaterSize > EarlierStart) {
+ assert(LaterStart + LaterSize < EarlierStart + EarlierSize &&
+ "Should have been handled as OverwriteComplete");
+ if (tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart,
+ LaterSize, false)) {
+ IntervalMap.erase(OII);
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool removePartiallyOverlappedStores(AliasAnalysis *AA,
+ const DataLayout &DL,
+ InstOverlapIntervalsTy &IOL) {
+ bool Changed = false;
+ for (auto OI : IOL) {
+ Instruction *EarlierWrite = OI.first;
+ MemoryLocation Loc = getLocForWrite(EarlierWrite, *AA);
+ assert(isRemovable(EarlierWrite) && "Expect only removable instruction");
+ assert(Loc.Size != MemoryLocation::UnknownSize && "Unexpected mem loc");
+
+ const Value *Ptr = Loc.Ptr->stripPointerCasts();
+ int64_t EarlierStart = 0;
+ int64_t EarlierSize = int64_t(Loc.Size);
+ GetPointerBaseWithConstantOffset(Ptr, EarlierStart, DL);
+ OverlapIntervalsTy &IntervalMap = OI.second;
+ Changed |=
+ tryToShortenEnd(EarlierWrite, IntervalMap, EarlierStart, EarlierSize);
+ if (IntervalMap.empty())
+ continue;
+ Changed |=
+ tryToShortenBegin(EarlierWrite, IntervalMap, EarlierStart, EarlierSize);
+ }
+ return Changed;
+}
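removePartiallyOverlappedStores walks the per-instruction interval maps accumulated in IOL and trims earlier memset/memcpy calls whose head or tail is fully covered by later stores. A hedged, source-level picture of the OverwriteEnd case (the sizes and offsets are only an example):

    #include <cstring>

    // Before DSE: the memcpy kills bytes [48, 64) of the memset.
    void beforeDSE(char *p, const char *q) {
      std::memset(p, 0, 64);
      std::memcpy(p + 48, q, 16);
    }

    // After tryToShortenEnd: the earlier write is trimmed to the live prefix.
    // (tryToShortenBegin is the mirror image: it advances the destination and
    // shrinks the length when the head of the earlier write is overwritten.)
    void afterDSE(char *p, const char *q) {
      std::memset(p, 0, 48);
      std::memcpy(p + 48, q, 16);
    }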
+
static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI,
AliasAnalysis *AA, MemoryDependenceResults *MD,
const DataLayout &DL,
- const TargetLibraryInfo *TLI) {
+ const TargetLibraryInfo *TLI,
+ InstOverlapIntervalsTy &IOL,
+ DenseMap<Instruction*, size_t> *InstrOrdering) {
// Must be a store instruction.
StoreInst *SI = dyn_cast<StoreInst>(Inst);
if (!SI)
@@ -837,7 +964,7 @@ static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI,
DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n LOAD: "
<< *DepLoad << "\n STORE: " << *SI << '\n');
- deleteDeadInstruction(SI, &BBI, *MD, *TLI);
+ deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, InstrOrdering);
++NumRedundantStores;
return true;
}
@@ -855,7 +982,7 @@ static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI,
dbgs() << "DSE: Remove null store to the calloc'ed object:\n DEAD: "
<< *Inst << "\n OBJECT: " << *UnderlyingPointer << '\n');
- deleteDeadInstruction(SI, &BBI, *MD, *TLI);
+ deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, InstrOrdering);
++NumRedundantStores;
return true;
}
@@ -869,6 +996,12 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
const DataLayout &DL = BB.getModule()->getDataLayout();
bool MadeChange = false;
+ // FIXME: Maybe change this to use some abstraction like OrderedBasicBlock?
+ // The current OrderedBasicBlock can't deal with mutation at the moment.
+ size_t LastThrowingInstIndex = 0;
+ DenseMap<Instruction*, size_t> InstrOrdering;
+ size_t InstrIndex = 1;
+
// A map of interval maps representing partially-overwritten value parts.
InstOverlapIntervalsTy IOL;
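The new InstrOrdering / LastThrowingInstIndex bookkeeping gives each instruction in the block a monotonically increasing index and remembers the index of the most recent instruction that may throw; a candidate dead store ordered before that point can only be deleted when its underlying object cannot be observed on the unwind path. A compact sketch of just that bookkeeping, framed as a standalone loop (the surrounding pass state is assumed):

    #include "llvm/ADT/DenseMap.h"
    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Instruction.h"
    using namespace llvm;

    // Number the instructions of BB in program order and record where the
    // last potentially-throwing instruction sits. A store whose number is
    // <= LastThrowingInstIndex may still be read on the unwind path, so DSE
    // keeps it unless the stored-to object is provably unobservable there
    // (e.g. a non-escaping alloca).
    static void numberInstructions(BasicBlock &BB,
                                   DenseMap<Instruction *, size_t> &Order,
                                   size_t &LastThrowingInstIndex) {
      size_t InstrIndex = 1;
      LastThrowingInstIndex = 0;
      for (Instruction &I : BB) {
        size_t Cur = InstrIndex++;
        Order[&I] = Cur;
        if (I.mayThrow())
          LastThrowingInstIndex = Cur;
      }
    }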
@@ -876,7 +1009,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) {
// Handle 'free' calls specially.
if (CallInst *F = isFreeCall(&*BBI, TLI)) {
- MadeChange |= handleFree(F, AA, MD, DT, TLI);
+ MadeChange |= handleFree(F, AA, MD, DT, TLI, IOL, &InstrOrdering);
// Increment BBI after handleFree has potentially deleted instructions.
// This ensures we maintain a valid iterator.
++BBI;
@@ -885,12 +1018,19 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
Instruction *Inst = &*BBI++;
+ size_t CurInstNumber = InstrIndex++;
+ InstrOrdering.insert(std::make_pair(Inst, CurInstNumber));
+ if (Inst->mayThrow()) {
+ LastThrowingInstIndex = CurInstNumber;
+ continue;
+ }
+
// Check to see if Inst writes to memory. If not, continue.
if (!hasMemoryWrite(Inst, *TLI))
continue;
// eliminateNoopStore will update in iterator, if necessary.
- if (eliminateNoopStore(Inst, BBI, AA, MD, DL, TLI)) {
+ if (eliminateNoopStore(Inst, BBI, AA, MD, DL, TLI, IOL, &InstrOrdering)) {
MadeChange = true;
continue;
}
@@ -910,6 +1050,13 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
if (!Loc.Ptr)
continue;
+ // Loop until we find a store we can eliminate or a load that
+ // invalidates the analysis. Without an upper bound on the number of
+ // instructions examined, this analysis can become very time-consuming.
+ // However, the potential gain diminishes as we process more instructions
+ // without eliminating any of them. Therefore, we limit the number of
+ // instructions we look at.
+ auto Limit = MD->getDefaultBlockScanLimit();
while (InstDep.isDef() || InstDep.isClobber()) {
// Get the memory clobbered by the instruction we depend on. MemDep will
// skip any instructions that 'Loc' clearly doesn't interact with. If we
@@ -924,6 +1071,31 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
if (!DepLoc.Ptr)
break;
+ // Make sure we don't look past a call which might throw. This is an
+ // issue because MemoryDependenceAnalysis works in the wrong direction:
+ // it finds instructions which dominate the current instruction, rather than
+ // instructions which are post-dominated by the current instruction.
+ //
+ // If the underlying object is a non-escaping memory allocation, any store
+ // to it is dead along the unwind edge. Otherwise, we need to preserve
+ // the store.
+ size_t DepIndex = InstrOrdering.lookup(DepWrite);
+ assert(DepIndex && "Unexpected instruction");
+ if (DepIndex <= LastThrowingInstIndex) {
+ const Value* Underlying = GetUnderlyingObject(DepLoc.Ptr, DL);
+ bool IsStoreDeadOnUnwind = isa<AllocaInst>(Underlying);
+ if (!IsStoreDeadOnUnwind) {
+ // We're looking for a call to an allocation function
+ // where the allocation doesn't escape before the last
+ // throwing instruction; PointerMayBeCaptured is a
+ // reasonably fast approximation.
+ IsStoreDeadOnUnwind = isAllocLikeFn(Underlying, TLI) &&
+ !PointerMayBeCaptured(Underlying, false, true);
+ }
+ if (!IsStoreDeadOnUnwind)
+ break;
+ }
+
// If we find a write that is a) removable (i.e., non-volatile), b) is
// completely obliterated by the store to 'Loc', and c) which we know that
// 'Inst' doesn't load from, then we can remove it.
@@ -938,7 +1110,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
<< *DepWrite << "\n KILLER: " << *Inst << '\n');
// Delete the store and now-dead instructions that feed it.
- deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI);
+ deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, &InstrOrdering);
++NumFastStores;
MadeChange = true;
@@ -948,48 +1120,14 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
} else if ((OR == OverwriteEnd && isShortenableAtTheEnd(DepWrite)) ||
((OR == OverwriteBegin &&
isShortenableAtTheBeginning(DepWrite)))) {
- // TODO: base this on the target vector size so that if the earlier
- // store was too small to get vector writes anyway then its likely
- // a good idea to shorten it
- // Power of 2 vector writes are probably always a bad idea to optimize
- // as any store/memset/memcpy is likely using vector instructions so
- // shortening it to not vector size is likely to be slower
- MemIntrinsic *DepIntrinsic = cast<MemIntrinsic>(DepWrite);
- unsigned DepWriteAlign = DepIntrinsic->getAlignment();
+ assert(!EnablePartialOverwriteTracking && "Do not expect to perform "
+ "shortening here when partial-overwrite "
+ "tracking is enabled");
+ int64_t EarlierSize = DepLoc.Size;
+ int64_t LaterSize = Loc.Size;
bool IsOverwriteEnd = (OR == OverwriteEnd);
- if (!IsOverwriteEnd)
- InstWriteOffset = int64_t(InstWriteOffset + Loc.Size);
-
- if ((llvm::isPowerOf2_64(InstWriteOffset) &&
- DepWriteAlign <= InstWriteOffset) ||
- ((DepWriteAlign != 0) && InstWriteOffset % DepWriteAlign == 0)) {
-
- DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW "
- << (IsOverwriteEnd ? "END" : "BEGIN") << ": "
- << *DepWrite << "\n KILLER (offset "
- << InstWriteOffset << ", " << DepLoc.Size << ")"
- << *Inst << '\n');
-
- int64_t NewLength =
- IsOverwriteEnd
- ? InstWriteOffset - DepWriteOffset
- : DepLoc.Size - (InstWriteOffset - DepWriteOffset);
-
- Value *DepWriteLength = DepIntrinsic->getLength();
- Value *TrimmedLength =
- ConstantInt::get(DepWriteLength->getType(), NewLength);
- DepIntrinsic->setLength(TrimmedLength);
-
- if (!IsOverwriteEnd) {
- int64_t OffsetMoved = (InstWriteOffset - DepWriteOffset);
- Value *Indices[1] = {
- ConstantInt::get(DepWriteLength->getType(), OffsetMoved)};
- GetElementPtrInst *NewDestGEP = GetElementPtrInst::CreateInBounds(
- DepIntrinsic->getRawDest(), Indices, "", DepWrite);
- DepIntrinsic->setDest(NewDestGEP);
- }
- MadeChange = true;
- }
+ MadeChange |= tryToShorten(DepWrite, DepWriteOffset, EarlierSize,
+ InstWriteOffset, LaterSize, IsOverwriteEnd);
}
}
@@ -1007,15 +1145,19 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
if (AA->getModRefInfo(DepWrite, Loc) & MRI_Ref)
break;
- InstDep = MD->getPointerDependencyFrom(Loc, false,
- DepWrite->getIterator(), &BB);
+ InstDep = MD->getPointerDependencyFrom(Loc, /*isLoad=*/ false,
+ DepWrite->getIterator(), &BB,
+ /*QueryInst=*/ nullptr, &Limit);
}
}
+ if (EnablePartialOverwriteTracking)
+ MadeChange |= removePartiallyOverlappedStores(AA, DL, IOL);
+
// If this block ends in a return, unwind, or unreachable, all allocas are
// dead at its end, which means stores to them are also dead.
if (BB.getTerminator()->getNumSuccessors() == 0)
- MadeChange |= handleEndBlock(BB, AA, MD, TLI);
+ MadeChange |= handleEndBlock(BB, AA, MD, TLI, IOL, &InstrOrdering);
return MadeChange;
}
@@ -1029,6 +1171,7 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis *AA,
// cycles that will confuse alias analysis.
if (DT->isReachableFromEntry(&BB))
MadeChange |= eliminateDeadStores(BB, AA, MD, DT, TLI);
+
return MadeChange;
}
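Editor's note (illustrative sketch, not part of the patch): the DeadStoreElimination hunks above assign every instruction in a block an increasing index and record the index of the last instruction that may throw; a candidate dead store found through MemoryDependenceAnalysis is removed only if it is ordered after that index, unless the stored-to object is provably invisible to the unwind path. The standalone C++ toy below, with hypothetical names, models just that ordering test.

#include <cassert>
#include <cstddef>
#include <unordered_map>

struct Inst { bool MayThrow = false; };

// Toy model of the InstrOrdering / LastThrowingInstIndex bookkeeping above.
class BlockOrdering {
  std::unordered_map<const Inst *, std::size_t> Index;
  std::size_t LastThrowing = 0;
  std::size_t Next = 1;

public:
  void visit(const Inst &I) {
    Index[&I] = Next++;
    if (I.MayThrow)
      LastThrowing = Index[&I];
  }

  // An earlier store killed by a later store may be dropped only when no
  // may-throw instruction has been seen since it was visited (the real pass
  // additionally allows stores to non-escaping allocations, not modelled here).
  bool safeToRemove(const Inst &EarlierStore) const {
    auto It = Index.find(&EarlierStore);
    assert(It != Index.end() && "store was never visited");
    return It->second > LastThrowing;
  }
};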
diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
index 0b16e27..16e08ee 100644
--- a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -32,6 +32,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/MemorySSA.h"
#include <deque>
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -251,6 +252,7 @@ public:
const TargetTransformInfo &TTI;
DominatorTree &DT;
AssumptionCache &AC;
+ MemorySSA *MSSA;
typedef RecyclingAllocator<
BumpPtrAllocator, ScopedHashTableVal<SimpleValue, Value *>> AllocatorTy;
typedef ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>,
@@ -312,8 +314,8 @@ public:
/// \brief Set up the EarlyCSE runner for a particular function.
EarlyCSE(const TargetLibraryInfo &TLI, const TargetTransformInfo &TTI,
- DominatorTree &DT, AssumptionCache &AC)
- : TLI(TLI), TTI(TTI), DT(DT), AC(AC), CurrentGeneration(0) {}
+ DominatorTree &DT, AssumptionCache &AC, MemorySSA *MSSA)
+ : TLI(TLI), TTI(TTI), DT(DT), AC(AC), MSSA(MSSA), CurrentGeneration(0) {}
bool run();
@@ -338,7 +340,7 @@ private:
};
// Contains all the needed information to create a stack for doing a depth
- // first tranversal of the tree. This includes scopes for values, loads, and
+ // first traversal of the tree. This includes scopes for values, loads, and
// calls as well as the generation. There is a child iterator so that the
// children do not need to be store separately.
class StackNode {
@@ -479,17 +481,93 @@ private:
bool processNode(DomTreeNode *Node);
Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const {
- if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+ if (auto *LI = dyn_cast<LoadInst>(Inst))
return LI;
- else if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+ if (auto *SI = dyn_cast<StoreInst>(Inst))
return SI->getValueOperand();
assert(isa<IntrinsicInst>(Inst) && "Instruction not supported");
return TTI.getOrCreateResultFromMemIntrinsic(cast<IntrinsicInst>(Inst),
ExpectedType);
}
+
+ bool isSameMemGeneration(unsigned EarlierGeneration, unsigned LaterGeneration,
+ Instruction *EarlierInst, Instruction *LaterInst);
+
+ void removeMSSA(Instruction *Inst) {
+ if (!MSSA)
+ return;
+ // Removing a store here can leave MemorySSA in an unoptimized state by
+ // creating MemoryPhis that have identical arguments and by creating
+ // MemoryUses whose defining access is not an actual clobber. We handle the
+ // phi case eagerly here. The non-optimized MemoryUse case is lazily
+ // updated by MemorySSA's getClobberingMemoryAccess.
+ if (MemoryAccess *MA = MSSA->getMemoryAccess(Inst)) {
+ // Optimize MemoryPhi nodes that may become redundant by having all the
+ // same input values once MA is removed.
+ SmallVector<MemoryPhi *, 4> PhisToCheck;
+ SmallVector<MemoryAccess *, 8> WorkQueue;
+ WorkQueue.push_back(MA);
+ // Process MemoryPhi nodes in FIFO order using an ever-growing vector since
+ // we shouldn't be processing that many phis and this will avoid an
+ // allocation in almost all cases.
+ for (unsigned I = 0; I < WorkQueue.size(); ++I) {
+ MemoryAccess *WI = WorkQueue[I];
+
+ for (auto *U : WI->users())
+ if (MemoryPhi *MP = dyn_cast<MemoryPhi>(U))
+ PhisToCheck.push_back(MP);
+
+ MSSA->removeMemoryAccess(WI);
+
+ for (MemoryPhi *MP : PhisToCheck) {
+ MemoryAccess *FirstIn = MP->getIncomingValue(0);
+ if (all_of(MP->incoming_values(),
+ [=](Use &In) { return In == FirstIn; }))
+ WorkQueue.push_back(MP);
+ }
+ PhisToCheck.clear();
+ }
+ }
+ }
};
}
+/// Determine if the memory referenced by LaterInst is from the same heap
+/// version as EarlierInst.
+/// This is currently called in two scenarios:
+///
+/// load p
+/// ...
+/// load p
+///
+/// and
+///
+/// x = load p
+/// ...
+/// store x, p
+///
+/// in both cases we want to verify that there are no possible writes to the
+/// memory referenced by p between the earlier and later instruction.
+bool EarlyCSE::isSameMemGeneration(unsigned EarlierGeneration,
+ unsigned LaterGeneration,
+ Instruction *EarlierInst,
+ Instruction *LaterInst) {
+ // Check the simple memory generation tracking first.
+ if (EarlierGeneration == LaterGeneration)
+ return true;
+
+ if (!MSSA)
+ return false;
+
+ // Since we know LaterDef dominates LaterInst and EarlierInst dominates
+ // LaterInst, if LaterDef dominates EarlierInst then it can't occur between
+ // EarlierInst and LaterInst and neither can any other write that potentially
+ // clobbers LaterInst.
+ MemoryAccess *LaterDef =
+ MSSA->getWalker()->getClobberingMemoryAccess(LaterInst);
+ return MSSA->dominates(LaterDef, MSSA->getMemoryAccess(EarlierInst));
+}
+
bool EarlyCSE::processNode(DomTreeNode *Node) {
bool Changed = false;
BasicBlock *BB = Node->getBlock();
@@ -547,6 +625,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// Dead instructions should just be removed.
if (isInstructionTriviallyDead(Inst, &TLI)) {
DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n');
+ removeMSSA(Inst);
Inst->eraseFromParent();
Changed = true;
++NumSimplify;
@@ -562,6 +641,19 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
continue;
}
+ // Skip invariant.start intrinsics since they only read memory, and we can
+ // forward values across them. Also, we don't need to consume the last store
+ // since the semantics of invariant.start allow us to perform DSE of the
+ // last store, if there was a store following invariant.start. Consider:
+ //
+ // store 30, i8* p
+ // invariant.start(p)
+ // store 40, i8* p
+ // We can DSE the store of 30, since the store of 40 to the invariant location p
+ // causes undefined behaviour.
+ if (match(Inst, m_Intrinsic<Intrinsic::invariant_start>()))
+ continue;
+
if (match(Inst, m_Intrinsic<Intrinsic::experimental_guard>())) {
if (auto *CondI =
dyn_cast<Instruction>(cast<CallInst>(Inst)->getArgOperand(0))) {
@@ -588,6 +680,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
Changed = true;
}
if (isInstructionTriviallyDead(Inst, &TLI)) {
+ removeMSSA(Inst);
Inst->eraseFromParent();
Changed = true;
Killed = true;
@@ -606,6 +699,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
if (auto *I = dyn_cast<Instruction>(V))
I->andIRFlags(Inst);
Inst->replaceAllUsesWith(V);
+ removeMSSA(Inst);
Inst->eraseFromParent();
Changed = true;
++NumCSE;
@@ -631,24 +725,26 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// generation or the load is known to be from an invariant location,
// replace this instruction.
//
- // A dominating invariant load implies that the location loaded from is
- // unchanging beginning at the point of the invariant load, so the load
- // we're CSE'ing _away_ does not need to be invariant, only the available
- // load we're CSE'ing _to_ does.
+ // If either the dominating load or the current load are invariant, then
+ // we can assume the current load loads the same value as the dominating
+ // load.
LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
if (InVal.DefInst != nullptr &&
- (InVal.Generation == CurrentGeneration || InVal.IsInvariant) &&
InVal.MatchingId == MemInst.getMatchingId() &&
// We don't yet handle removing loads with ordering of any kind.
!MemInst.isVolatile() && MemInst.isUnordered() &&
// We can't replace an atomic load with one which isn't also atomic.
- InVal.IsAtomic >= MemInst.isAtomic()) {
+ InVal.IsAtomic >= MemInst.isAtomic() &&
+ (InVal.IsInvariant || MemInst.isInvariantLoad() ||
+ isSameMemGeneration(InVal.Generation, CurrentGeneration,
+ InVal.DefInst, Inst))) {
Value *Op = getOrCreateResult(InVal.DefInst, Inst->getType());
if (Op != nullptr) {
DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
<< " to: " << *InVal.DefInst << '\n');
if (!Inst->use_empty())
Inst->replaceAllUsesWith(Op);
+ removeMSSA(Inst);
Inst->eraseFromParent();
Changed = true;
++NumCSELoad;
@@ -679,11 +775,14 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// If we have an available version of this call, and if it is the right
// generation, replace this instruction.
std::pair<Instruction *, unsigned> InVal = AvailableCalls.lookup(Inst);
- if (InVal.first != nullptr && InVal.second == CurrentGeneration) {
+ if (InVal.first != nullptr &&
+ isSameMemGeneration(InVal.second, CurrentGeneration, InVal.first,
+ Inst)) {
DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst
<< " to: " << *InVal.first << '\n');
if (!Inst->use_empty())
Inst->replaceAllUsesWith(InVal.first);
+ removeMSSA(Inst);
Inst->eraseFromParent();
Changed = true;
++NumCSECall;
@@ -716,15 +815,22 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
if (InVal.DefInst &&
InVal.DefInst == getOrCreateResult(Inst, InVal.DefInst->getType()) &&
- InVal.Generation == CurrentGeneration &&
InVal.MatchingId == MemInst.getMatchingId() &&
// We don't yet handle removing stores with ordering of any kind.
- !MemInst.isVolatile() && MemInst.isUnordered()) {
+ !MemInst.isVolatile() && MemInst.isUnordered() &&
+ isSameMemGeneration(InVal.Generation, CurrentGeneration,
+ InVal.DefInst, Inst)) {
+ // It is okay to have a LastStore to a different pointer here if MemorySSA
+ // tells us that the load and store are from the same memory generation.
+ // In that case, LastStore should keep its present value since we're
+ // removing the current store.
assert((!LastStore ||
ParseMemoryInst(LastStore, TTI).getPointerOperand() ==
- MemInst.getPointerOperand()) &&
- "can't have an intervening store!");
+ MemInst.getPointerOperand() ||
+ MSSA) &&
+ "can't have an intervening store if not using MemorySSA!");
DEBUG(dbgs() << "EarlyCSE DSE (writeback): " << *Inst << '\n');
+ removeMSSA(Inst);
Inst->eraseFromParent();
Changed = true;
++NumDSE;
@@ -756,6 +862,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
if (LastStoreMemInst.isMatchingMemLoc(MemInst)) {
DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
<< " due to: " << *Inst << '\n');
+ removeMSSA(LastStore);
LastStore->eraseFromParent();
Changed = true;
++NumDSE;
@@ -847,13 +954,15 @@ bool EarlyCSE::run() {
}
PreservedAnalyses EarlyCSEPass::run(Function &F,
- AnalysisManager<Function> &AM) {
+ FunctionAnalysisManager &AM) {
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &TTI = AM.getResult<TargetIRAnalysis>(F);
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto *MSSA =
+ UseMemorySSA ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() : nullptr;
- EarlyCSE CSE(TLI, TTI, DT, AC);
+ EarlyCSE CSE(TLI, TTI, DT, AC, MSSA);
if (!CSE.run())
return PreservedAnalyses::all();
@@ -863,6 +972,8 @@ PreservedAnalyses EarlyCSEPass::run(Function &F,
PreservedAnalyses PA;
PA.preserve<DominatorTreeAnalysis>();
PA.preserve<GlobalsAA>();
+ if (UseMemorySSA)
+ PA.preserve<MemorySSAAnalysis>();
return PA;
}
@@ -874,12 +985,16 @@ namespace {
/// canonicalize things as it goes. It is intended to be fast and catch obvious
/// cases so that instcombine and other passes are more effective. It is
/// expected that a later pass of GVN will catch the interesting/hard cases.
-class EarlyCSELegacyPass : public FunctionPass {
+template<bool UseMemorySSA>
+class EarlyCSELegacyCommonPass : public FunctionPass {
public:
static char ID;
- EarlyCSELegacyPass() : FunctionPass(ID) {
- initializeEarlyCSELegacyPassPass(*PassRegistry::getPassRegistry());
+ EarlyCSELegacyCommonPass() : FunctionPass(ID) {
+ if (UseMemorySSA)
+ initializeEarlyCSEMemSSALegacyPassPass(*PassRegistry::getPassRegistry());
+ else
+ initializeEarlyCSELegacyPassPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override {
@@ -890,8 +1005,10 @@ public:
auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto *MSSA =
+ UseMemorySSA ? &getAnalysis<MemorySSAWrapperPass>().getMSSA() : nullptr;
- EarlyCSE CSE(TLI, TTI, DT, AC);
+ EarlyCSE CSE(TLI, TTI, DT, AC, MSSA);
return CSE.run();
}
@@ -901,15 +1018,20 @@ public:
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
+ if (UseMemorySSA) {
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+ }
AU.addPreserved<GlobalsAAWrapperPass>();
AU.setPreservesCFG();
}
};
}
-char EarlyCSELegacyPass::ID = 0;
+using EarlyCSELegacyPass = EarlyCSELegacyCommonPass</*UseMemorySSA=*/false>;
-FunctionPass *llvm::createEarlyCSEPass() { return new EarlyCSELegacyPass(); }
+template<>
+char EarlyCSELegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(EarlyCSELegacyPass, "early-cse", "Early CSE", false,
false)
@@ -918,3 +1040,26 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(EarlyCSELegacyPass, "early-cse", "Early CSE", false, false)
+
+using EarlyCSEMemSSALegacyPass =
+ EarlyCSELegacyCommonPass</*UseMemorySSA=*/true>;
+
+template<>
+char EarlyCSEMemSSALegacyPass::ID = 0;
+
+FunctionPass *llvm::createEarlyCSEPass(bool UseMemorySSA) {
+ if (UseMemorySSA)
+ return new EarlyCSEMemSSALegacyPass();
+ else
+ return new EarlyCSELegacyPass();
+}
+
+INITIALIZE_PASS_BEGIN(EarlyCSEMemSSALegacyPass, "early-cse-memssa",
+ "Early CSE w/ MemorySSA", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_END(EarlyCSEMemSSALegacyPass, "early-cse-memssa",
+ "Early CSE w/ MemorySSA", false, false)
diff --git a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp
index 7aa6dc6..545036d 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp
@@ -190,21 +190,14 @@ void Float2IntPass::walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots) {
seen(I, badRange());
break;
- case Instruction::UIToFP: {
- // Path terminated cleanly.
- unsigned BW = I->getOperand(0)->getType()->getPrimitiveSizeInBits();
- APInt Min = APInt::getMinValue(BW).zextOrSelf(MaxIntegerBW+1);
- APInt Max = APInt::getMaxValue(BW).zextOrSelf(MaxIntegerBW+1);
- seen(I, validateRange(ConstantRange(Min, Max)));
- continue;
- }
-
+ case Instruction::UIToFP:
case Instruction::SIToFP: {
- // Path terminated cleanly.
+ // Path terminated cleanly - use the type of the integer input to seed
+ // the analysis.
unsigned BW = I->getOperand(0)->getType()->getPrimitiveSizeInBits();
- APInt SMin = APInt::getSignedMinValue(BW).sextOrSelf(MaxIntegerBW+1);
- APInt SMax = APInt::getSignedMaxValue(BW).sextOrSelf(MaxIntegerBW+1);
- seen(I, validateRange(ConstantRange(SMin, SMax)));
+ auto Input = ConstantRange(BW, true);
+ auto CastOp = (Instruction::CastOps)I->getOpcode();
+ seen(I, validateRange(Input.castOp(CastOp, MaxIntegerBW+1)));
continue;
}
@@ -249,23 +242,12 @@ void Float2IntPass::walkForwards() {
llvm_unreachable("Should have been handled in walkForwards!");
case Instruction::FAdd:
- Op = [](ArrayRef<ConstantRange> Ops) {
- assert(Ops.size() == 2 && "FAdd is a binary operator!");
- return Ops[0].add(Ops[1]);
- };
- break;
-
case Instruction::FSub:
- Op = [](ArrayRef<ConstantRange> Ops) {
- assert(Ops.size() == 2 && "FSub is a binary operator!");
- return Ops[0].sub(Ops[1]);
- };
- break;
-
case Instruction::FMul:
- Op = [](ArrayRef<ConstantRange> Ops) {
- assert(Ops.size() == 2 && "FMul is a binary operator!");
- return Ops[0].multiply(Ops[1]);
+ Op = [I](ArrayRef<ConstantRange> Ops) {
+ assert(Ops.size() == 2 && "its a binary operator!");
+ auto BinOp = (Instruction::BinaryOps) I->getOpcode();
+ return Ops[0].binaryOp(BinOp, Ops[1]);
};
break;
@@ -275,9 +257,12 @@ void Float2IntPass::walkForwards() {
//
case Instruction::FPToUI:
case Instruction::FPToSI:
- Op = [](ArrayRef<ConstantRange> Ops) {
+ Op = [I](ArrayRef<ConstantRange> Ops) {
assert(Ops.size() == 1 && "FPTo[US]I is a unary operator!");
- return Ops[0];
+ // Note: We're ignoring the cast's output size here as that's what the
+ // caller expects.
+ auto CastOp = (Instruction::CastOps)I->getOpcode();
+ return Ops[0].castOp(CastOp, MaxIntegerBW+1);
};
break;
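Editor's note (illustrative sketch, not part of the patch): the Float2Int changes above replace hand-built APInt ranges with ConstantRange's generic castOp and binaryOp helpers. The small helper below, with a hypothetical name, restates the seed-range computation the merged UIToFP/SIToFP case now performs.

#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Full range of an InputBits-wide integer, pushed through the given cast and
// widened to MaxIntegerBW+1 bits, mirroring the walkBackwards() case above.
static ConstantRange seedRange(unsigned InputBits, Instruction::CastOps Cast,
                               unsigned MaxIntegerBW) {
  ConstantRange Input(InputBits, /*isFullSet=*/true);
  // UIToFP yields [0, 2^InputBits - 1]; SIToFP yields the signed equivalent.
  return Input.castOp(Cast, MaxIntegerBW + 1);
}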
diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
index a35a106..0137378 100644
--- a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -33,6 +33,7 @@
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/Analysis/PHITransAddr.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -338,16 +339,9 @@ GVN::Expression GVN::ValueTable::createExtractvalueExpr(ExtractValueInst *EI) {
//===----------------------------------------------------------------------===//
GVN::ValueTable::ValueTable() : nextValueNumber(1) {}
-GVN::ValueTable::ValueTable(const ValueTable &Arg)
- : valueNumbering(Arg.valueNumbering),
- expressionNumbering(Arg.expressionNumbering), AA(Arg.AA), MD(Arg.MD),
- DT(Arg.DT), nextValueNumber(Arg.nextValueNumber) {}
-GVN::ValueTable::ValueTable(ValueTable &&Arg)
- : valueNumbering(std::move(Arg.valueNumbering)),
- expressionNumbering(std::move(Arg.expressionNumbering)),
- AA(std::move(Arg.AA)), MD(std::move(Arg.MD)), DT(std::move(Arg.DT)),
- nextValueNumber(std::move(Arg.nextValueNumber)) {}
-GVN::ValueTable::~ValueTable() {}
+GVN::ValueTable::ValueTable(const ValueTable &) = default;
+GVN::ValueTable::ValueTable(ValueTable &&) = default;
+GVN::ValueTable::~ValueTable() = default;
/// add - Insert a value into the table with a specified value number.
void GVN::ValueTable::add(Value *V, uint32_t num) {
@@ -583,7 +577,7 @@ void GVN::ValueTable::verifyRemoved(const Value *V) const {
// GVN Pass
//===----------------------------------------------------------------------===//
-PreservedAnalyses GVN::run(Function &F, AnalysisManager<Function> &AM) {
+PreservedAnalyses GVN::run(Function &F, FunctionAnalysisManager &AM) {
// FIXME: The order of evaluation of these 'getResult' calls is very
// significant! Re-ordering these variables will cause GVN when run alone to
// be less effective! We should fix memdep and basic-aa to not exhibit this
@@ -593,7 +587,9 @@ PreservedAnalyses GVN::run(Function &F, AnalysisManager<Function> &AM) {
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &AA = AM.getResult<AAManager>(F);
auto &MemDep = AM.getResult<MemoryDependenceAnalysis>(F);
- bool Changed = runImpl(F, AC, DT, TLI, AA, &MemDep);
+ auto *LI = AM.getCachedResult<LoopAnalysis>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ bool Changed = runImpl(F, AC, DT, TLI, AA, &MemDep, LI, &ORE);
if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA;
@@ -725,8 +721,9 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
assert(CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL) &&
"precondition violation - materialization can't fail");
- if (auto *CExpr = dyn_cast<ConstantExpr>(StoredVal))
- StoredVal = ConstantFoldConstantExpression(CExpr, DL);
+ if (auto *C = dyn_cast<Constant>(StoredVal))
+ if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
+ StoredVal = FoldedStoredVal;
// If this is already the right type, just return it.
Type *StoredValTy = StoredVal->getType();
@@ -759,8 +756,9 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
StoredVal = IRB.CreateIntToPtr(StoredVal, LoadedTy);
}
- if (auto *CExpr = dyn_cast<ConstantExpr>(StoredVal))
- StoredVal = ConstantFoldConstantExpression(CExpr, DL);
+ if (auto *C = dyn_cast<ConstantExpr>(StoredVal))
+ if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
+ StoredVal = FoldedStoredVal;
return StoredVal;
}
@@ -804,8 +802,9 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
StoredVal = IRB.CreateBitCast(StoredVal, LoadedTy, "bitcast");
}
- if (auto *CExpr = dyn_cast<ConstantExpr>(StoredVal))
- StoredVal = ConstantFoldConstantExpression(CExpr, DL);
+ if (auto *C = dyn_cast<Constant>(StoredVal))
+ if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
+ StoredVal = FoldedStoredVal;
return StoredVal;
}
@@ -838,16 +837,6 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
// a must alias. AA must have gotten confused.
// FIXME: Study to see if/when this happens. One case is forwarding a memset
// to a load from the base of the memset.
-#if 0
- if (LoadOffset == StoreOffset) {
- dbgs() << "STORE/LOAD DEP WITH COMMON POINTER MISSED:\n"
- << "Base = " << *StoreBase << "\n"
- << "Store Ptr = " << *WritePtr << "\n"
- << "Store Offs = " << StoreOffset << "\n"
- << "Load Ptr = " << *LoadPtr << "\n";
- abort();
- }
-#endif
// If the load and store don't overlap at all, the store doesn't provide
// anything to the load. In this case, they really don't alias at all, AA
@@ -856,8 +845,8 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
if ((WriteSizeInBits & 7) | (LoadSize & 7))
return -1;
- uint64_t StoreSize = WriteSizeInBits >> 3; // Convert to bytes.
- LoadSize >>= 3;
+ uint64_t StoreSize = WriteSizeInBits / 8; // Convert to bytes.
+ LoadSize /= 8;
bool isAAFailure = false;
@@ -866,17 +855,8 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
else
isAAFailure = LoadOffset+int64_t(LoadSize) <= StoreOffset;
- if (isAAFailure) {
-#if 0
- dbgs() << "STORE LOAD DEP WITH COMMON BASE:\n"
- << "Base = " << *StoreBase << "\n"
- << "Store Ptr = " << *WritePtr << "\n"
- << "Store Offs = " << StoreOffset << "\n"
- << "Load Ptr = " << *LoadPtr << "\n";
- abort();
-#endif
+ if (isAAFailure)
return -1;
- }
// If the Load isn't completely contained within the stored bits, we don't
// have all the bits to feed it. We could do something crazy in the future
@@ -1229,6 +1209,38 @@ static bool isLifetimeStart(const Instruction *Inst) {
return false;
}
+/// \brief Try to locate the three instructions involved in a missed
+/// load-elimination case that is due to an intervening store.
+static void reportMayClobberedLoad(LoadInst *LI, MemDepResult DepInfo,
+ DominatorTree *DT,
+ OptimizationRemarkEmitter *ORE) {
+ using namespace ore;
+ User *OtherAccess = nullptr;
+
+ OptimizationRemarkMissed R(DEBUG_TYPE, "LoadClobbered", LI);
+ R << "load of type " << NV("Type", LI->getType()) << " not eliminated"
+ << setExtraArgs();
+
+ for (auto *U : LI->getPointerOperand()->users())
+ if (U != LI && (isa<LoadInst>(U) || isa<StoreInst>(U)) &&
+ DT->dominates(cast<Instruction>(U), LI)) {
+ // FIXME: for now give up if there are multiple memory accesses that
+ // dominate the load. We need further analysis to decide which one we
+ // are forwarding from.
+ if (OtherAccess)
+ OtherAccess = nullptr;
+ else
+ OtherAccess = U;
+ }
+
+ if (OtherAccess)
+ R << " in favor of " << NV("OtherAccess", OtherAccess);
+
+ R << " because it is clobbered by " << NV("ClobberedBy", DepInfo.getInst());
+
+ ORE->emit(R);
+}
+
bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
Value *Address, AvailableValue &Res) {
@@ -1293,6 +1305,10 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
Instruction *I = DepInfo.getInst();
dbgs() << " is clobbered by " << *I << '\n';
);
+
+ if (ORE->allowExtraAnalysis())
+ reportMayClobberedLoad(LI, DepInfo, DT, ORE);
+
return false;
}
assert(DepInfo.isDef() && "follows from above");
@@ -1556,6 +1572,13 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
// Assign value numbers to the new instructions.
for (Instruction *I : NewInsts) {
+ // Instructions that have been inserted in predecessor(s) to materialize
+ // the load address do not retain their original debug locations. Doing
+ // so could lead to confusing (but correct) source attributions.
+ // FIXME: How do we retain source locations without causing poor debugging
+ // behavior?
+ I->setDebugLoc(DebugLoc());
+
// FIXME: We really _ought_ to insert these value numbers into their
// parent's availability map. However, in doing so, we risk getting into
// ordering issues. If a block hasn't been processed yet, we would be
@@ -1585,8 +1608,11 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
if (auto *RangeMD = LI->getMetadata(LLVMContext::MD_range))
NewLoad->setMetadata(LLVMContext::MD_range, RangeMD);
- // Transfer DebugLoc.
- NewLoad->setDebugLoc(LI->getDebugLoc());
+ // We do not propagate the old load's debug location, because the new
+ // load now lives in a different BB, and we want to avoid a jumpy line
+ // table.
+ // FIXME: How do we retain source locations without causing poor debugging
+ // behavior?
// Add the newly created load.
ValuesPerBlock.push_back(AvailableValueInBlock::get(UnavailablePred,
@@ -1605,10 +1631,21 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
if (V->getType()->getScalarType()->isPointerTy())
MD->invalidateCachedPointerInfo(V);
markInstructionForDeletion(LI);
+ ORE->emit(OptimizationRemark(DEBUG_TYPE, "LoadPRE", LI)
+ << "load eliminated by PRE");
++NumPRELoad;
return true;
}
+static void reportLoadElim(LoadInst *LI, Value *AvailableValue,
+ OptimizationRemarkEmitter *ORE) {
+ using namespace ore;
+ ORE->emit(OptimizationRemark(DEBUG_TYPE, "LoadElim", LI)
+ << "load of type " << NV("Type", LI->getType()) << " eliminated"
+ << setExtraArgs() << " in favor of "
+ << NV("InfavorOfValue", AvailableValue));
+}
+
/// Attempt to eliminate a load whose dependencies are
/// non-local by performing PHI construction.
bool GVN::processNonLocalLoad(LoadInst *LI) {
@@ -1673,12 +1710,16 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
if (isa<PHINode>(V))
V->takeName(LI);
if (Instruction *I = dyn_cast<Instruction>(V))
- if (LI->getDebugLoc())
+ // If instruction I has debug info, then we should not update it.
+ // Also, if I has a null DebugLoc, then it is still potentially incorrect
+ // to propagate LI's DebugLoc because LI may not post-dominate I.
+ if (LI->getDebugLoc() && ValuesPerBlock.size() != 1)
I->setDebugLoc(LI->getDebugLoc());
if (V->getType()->getScalarType()->isPointerTy())
MD->invalidateCachedPointerInfo(V);
markInstructionForDeletion(LI);
++NumGVNLoad;
+ reportLoadElim(LI, V, ORE);
return true;
}
@@ -1754,7 +1795,12 @@ static void patchReplacementInstruction(Instruction *I, Value *Repl) {
// Patch the replacement so that it is not more restrictive than the value
// being replaced.
- ReplInst->andIRFlags(I);
+ // Note that if 'I' is a load being replaced by some operation,
+ // for example, by an arithmetic operation, then andIRFlags()
+ // would just erase all math flags from the original arithmetic
+ // operation, which is clearly not wanted and not needed.
+ if (!isa<LoadInst>(I))
+ ReplInst->andIRFlags(I);
// FIXME: If both the original and replacement value are part of the
// same control-flow region (meaning that the execution of one
@@ -1820,6 +1866,7 @@ bool GVN::processLoad(LoadInst *L) {
patchAndReplaceAllUsesWith(L, AvailableValue);
markInstructionForDeletion(L);
++NumGVNLoad;
+ reportLoadElim(L, AvailableValue, ORE);
// Tell MDA to rexamine the reused pointer since we might have more
// information after forwarding it.
if (MD && AvailableValue->getType()->getScalarType()->isPointerTy())
@@ -2197,7 +2244,8 @@ bool GVN::processInstruction(Instruction *I) {
/// runOnFunction - This is the main transformation entry point for a function.
bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT,
const TargetLibraryInfo &RunTLI, AAResults &RunAA,
- MemoryDependenceResults *RunMD) {
+ MemoryDependenceResults *RunMD, LoopInfo *LI,
+ OptimizationRemarkEmitter *RunORE) {
AC = &RunAC;
DT = &RunDT;
VN.setDomTree(DT);
@@ -2205,6 +2253,7 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT,
VN.setAliasAnalysis(&RunAA);
MD = RunMD;
VN.setMemDep(MD);
+ ORE = RunORE;
bool Changed = false;
bool ShouldContinue = true;
@@ -2214,9 +2263,9 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT,
for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) {
BasicBlock *BB = &*FI++;
- bool removedBlock =
- MergeBlockIntoPredecessor(BB, DT, /* LoopInfo */ nullptr, MD);
- if (removedBlock) ++NumGVNBlocks;
+ bool removedBlock = MergeBlockIntoPredecessor(BB, DT, LI, MD);
+ if (removedBlock)
+ ++NumGVNBlocks;
Changed |= removedBlock;
}
@@ -2711,13 +2760,17 @@ public:
if (skipFunction(F))
return false;
+ auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
+
return Impl.runImpl(
F, getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
getAnalysis<AAResultsWrapperPass>().getAAResults(),
NoLoads ? nullptr
- : &getAnalysis<MemoryDependenceWrapperPass>().getMemDep());
+ : &getAnalysis<MemoryDependenceWrapperPass>().getMemDep(),
+ LIWP ? &LIWP->getLoopInfo() : nullptr,
+ &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE());
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -2730,6 +2783,7 @@ public:
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
}
private:
@@ -2751,4 +2805,5 @@ INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(GVNLegacyPass, "gvn", "Global Value Numbering", false, false)
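Editor's note (illustrative sketch, not part of the patch): the GVN hunks above thread an OptimizationRemarkEmitter through runImpl so that eliminated and clobbered loads are reported. The fragment below restates the emission pattern of reportLoadElim(); only the free-standing wrapper is invented, and the remarks surface with, e.g., opt -pass-remarks=gvn or clang -Rpass=gvn.

#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Emits the same kind of remark the patch attaches to eliminated loads.
static void noteEliminatedLoad(OptimizationRemarkEmitter &ORE, LoadInst *LI,
                               Value *Available) {
  ORE.emit(OptimizationRemark("gvn", "LoadElim", LI)
           << "load of type " << ore::NV("Type", LI->getType())
           << " eliminated in favor of " << ore::NV("Value", Available));
}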
diff --git a/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp
index cce1db3..f8e1d2e 100644
--- a/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp
@@ -9,20 +9,23 @@
//
// This pass hoists expressions from branches to a common dominator. It uses
// GVN (global value numbering) to discover expressions computing the same
-// values. The primary goal is to reduce the code size, and in some
-// cases reduce critical path (by exposing more ILP).
+// values. The primary goals of code-hoisting are:
+// 1. To reduce the code size.
+// 2. In some cases reduce critical path (by exposing more ILP).
+//
// Hoisting may affect the performance in some cases. To mitigate that, hoisting
// is disabled in the following cases.
// 1. Scalars across calls.
// 2. geps when corresponding load/store cannot be hoisted.
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/MemorySSA.h"
using namespace llvm;
@@ -47,15 +50,25 @@ static cl::opt<int> MaxNumberOfBBSInPath(
cl::desc("Max number of basic blocks on the path between "
"hoisting locations (default = 4, unlimited = -1)"));
+static cl::opt<int> MaxDepthInBB(
+ "gvn-hoist-max-depth", cl::Hidden, cl::init(100),
+ cl::desc("Hoist instructions from the beginning of the BB up to the "
+ "maximum specified depth (default = 100, unlimited = -1)"));
+
+static cl::opt<int>
+ MaxChainLength("gvn-hoist-max-chain-length", cl::Hidden, cl::init(10),
+ cl::desc("Maximum length of dependent chains to hoist "
+ "(default = 10, unlimited = -1)"));
+
namespace {
// Provides a sorting function based on the execution order of two instructions.
struct SortByDFSIn {
private:
- DenseMap<const BasicBlock *, unsigned> &DFSNumber;
+ DenseMap<const Value *, unsigned> &DFSNumber;
public:
- SortByDFSIn(DenseMap<const BasicBlock *, unsigned> &D) : DFSNumber(D) {}
+ SortByDFSIn(DenseMap<const Value *, unsigned> &D) : DFSNumber(D) {}
// Returns true when A executes before B.
bool operator()(const Instruction *A, const Instruction *B) const {
@@ -68,16 +81,16 @@ public:
const BasicBlock *BA = A->getParent();
const BasicBlock *BB = B->getParent();
- unsigned NA = DFSNumber[BA];
- unsigned NB = DFSNumber[BB];
- if (NA < NB)
- return true;
- if (NA == NB) {
- // Sort them in the order they occur in the same basic block.
- BasicBlock::const_iterator AI(A), BI(B);
- return std::distance(AI, BI) < 0;
+ unsigned ADFS, BDFS;
+ if (BA == BB) {
+ ADFS = DFSNumber.lookup(A);
+ BDFS = DFSNumber.lookup(B);
+ } else {
+ ADFS = DFSNumber.lookup(BA);
+ BDFS = DFSNumber.lookup(BB);
}
- return false;
+ assert(ADFS && BDFS);
+ return ADFS < BDFS;
}
};
@@ -172,27 +185,77 @@ typedef DenseMap<const BasicBlock *, bool> BBSideEffectsSet;
typedef SmallVector<Instruction *, 4> SmallVecInsn;
typedef SmallVectorImpl<Instruction *> SmallVecImplInsn;
+static void combineKnownMetadata(Instruction *ReplInst, Instruction *I) {
+ static const unsigned KnownIDs[] = {
+ LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias, LLVMContext::MD_range,
+ LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load,
+ LLVMContext::MD_invariant_group};
+ combineMetadata(ReplInst, I, KnownIDs);
+}
+
// This pass hoists common computations across branches sharing common
// dominator. The primary goal is to reduce the code size, and in some
// cases reduce critical path (by exposing more ILP).
class GVNHoist {
public:
+ GVNHoist(DominatorTree *DT, AliasAnalysis *AA, MemoryDependenceResults *MD,
+ MemorySSA *MSSA)
+ : DT(DT), AA(AA), MD(MD), MSSA(MSSA),
+ HoistingGeps(false),
+ HoistedCtr(0)
+ { }
+
+ bool run(Function &F) {
+ VN.setDomTree(DT);
+ VN.setAliasAnalysis(AA);
+ VN.setMemDep(MD);
+ bool Res = false;
+ // Perform DFS Numbering of instructions.
+ unsigned BBI = 0;
+ for (const BasicBlock *BB : depth_first(&F.getEntryBlock())) {
+ DFSNumber[BB] = ++BBI;
+ unsigned I = 0;
+ for (auto &Inst : *BB)
+ DFSNumber[&Inst] = ++I;
+ }
+
+ int ChainLength = 0;
+
+ // FIXME: use lazy evaluation of VN to avoid the fix-point computation.
+ while (1) {
+ if (MaxChainLength != -1 && ++ChainLength >= MaxChainLength)
+ return Res;
+
+ auto HoistStat = hoistExpressions(F);
+ if (HoistStat.first + HoistStat.second == 0)
+ return Res;
+
+ if (HoistStat.second > 0)
+ // To address a limitation of the current GVN, we need to rerun the
+ // hoisting after we hoisted loads or stores in order to be able to
+ // hoist all scalars dependent on the hoisted ld/st.
+ VN.clear();
+
+ Res = true;
+ }
+
+ return Res;
+ }
+
+private:
GVN::ValueTable VN;
DominatorTree *DT;
AliasAnalysis *AA;
MemoryDependenceResults *MD;
- const bool OptForMinSize;
- DenseMap<const BasicBlock *, unsigned> DFSNumber;
- BBSideEffectsSet BBSideEffects;
MemorySSA *MSSA;
+ const bool HoistingGeps;
+ DenseMap<const Value *, unsigned> DFSNumber;
+ BBSideEffectsSet BBSideEffects;
int HoistedCtr;
enum InsKind { Unknown, Scalar, Load, Store };
- GVNHoist(DominatorTree *Dt, AliasAnalysis *Aa, MemoryDependenceResults *Md,
- bool OptForMinSize)
- : DT(Dt), AA(Aa), MD(Md), OptForMinSize(OptForMinSize), HoistedCtr(0) {}
-
// Return true when there are exception handling in BB.
bool hasEH(const BasicBlock *BB) {
auto It = BBSideEffects.find(BB);
@@ -213,24 +276,32 @@ public:
return false;
}
- // Return true when all paths from A to the end of the function pass through
- // either B or C.
- bool hoistingFromAllPaths(const BasicBlock *A, const BasicBlock *B,
- const BasicBlock *C) {
- // We fully copy the WL in order to be able to remove items from it.
- SmallPtrSet<const BasicBlock *, 2> WL;
- WL.insert(B);
- WL.insert(C);
-
- for (auto It = df_begin(A), E = df_end(A); It != E;) {
- // There exists a path from A to the exit of the function if we are still
- // iterating in DF traversal and we removed all instructions from the work
- // list.
- if (WL.empty())
+ // Return true when a successor of BB dominates A.
+ bool successorDominate(const BasicBlock *BB, const BasicBlock *A) {
+ for (const BasicBlock *Succ : BB->getTerminator()->successors())
+ if (DT->dominates(Succ, A))
+ return true;
+
+ return false;
+ }
+
+ // Return true when all paths from HoistBB to the end of the function pass
+ // through one of the blocks in WL.
+ bool hoistingFromAllPaths(const BasicBlock *HoistBB,
+ SmallPtrSetImpl<const BasicBlock *> &WL) {
+
+ // Copy WL as the loop will remove elements from it.
+ SmallPtrSet<const BasicBlock *, 2> WorkList(WL.begin(), WL.end());
+
+ for (auto It = df_begin(HoistBB), E = df_end(HoistBB); It != E;) {
+ // There exists a path from HoistBB to the exit of the function if we are
+ // still iterating in DF traversal and we removed all blocks from the
+ // work list.
+ if (WorkList.empty())
return false;
const BasicBlock *BB = *It;
- if (WL.erase(BB)) {
+ if (WorkList.erase(BB)) {
// Stop DFS traversal when BB is in the work list.
It.skipChildren();
continue;
@@ -240,6 +311,11 @@ public:
if (!isGuaranteedToTransferExecutionToSuccessor(BB->getTerminator()))
return false;
+ // When reaching the back-edge of a loop, there may be a path through the
+ // loop that does not pass through any of the blocks in WL before exiting the loop.
+ if (successorDominate(BB, HoistBB))
+ return false;
+
// Increment DFS traversal when not skipping children.
++It;
}
@@ -248,40 +324,43 @@ public:
}
/* Return true when I1 appears before I2 in the instructions of BB. */
- bool firstInBB(BasicBlock *BB, const Instruction *I1, const Instruction *I2) {
- for (Instruction &I : *BB) {
- if (&I == I1)
- return true;
- if (&I == I2)
- return false;
- }
-
- llvm_unreachable("I1 and I2 not found in BB");
+ bool firstInBB(const Instruction *I1, const Instruction *I2) {
+ assert(I1->getParent() == I2->getParent());
+ unsigned I1DFS = DFSNumber.lookup(I1);
+ unsigned I2DFS = DFSNumber.lookup(I2);
+ assert(I1DFS && I2DFS);
+ return I1DFS < I2DFS;
}
- // Return true when there are users of Def in BB.
- bool hasMemoryUseOnPath(MemoryAccess *Def, const BasicBlock *BB,
- const Instruction *OldPt) {
- const BasicBlock *DefBB = Def->getBlock();
- const BasicBlock *OldBB = OldPt->getParent();
- for (User *U : Def->users())
- if (auto *MU = dyn_cast<MemoryUse>(U)) {
- BasicBlock *UBB = MU->getBlock();
- // Only analyze uses in BB.
- if (BB != UBB)
- continue;
+ // Return true when there are memory uses of Def in BB.
+ bool hasMemoryUse(const Instruction *NewPt, MemoryDef *Def,
+ const BasicBlock *BB) {
+ const MemorySSA::AccessList *Acc = MSSA->getBlockAccesses(BB);
+ if (!Acc)
+ return false;
- // A use in the same block as the Def is on the path.
- if (UBB == DefBB) {
- assert(MSSA->locallyDominates(Def, MU) && "def not dominating use");
- return true;
- }
+ Instruction *OldPt = Def->getMemoryInst();
+ const BasicBlock *OldBB = OldPt->getParent();
+ const BasicBlock *NewBB = NewPt->getParent();
+ bool ReachedNewPt = false;
- if (UBB != OldBB)
- return true;
+ for (const MemoryAccess &MA : *Acc)
+ if (const MemoryUse *MU = dyn_cast<MemoryUse>(&MA)) {
+ Instruction *Insn = MU->getMemoryInst();
+
+ // Do not check whether MU aliases Def when MU occurs after OldPt.
+ if (BB == OldBB && firstInBB(OldPt, Insn))
+ break;
- // It is only harmful to hoist when the use is before OldPt.
- if (firstInBB(UBB, MU->getMemoryInst(), OldPt))
+ // Do not check whether MU aliases Def when MU occurs before NewPt.
+ if (BB == NewBB) {
+ if (!ReachedNewPt) {
+ if (firstInBB(Insn, NewPt))
+ continue;
+ ReachedNewPt = true;
+ }
+ }
+ if (defClobbersUseOrDef(Def, MU, *AA))
return true;
}
@@ -289,17 +368,18 @@ public:
}
// Return true when there are exception handling or loads of memory Def
- // between OldPt and NewPt.
+ // between Def and NewPt. This function is only called for stores: Def is
+ // the MemoryDef of the store to be hoisted.
// Decrement by 1 NBBsOnAllPaths for each block between HoistPt and BB, and
// return true when the counter NBBsOnAllPaths reaches 0, except when it is
// initialized to -1 which is unlimited.
- bool hasEHOrLoadsOnPath(const Instruction *NewPt, const Instruction *OldPt,
- MemoryAccess *Def, int &NBBsOnAllPaths) {
+ bool hasEHOrLoadsOnPath(const Instruction *NewPt, MemoryDef *Def,
+ int &NBBsOnAllPaths) {
const BasicBlock *NewBB = NewPt->getParent();
- const BasicBlock *OldBB = OldPt->getParent();
+ const BasicBlock *OldBB = Def->getBlock();
assert(DT->dominates(NewBB, OldBB) && "invalid path");
- assert(DT->dominates(Def->getBlock(), NewBB) &&
+ assert(DT->dominates(Def->getDefiningAccess()->getBlock(), NewBB) &&
"def does not dominate new hoisting point");
// Walk all basic blocks reachable in depth-first iteration on the inverse
@@ -313,16 +393,16 @@ public:
continue;
}
+ // Stop walk once the limit is reached.
+ if (NBBsOnAllPaths == 0)
+ return true;
+
// Impossible to hoist with exceptions on the path.
if (hasEH(*I))
return true;
// Check that we do not move a store past loads.
- if (hasMemoryUseOnPath(Def, *I, OldPt))
- return true;
-
- // Stop walk once the limit is reached.
- if (NBBsOnAllPaths == 0)
+ if (hasMemoryUse(NewPt, Def, *I))
return true;
// -1 is unlimited number of blocks on all paths.
@@ -355,14 +435,14 @@ public:
continue;
}
- // Impossible to hoist with exceptions on the path.
- if (hasEH(*I))
- return true;
-
// Stop walk once the limit is reached.
if (NBBsOnAllPaths == 0)
return true;
+ // Impossible to hoist with exceptions on the path.
+ if (hasEH(*I))
+ return true;
+
// -1 is unlimited number of blocks on all paths.
if (NBBsOnAllPaths != -1)
--NBBsOnAllPaths;
@@ -395,13 +475,13 @@ public:
if (NewBB == DBB && !MSSA->isLiveOnEntryDef(D))
if (auto *UD = dyn_cast<MemoryUseOrDef>(D))
- if (firstInBB(DBB, NewPt, UD->getMemoryInst()))
+ if (firstInBB(NewPt, UD->getMemoryInst()))
// Cannot move the load or store to NewPt above its definition in D.
return false;
// Check for unsafe hoistings due to side effects.
if (K == InsKind::Store) {
- if (hasEHOrLoadsOnPath(NewPt, OldPt, D, NBBsOnAllPaths))
+ if (hasEHOrLoadsOnPath(NewPt, dyn_cast<MemoryDef>(U), NBBsOnAllPaths))
return false;
} else if (hasEHOnPath(NewBB, OldBB, NBBsOnAllPaths))
return false;
@@ -417,23 +497,19 @@ public:
return true;
}
- // Return true when it is safe to hoist scalar instructions from BB1 and BB2
- // to HoistBB.
- bool safeToHoistScalar(const BasicBlock *HoistBB, const BasicBlock *BB1,
- const BasicBlock *BB2, int &NBBsOnAllPaths) {
- // Check that the hoisted expression is needed on all paths. When HoistBB
- // already contains an instruction to be hoisted, the expression is needed
- // on all paths. Enable scalar hoisting at -Oz as it is safe to hoist
- // scalars to a place where they are partially needed.
- if (!OptForMinSize && BB1 != HoistBB &&
- !hoistingFromAllPaths(HoistBB, BB1, BB2))
+ // Return true when it is safe to hoist scalar instructions from all blocks in
+ // WL to HoistBB.
+ bool safeToHoistScalar(const BasicBlock *HoistBB,
+ SmallPtrSetImpl<const BasicBlock *> &WL,
+ int &NBBsOnAllPaths) {
+ // Check that the hoisted expression is needed on all paths.
+ if (!hoistingFromAllPaths(HoistBB, WL))
return false;
- if (hasEHOnPath(HoistBB, BB1, NBBsOnAllPaths) ||
- hasEHOnPath(HoistBB, BB2, NBBsOnAllPaths))
- return false;
+ for (const BasicBlock *BB : WL)
+ if (hasEHOnPath(HoistBB, BB, NBBsOnAllPaths))
+ return false;
- // Safe to hoist scalars from BB1 and BB2 to HoistBB.
return true;
}
@@ -454,7 +530,7 @@ public:
std::sort(InstructionsToHoist.begin(), InstructionsToHoist.end(), Pred);
}
- int NBBsOnAllPaths = MaxNumberOfBBSInPath;
+ int NumBBsOnAllPaths = MaxNumberOfBBSInPath;
SmallVecImplInsn::iterator II = InstructionsToHoist.begin();
SmallVecImplInsn::iterator Start = II;
@@ -462,7 +538,7 @@ public:
BasicBlock *HoistBB = HoistPt->getParent();
MemoryUseOrDef *UD;
if (K != InsKind::Scalar)
- UD = cast<MemoryUseOrDef>(MSSA->getMemoryAccess(HoistPt));
+ UD = MSSA->getMemoryAccess(HoistPt);
for (++II; II != InstructionsToHoist.end(); ++II) {
Instruction *Insn = *II;
@@ -470,10 +546,12 @@ public:
BasicBlock *NewHoistBB;
Instruction *NewHoistPt;
- if (BB == HoistBB) {
+ if (BB == HoistBB) { // Both are in the same Basic Block.
NewHoistBB = HoistBB;
- NewHoistPt = firstInBB(BB, Insn, HoistPt) ? Insn : HoistPt;
+ NewHoistPt = firstInBB(Insn, HoistPt) ? Insn : HoistPt;
} else {
+ // If the hoisting point contains one of the instructions,
+ // then hoist there, otherwise hoist before the terminator.
NewHoistBB = DT->findNearestCommonDominator(HoistBB, BB);
if (NewHoistBB == BB)
NewHoistPt = Insn;
@@ -483,8 +561,12 @@ public:
NewHoistPt = NewHoistBB->getTerminator();
}
+ SmallPtrSet<const BasicBlock *, 2> WL;
+ WL.insert(HoistBB);
+ WL.insert(BB);
+
if (K == InsKind::Scalar) {
- if (safeToHoistScalar(NewHoistBB, HoistBB, BB, NBBsOnAllPaths)) {
+ if (safeToHoistScalar(NewHoistBB, WL, NumBBsOnAllPaths)) {
// Extend HoistPt to NewHoistPt.
HoistPt = NewHoistPt;
HoistBB = NewHoistBB;
@@ -498,13 +580,12 @@ public:
// loading from the same address: for instance there may be a branch on
// which the address of the load may not be initialized.
if ((HoistBB == NewHoistBB || BB == NewHoistBB ||
- hoistingFromAllPaths(NewHoistBB, HoistBB, BB)) &&
+ hoistingFromAllPaths(NewHoistBB, WL)) &&
// Also check that it is safe to move the load or store from HoistPt
// to NewHoistPt, and from Insn to NewHoistPt.
- safeToHoistLdSt(NewHoistPt, HoistPt, UD, K, NBBsOnAllPaths) &&
- safeToHoistLdSt(NewHoistPt, Insn,
- cast<MemoryUseOrDef>(MSSA->getMemoryAccess(Insn)),
- K, NBBsOnAllPaths)) {
+ safeToHoistLdSt(NewHoistPt, HoistPt, UD, K, NumBBsOnAllPaths) &&
+ safeToHoistLdSt(NewHoistPt, Insn, MSSA->getMemoryAccess(Insn),
+ K, NumBBsOnAllPaths)) {
// Extend HoistPt to NewHoistPt.
HoistPt = NewHoistPt;
HoistBB = NewHoistBB;
@@ -520,10 +601,10 @@ public:
// Start over from BB.
Start = II;
if (K != InsKind::Scalar)
- UD = cast<MemoryUseOrDef>(MSSA->getMemoryAccess(*Start));
+ UD = MSSA->getMemoryAccess(*Start);
HoistPt = Insn;
HoistBB = BB;
- NBBsOnAllPaths = MaxNumberOfBBSInPath;
+ NumBBsOnAllPaths = MaxNumberOfBBSInPath;
}
// Save the last partition.
@@ -567,40 +648,88 @@ public:
return true;
}
- Instruction *firstOfTwo(Instruction *I, Instruction *J) const {
- for (Instruction &I1 : *I->getParent())
- if (&I1 == I || &I1 == J)
- return &I1;
- llvm_unreachable("Both I and J must be from same BB");
+ // Same as allOperandsAvailable with recursive check for GEP operands.
+ bool allGepOperandsAvailable(const Instruction *I,
+ const BasicBlock *HoistPt) const {
+ for (const Use &Op : I->operands())
+ if (const auto *Inst = dyn_cast<Instruction>(&Op))
+ if (!DT->dominates(Inst->getParent(), HoistPt)) {
+ if (const GetElementPtrInst *GepOp =
+ dyn_cast<GetElementPtrInst>(Inst)) {
+ if (!allGepOperandsAvailable(GepOp, HoistPt))
+ return false;
+ // Gep is available if all operands of GepOp are available.
+ } else {
+ // Gep is not available if it has operands other than GEPs that are
+ // defined in blocks not dominating HoistPt.
+ return false;
+ }
+ }
+ return true;
}
- // Replace the use of From with To in Insn.
- void replaceUseWith(Instruction *Insn, Value *From, Value *To) const {
- for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
- UI != UE;) {
- Use &U = *UI++;
- if (U.getUser() == Insn) {
- U.set(To);
- return;
+ // Make all operands of the GEP available.
+ void makeGepsAvailable(Instruction *Repl, BasicBlock *HoistPt,
+ const SmallVecInsn &InstructionsToHoist,
+ Instruction *Gep) const {
+ assert(allGepOperandsAvailable(Gep, HoistPt) &&
+ "GEP operands not available");
+
+ Instruction *ClonedGep = Gep->clone();
+ for (unsigned i = 0, e = Gep->getNumOperands(); i != e; ++i)
+ if (Instruction *Op = dyn_cast<Instruction>(Gep->getOperand(i))) {
+
+ // Check whether the operand is already available.
+ if (DT->dominates(Op->getParent(), HoistPt))
+ continue;
+
+ // As a GEP can refer to other GEPs, recursively make all the operands
+ // of this GEP available at HoistPt.
+ if (GetElementPtrInst *GepOp = dyn_cast<GetElementPtrInst>(Op))
+ makeGepsAvailable(ClonedGep, HoistPt, InstructionsToHoist, GepOp);
}
+
+ // Copy Gep and replace its uses in Repl with ClonedGep.
+ ClonedGep->insertBefore(HoistPt->getTerminator());
+
+ // Conservatively discard any optimization hints; they may differ on the
+ // other paths.
+ ClonedGep->dropUnknownNonDebugMetadata();
+
+ // If we have optimization hints which agree with each other along different
+ // paths, preserve them.
+ for (const Instruction *OtherInst : InstructionsToHoist) {
+ const GetElementPtrInst *OtherGep;
+ if (auto *OtherLd = dyn_cast<LoadInst>(OtherInst))
+ OtherGep = cast<GetElementPtrInst>(OtherLd->getPointerOperand());
+ else
+ OtherGep = cast<GetElementPtrInst>(
+ cast<StoreInst>(OtherInst)->getPointerOperand());
+ ClonedGep->andIRFlags(OtherGep);
}
- llvm_unreachable("should replace exactly once");
+
+ // Replace uses of Gep with ClonedGep in Repl.
+ Repl->replaceUsesOfWith(Gep, ClonedGep);
}
- bool makeOperandsAvailable(Instruction *Repl, BasicBlock *HoistPt) const {
+ // When Repl is a load or a store, we make its GEPs
+ // available: GEPs are not hoisted by default, to avoid hoisting the address
+ // computation without the associated load or store.
+ bool makeGepOperandsAvailable(Instruction *Repl, BasicBlock *HoistPt,
+ const SmallVecInsn &InstructionsToHoist) const {
// Check whether the GEP of a ld/st can be synthesized at HoistPt.
GetElementPtrInst *Gep = nullptr;
Instruction *Val = nullptr;
- if (auto *Ld = dyn_cast<LoadInst>(Repl))
+ if (auto *Ld = dyn_cast<LoadInst>(Repl)) {
Gep = dyn_cast<GetElementPtrInst>(Ld->getPointerOperand());
- if (auto *St = dyn_cast<StoreInst>(Repl)) {
+ } else if (auto *St = dyn_cast<StoreInst>(Repl)) {
Gep = dyn_cast<GetElementPtrInst>(St->getPointerOperand());
Val = dyn_cast<Instruction>(St->getValueOperand());
// Check that the stored value is available.
if (Val) {
if (isa<GetElementPtrInst>(Val)) {
// Check whether we can compute the GEP at HoistPt.
- if (!allOperandsAvailable(Val, HoistPt))
+ if (!allGepOperandsAvailable(Val, HoistPt))
return false;
} else if (!DT->dominates(Val->getParent(), HoistPt))
return false;
@@ -608,20 +737,13 @@ public:
}
// Check whether we can compute the Gep at HoistPt.
- if (!Gep || !allOperandsAvailable(Gep, HoistPt))
+ if (!Gep || !allGepOperandsAvailable(Gep, HoistPt))
return false;
- // Copy the gep before moving the ld/st.
- Instruction *ClonedGep = Gep->clone();
- ClonedGep->insertBefore(HoistPt->getTerminator());
- replaceUseWith(Repl, Gep, ClonedGep);
+ makeGepsAvailable(Repl, HoistPt, InstructionsToHoist, Gep);
- // Also copy Val when it is a GEP.
- if (Val && isa<GetElementPtrInst>(Val)) {
- Instruction *ClonedVal = Val->clone();
- ClonedVal->insertBefore(HoistPt->getTerminator());
- replaceUseWith(Repl, Val, ClonedVal);
- }
+ if (Val && isa<GetElementPtrInst>(Val))
+ makeGepsAvailable(Repl, HoistPt, InstructionsToHoist, Val);
return true;
}
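The helpers above rely on allGepOperandsAvailable, which is not shown in this hunk. The following is a minimal sketch, under assumptions, of what such a check looks like; the helper name and exact signature here are illustrative, not the patch's code. The idea: every instruction operand of the GEP must dominate the hoist point, recursing into operand GEPs because those are themselves cloned at the hoist point.

  #include "llvm/IR/Dominators.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Illustrative only: could this GEP be recomputed at HoistPt?
  static bool gepOperandsDominate(const GetElementPtrInst *Gep,
                                  const BasicBlock *HoistPt,
                                  const DominatorTree &DT) {
    for (const Value *Op : Gep->operands()) {
      const auto *I = dyn_cast<Instruction>(Op);
      if (!I)
        continue; // Constants and arguments are always available.
      if (const auto *GepOp = dyn_cast<GetElementPtrInst>(I)) {
        // A GEP operand is itself cloned at HoistPt, so only its own
        // operands need to dominate the hoist point.
        if (!gepOperandsDominate(GepOp, HoistPt, DT))
          return false;
        continue;
      }
      if (!DT.dominates(I->getParent(), HoistPt))
        return false;
    }
    return true;
  }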
@@ -635,17 +757,21 @@ public:
const SmallVecInsn &InstructionsToHoist = HP.second;
Instruction *Repl = nullptr;
for (Instruction *I : InstructionsToHoist)
- if (I->getParent() == HoistPt) {
+ if (I->getParent() == HoistPt)
// If there are two instructions in HoistPt to be hoisted in place:
// update Repl to be the first one, such that we can rename the uses
// of the second based on the first.
- Repl = !Repl ? I : firstOfTwo(Repl, I);
- }
+ if (!Repl || firstInBB(I, Repl))
+ Repl = I;
+ // Keep track of whether we moved the instruction so we know whether we
+ // should move the MemoryAccess.
+ bool MoveAccess = true;
if (Repl) {
// Repl is already in HoistPt: it remains in place.
assert(allOperandsAvailable(Repl, HoistPt) &&
"instruction depends on operands that are not available");
+ MoveAccess = false;
} else {
// When we do not find Repl in HoistPt, select the first in the list
// and move it to HoistPt.
@@ -654,10 +780,39 @@ public:
// We can move Repl in HoistPt only when all operands are available.
// The order in which hoistings are done may influence the availability
// of operands.
- if (!allOperandsAvailable(Repl, HoistPt) &&
- !makeOperandsAvailable(Repl, HoistPt))
- continue;
- Repl->moveBefore(HoistPt->getTerminator());
+ if (!allOperandsAvailable(Repl, HoistPt)) {
+
+ // When HoistingGeps there is nothing more we can do to make the
+ // operands available: just continue.
+ if (HoistingGeps)
+ continue;
+
+ // When not HoistingGeps we need to copy the GEPs.
+ if (!makeGepOperandsAvailable(Repl, HoistPt, InstructionsToHoist))
+ continue;
+ }
+
+ // Move the instruction at the end of HoistPt.
+ Instruction *Last = HoistPt->getTerminator();
+ MD->removeInstruction(Repl);
+ Repl->moveBefore(Last);
+
+ DFSNumber[Repl] = DFSNumber[Last]++;
+ }
+
+ MemoryAccess *NewMemAcc = MSSA->getMemoryAccess(Repl);
+
+ if (MoveAccess) {
+ if (MemoryUseOrDef *OldMemAcc =
+ dyn_cast_or_null<MemoryUseOrDef>(NewMemAcc)) {
+ // The definition of this ld/st will not change: ld/st hoisting is
+ // legal when the ld/st is not moved past its current definition.
+ MemoryAccess *Def = OldMemAcc->getDefiningAccess();
+ NewMemAcc =
+ MSSA->createMemoryAccessInBB(Repl, Def, HoistPt, MemorySSA::End);
+ OldMemAcc->replaceAllUsesWith(NewMemAcc);
+ MSSA->removeMemoryAccess(OldMemAcc);
+ }
}
if (isa<LoadInst>(Repl))
@@ -673,15 +828,54 @@ public:
for (Instruction *I : InstructionsToHoist)
if (I != Repl) {
++NR;
- if (isa<LoadInst>(Repl))
+ if (auto *ReplacementLoad = dyn_cast<LoadInst>(Repl)) {
+ ReplacementLoad->setAlignment(
+ std::min(ReplacementLoad->getAlignment(),
+ cast<LoadInst>(I)->getAlignment()));
++NumLoadsRemoved;
- else if (isa<StoreInst>(Repl))
+ } else if (auto *ReplacementStore = dyn_cast<StoreInst>(Repl)) {
+ ReplacementStore->setAlignment(
+ std::min(ReplacementStore->getAlignment(),
+ cast<StoreInst>(I)->getAlignment()));
++NumStoresRemoved;
- else if (isa<CallInst>(Repl))
+ } else if (auto *ReplacementAlloca = dyn_cast<AllocaInst>(Repl)) {
+ ReplacementAlloca->setAlignment(
+ std::max(ReplacementAlloca->getAlignment(),
+ cast<AllocaInst>(I)->getAlignment()));
+ } else if (isa<CallInst>(Repl)) {
++NumCallsRemoved;
+ }
+
+ if (NewMemAcc) {
+ // Update the uses of the old MSSA access with NewMemAcc.
+ MemoryAccess *OldMA = MSSA->getMemoryAccess(I);
+ OldMA->replaceAllUsesWith(NewMemAcc);
+ MSSA->removeMemoryAccess(OldMA);
+ }
+
+ Repl->andIRFlags(I);
+ combineKnownMetadata(Repl, I);
I->replaceAllUsesWith(Repl);
+ // Also invalidate the Alias Analysis cache.
+ MD->removeInstruction(I);
I->eraseFromParent();
}
+
+ // Remove MemorySSA phi nodes with the same arguments.
+ if (NewMemAcc) {
+ SmallPtrSet<MemoryPhi *, 4> UsePhis;
+ for (User *U : NewMemAcc->users())
+ if (MemoryPhi *Phi = dyn_cast<MemoryPhi>(U))
+ UsePhis.insert(Phi);
+
+ for (auto *Phi : UsePhis) {
+ auto In = Phi->incoming_values();
+ if (all_of(In, [&](Use &U) { return U == NewMemAcc; })) {
+ Phi->replaceAllUsesWith(NewMemAcc);
+ MSSA->removeMemoryAccess(Phi);
+ }
+ }
+ }
}
NumHoisted += NL + NS + NC + NI;
@@ -700,7 +894,17 @@ public:
StoreInfo SI;
CallInfo CI;
for (BasicBlock *BB : depth_first(&F.getEntryBlock())) {
+ int InstructionNb = 0;
for (Instruction &I1 : *BB) {
+ // Only hoist the first MaxDepthInBB instructions in BB. Hoisting deeper may
+ // increase register pressure and compilation time.
+ if (MaxDepthInBB != -1 && InstructionNb++ >= MaxDepthInBB)
+ break;
+
+ // Do not value number terminator instructions.
+ if (isa<TerminatorInst>(&I1))
+ break;
+
if (auto *Load = dyn_cast<LoadInst>(&I1))
LI.insert(Load, VN);
else if (auto *Store = dyn_cast<StoreInst>(&I1))
@@ -711,15 +915,14 @@ public:
Intr->getIntrinsicID() == Intrinsic::assume)
continue;
}
- if (Call->mayHaveSideEffects()) {
- if (!OptForMinSize)
- break;
- // We may continue hoisting across calls which write to memory.
- if (Call->mayThrow())
- break;
- }
+ if (Call->mayHaveSideEffects())
+ break;
+
+ if (Call->isConvergent())
+ break;
+
CI.insert(Call, VN);
- } else if (OptForMinSize || !isa<GetElementPtrInst>(&I1))
+ } else if (HoistingGeps || !isa<GetElementPtrInst>(&I1))
// Do not hoist scalars past calls that may write to memory because
// that could result in spills later. geps are handled separately.
// TODO: We can relax this for targets like AArch64 as they have more
@@ -737,39 +940,6 @@ public:
computeInsertionPoints(CI.getStoreVNTable(), HPL, InsKind::Store);
return hoist(HPL);
}
-
- bool run(Function &F) {
- VN.setDomTree(DT);
- VN.setAliasAnalysis(AA);
- VN.setMemDep(MD);
- bool Res = false;
-
- unsigned I = 0;
- for (const BasicBlock *BB : depth_first(&F.getEntryBlock()))
- DFSNumber.insert({BB, ++I});
-
- // FIXME: use lazy evaluation of VN to avoid the fix-point computation.
- while (1) {
- // FIXME: only compute MemorySSA once. We need to update the analysis in
- // the same time as transforming the code.
- MemorySSA M(F, AA, DT);
- MSSA = &M;
-
- auto HoistStat = hoistExpressions(F);
- if (HoistStat.first + HoistStat.second == 0) {
- return Res;
- }
- if (HoistStat.second > 0) {
- // To address a limitation of the current GVN, we need to rerun the
- // hoisting after we hoisted loads in order to be able to hoist all
- // scalars dependent on the hoisted loads. Same for stores.
- VN.clear();
- }
- Res = true;
- }
-
- return Res;
- }
};
class GVNHoistLegacyPass : public FunctionPass {
@@ -781,11 +951,14 @@ public:
}
bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
auto &MD = getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
+ auto &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
- GVNHoist G(&DT, &AA, &MD, F.optForMinSize());
+ GVNHoist G(&DT, &AA, &MD, &MSSA);
return G.run(F);
}
@@ -793,23 +966,25 @@ public:
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<MemoryDependenceWrapperPass>();
+ AU.addRequired<MemorySSAWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
}
};
} // namespace
-PreservedAnalyses GVNHoistPass::run(Function &F,
- AnalysisManager<Function> &AM) {
+PreservedAnalyses GVNHoistPass::run(Function &F, FunctionAnalysisManager &AM) {
DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
AliasAnalysis &AA = AM.getResult<AAManager>(F);
MemoryDependenceResults &MD = AM.getResult<MemoryDependenceAnalysis>(F);
-
- GVNHoist G(&DT, &AA, &MD, F.optForMinSize());
+ MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
+ GVNHoist G(&DT, &AA, &MD, &MSSA);
if (!G.run(F))
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<MemorySSAAnalysis>();
return PA;
}
@@ -817,6 +992,7 @@ char GVNHoistLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(GVNHoistLegacyPass, "gvn-hoist",
"Early GVN Hoisting of Expressions", false, false)
INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(GVNHoistLegacyPass, "gvn-hoist",
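The hoist() hunk above orders instructions through DFSNumber and firstInBB instead of scanning blocks. A rough sketch of that bookkeeping, assuming the map is keyed per instruction in program order (the actual patch numbers more than just instructions):

  #include <cassert>
  #include "llvm/ADT/DenseMap.h"
  #include "llvm/ADT/DepthFirstIterator.h"
  #include "llvm/IR/Function.h"
  using namespace llvm;

  // Number every instruction once, in DFS order over the CFG; "which of two
  // instructions in the same block comes first?" then becomes a cheap lookup.
  static DenseMap<const Instruction *, unsigned> numberInstructions(Function &F) {
    DenseMap<const Instruction *, unsigned> DFSNumber;
    unsigned N = 0;
    for (const BasicBlock *BB : depth_first(&F.getEntryBlock()))
      for (const Instruction &I : *BB)
        DFSNumber[&I] = ++N;
    return DFSNumber;
  }

  static bool firstInBlock(const DenseMap<const Instruction *, unsigned> &DFSNumber,
                           const Instruction *A, const Instruction *B) {
    assert(A->getParent() == B->getParent() && "expected the same block");
    return DFSNumber.lookup(A) < DFSNumber.lookup(B);
  }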
diff --git a/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp
index 7686e65..b05ef00 100644
--- a/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp
@@ -46,6 +46,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
@@ -653,7 +654,7 @@ bool GuardWideningImpl::combineRangeChecks(
}
PreservedAnalyses GuardWideningPass::run(Function &F,
- AnalysisManager<Function> &AM) {
+ FunctionAnalysisManager &AM) {
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &LI = AM.getResult<LoopAnalysis>(F);
auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
diff --git a/contrib/llvm/lib/Transforms/Scalar/IVUsersPrinter.cpp b/contrib/llvm/lib/Transforms/Scalar/IVUsersPrinter.cpp
new file mode 100644
index 0000000..8075933
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/IVUsersPrinter.cpp
@@ -0,0 +1,22 @@
+//===- IVUsersPrinter.cpp - Induction Variable Users Printer ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/IVUsersPrinter.h"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "iv-users"
+
+PreservedAnalyses IVUsersPrinterPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ AM.getResult<IVUsersAnalysis>(L, AR).print(OS);
+ return PreservedAnalyses::all();
+}
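The new file above follows the usual new-pass-manager printer shape. For comparison, a generic sketch of that pattern for a function-level analysis; the template and names here are placeholders for illustration, not part of this change:

  #include "llvm/IR/Function.h"
  #include "llvm/IR/PassManager.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  // Hold the output stream, ask the analysis manager for the result, print it,
  // and preserve everything since the IR itself was not touched.
  template <typename AnalysisT>
  class AnalysisPrinterPass : public PassInfoMixin<AnalysisPrinterPass<AnalysisT>> {
    raw_ostream &OS;

  public:
    explicit AnalysisPrinterPass(raw_ostream &OS) : OS(OS) {}

    PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
      AM.getResult<AnalysisT>(F).print(OS);
      return PreservedAnalyses::all();
    }
  };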
diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index cf3e7c5..1752fb7 100644
--- a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -25,15 +25,13 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/IndVarSimplify.h"
-#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/LoopPassManager.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
@@ -49,6 +47,8 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
@@ -79,8 +79,12 @@ static cl::opt<ReplaceExitVal> ReplaceExitValue(
clEnumValN(OnlyCheapRepl, "cheap",
"only replace exit value when the cost is cheap"),
clEnumValN(AlwaysRepl, "always",
- "always replace exit value whenever possible"),
- clEnumValEnd));
+ "always replace exit value whenever possible")));
+
+static cl::opt<bool> UsePostIncrementRanges(
+ "indvars-post-increment-ranges", cl::Hidden,
+ cl::desc("Use post increment control-dependent ranges in IndVarSimplify"),
+ cl::init(true));
namespace {
struct RewritePhi;
@@ -506,7 +510,8 @@ Value *IndVarSimplify::expandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S,
/// constant operands at the beginning of the loop.
void IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
// Check a pre-condition.
- assert(L->isRecursivelyLCSSAForm(*DT) && "Indvars did not preserve LCSSA!");
+ assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
+ "Indvars did not preserve LCSSA!");
SmallVector<BasicBlock*, 8> ExitBlocks;
L->getUniqueExitBlocks(ExitBlocks);
@@ -880,7 +885,6 @@ class WidenIV {
// Parameters
PHINode *OrigPhi;
Type *WideType;
- bool IsSigned;
// Context
LoopInfo *LI;
@@ -888,31 +892,70 @@ class WidenIV {
ScalarEvolution *SE;
DominatorTree *DT;
+ // Does the module have any calls to the llvm.experimental.guard intrinsic
+ // at all? If not we can avoid scanning instructions looking for guards.
+ bool HasGuards;
+
// Result
PHINode *WidePhi;
Instruction *WideInc;
const SCEV *WideIncExpr;
SmallVectorImpl<WeakVH> &DeadInsts;
- SmallPtrSet<Instruction*,16> Widened;
+ SmallPtrSet<Instruction *,16> Widened;
SmallVector<NarrowIVDefUse, 8> NarrowIVUsers;
+ enum ExtendKind { ZeroExtended, SignExtended, Unknown };
+ // A map tracking the kind of extension used to widen each narrow IV
+ // and narrow IV user.
+ // Key: pointer to a narrow IV or IV user.
+ // Value: the kind of extension used to widen this Instruction.
+ DenseMap<AssertingVH<Instruction>, ExtendKind> ExtendKindMap;
+
+ typedef std::pair<AssertingVH<Value>, AssertingVH<Instruction>> DefUserPair;
+ // A map with control-dependent ranges for post increment IV uses. The key is
+ // a pair of IV def and a use of this def denoting the context. The value is
+ // a ConstantRange representing possible values of the def at the given
+ // context.
+ DenseMap<DefUserPair, ConstantRange> PostIncRangeInfos;
+
+ Optional<ConstantRange> getPostIncRangeInfo(Value *Def,
+ Instruction *UseI) {
+ DefUserPair Key(Def, UseI);
+ auto It = PostIncRangeInfos.find(Key);
+ return It == PostIncRangeInfos.end()
+ ? Optional<ConstantRange>(None)
+ : Optional<ConstantRange>(It->second);
+ }
+
+ void calculatePostIncRanges(PHINode *OrigPhi);
+ void calculatePostIncRange(Instruction *NarrowDef, Instruction *NarrowUser);
+ void updatePostIncRangeInfo(Value *Def, Instruction *UseI, ConstantRange R) {
+ DefUserPair Key(Def, UseI);
+ auto It = PostIncRangeInfos.find(Key);
+ if (It == PostIncRangeInfos.end())
+ PostIncRangeInfos.insert({Key, R});
+ else
+ It->second = R.intersectWith(It->second);
+ }
+
public:
WidenIV(const WideIVInfo &WI, LoopInfo *LInfo,
ScalarEvolution *SEv, DominatorTree *DTree,
- SmallVectorImpl<WeakVH> &DI) :
+ SmallVectorImpl<WeakVH> &DI, bool HasGuards) :
OrigPhi(WI.NarrowIV),
WideType(WI.WidestNativeType),
- IsSigned(WI.IsSigned),
LI(LInfo),
L(LI->getLoopFor(OrigPhi->getParent())),
SE(SEv),
DT(DTree),
+ HasGuards(HasGuards),
WidePhi(nullptr),
WideInc(nullptr),
WideIncExpr(nullptr),
DeadInsts(DI) {
assert(L->getHeader() == OrigPhi->getParent() && "Phi must be an IV");
+ ExtendKindMap[OrigPhi] = WI.IsSigned ? SignExtended : ZeroExtended;
}
PHINode *createWideIV(SCEVExpander &Rewriter);
@@ -926,9 +969,13 @@ protected:
const SCEVAddRecExpr *WideAR);
Instruction *cloneBitwiseIVUser(NarrowIVDefUse DU);
- const SCEVAddRecExpr *getWideRecurrence(Instruction *NarrowUse);
+ ExtendKind getExtendKind(Instruction *I);
- const SCEVAddRecExpr* getExtendedOperandRecurrence(NarrowIVDefUse DU);
+ typedef std::pair<const SCEVAddRecExpr *, ExtendKind> WidenedRecTy;
+
+ WidenedRecTy getWideRecurrence(NarrowIVDefUse DU);
+
+ WidenedRecTy getExtendedOperandRecurrence(NarrowIVDefUse DU);
const SCEV *getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS,
unsigned OpCode) const;
@@ -1002,6 +1049,7 @@ Instruction *WidenIV::cloneBitwiseIVUser(NarrowIVDefUse DU) {
// about the narrow operand yet so must insert a [sz]ext. It is probably loop
// invariant and will be folded or hoisted. If it actually comes from a
// widened IV, it should be removed during a future call to widenIVUse.
+ bool IsSigned = getExtendKind(NarrowDef) == SignExtended;
Value *LHS = (NarrowUse->getOperand(0) == NarrowDef)
? WideDef
: createExtendInst(NarrowUse->getOperand(0), WideType,
@@ -1086,7 +1134,7 @@ Instruction *WidenIV::cloneArithmeticIVUser(NarrowIVDefUse DU,
return WideUse == WideAR;
};
- bool SignExtend = IsSigned;
+ bool SignExtend = getExtendKind(NarrowDef) == SignExtended;
if (!GuessNonIVOperand(SignExtend)) {
SignExtend = !SignExtend;
if (!GuessNonIVOperand(SignExtend))
@@ -1112,6 +1160,12 @@ Instruction *WidenIV::cloneArithmeticIVUser(NarrowIVDefUse DU,
return WideBO;
}
+WidenIV::ExtendKind WidenIV::getExtendKind(Instruction *I) {
+ auto It = ExtendKindMap.find(I);
+ assert(It != ExtendKindMap.end() && "Instruction not yet extended!");
+ return It->second;
+}
+
const SCEV *WidenIV::getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS,
unsigned OpCode) const {
if (OpCode == Instruction::Add)
@@ -1127,15 +1181,16 @@ const SCEV *WidenIV::getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS,
/// No-wrap operations can transfer sign extension of their result to their
/// operands. Generate the SCEV value for the widened operation without
/// actually modifying the IR yet. If the expression after extending the
-/// operands is an AddRec for this loop, return it.
-const SCEVAddRecExpr* WidenIV::getExtendedOperandRecurrence(NarrowIVDefUse DU) {
+/// operands is an AddRec for this loop, return the AddRec and the kind of
+/// extension used.
+WidenIV::WidenedRecTy WidenIV::getExtendedOperandRecurrence(NarrowIVDefUse DU) {
// Handle the common case of add<nsw/nuw>
const unsigned OpCode = DU.NarrowUse->getOpcode();
// Only Add/Sub/Mul instructions supported yet.
if (OpCode != Instruction::Add && OpCode != Instruction::Sub &&
OpCode != Instruction::Mul)
- return nullptr;
+ return {nullptr, Unknown};
// One operand (NarrowDef) has already been extended to WideDef. Now determine
// if extending the other will lead to a recurrence.
@@ -1146,14 +1201,15 @@ const SCEVAddRecExpr* WidenIV::getExtendedOperandRecurrence(NarrowIVDefUse DU) {
const SCEV *ExtendOperExpr = nullptr;
const OverflowingBinaryOperator *OBO =
cast<OverflowingBinaryOperator>(DU.NarrowUse);
- if (IsSigned && OBO->hasNoSignedWrap())
+ ExtendKind ExtKind = getExtendKind(DU.NarrowDef);
+ if (ExtKind == SignExtended && OBO->hasNoSignedWrap())
ExtendOperExpr = SE->getSignExtendExpr(
SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType);
- else if(!IsSigned && OBO->hasNoUnsignedWrap())
+ else if(ExtKind == ZeroExtended && OBO->hasNoUnsignedWrap())
ExtendOperExpr = SE->getZeroExtendExpr(
SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType);
else
- return nullptr;
+ return {nullptr, Unknown};
// When creating this SCEV expr, don't apply the current operations NSW or NUW
// flags. This instruction may be guarded by control flow that the no-wrap
@@ -1171,33 +1227,49 @@ const SCEVAddRecExpr* WidenIV::getExtendedOperandRecurrence(NarrowIVDefUse DU) {
dyn_cast<SCEVAddRecExpr>(getSCEVByOpCode(lhs, rhs, OpCode));
if (!AddRec || AddRec->getLoop() != L)
- return nullptr;
- return AddRec;
+ return {nullptr, Unknown};
+
+ return {AddRec, ExtKind};
}
/// Is this instruction potentially interesting for further simplification after
/// widening its type? In other words, can the extend be safely hoisted out of
/// the loop with SCEV reducing the value to a recurrence on the same loop. If
-/// so, return the sign or zero extended recurrence. Otherwise return NULL.
-const SCEVAddRecExpr *WidenIV::getWideRecurrence(Instruction *NarrowUse) {
- if (!SE->isSCEVable(NarrowUse->getType()))
- return nullptr;
-
- const SCEV *NarrowExpr = SE->getSCEV(NarrowUse);
- if (SE->getTypeSizeInBits(NarrowExpr->getType())
- >= SE->getTypeSizeInBits(WideType)) {
+/// so, return the extended recurrence and the kind of extension used. Otherwise
+/// return {nullptr, Unknown}.
+WidenIV::WidenedRecTy WidenIV::getWideRecurrence(NarrowIVDefUse DU) {
+ if (!SE->isSCEVable(DU.NarrowUse->getType()))
+ return {nullptr, Unknown};
+
+ const SCEV *NarrowExpr = SE->getSCEV(DU.NarrowUse);
+ if (SE->getTypeSizeInBits(NarrowExpr->getType()) >=
+ SE->getTypeSizeInBits(WideType)) {
// NarrowUse implicitly widens its operand. e.g. a gep with a narrow
// index. So don't follow this use.
- return nullptr;
+ return {nullptr, Unknown};
}
- const SCEV *WideExpr = IsSigned ?
- SE->getSignExtendExpr(NarrowExpr, WideType) :
- SE->getZeroExtendExpr(NarrowExpr, WideType);
+ const SCEV *WideExpr;
+ ExtendKind ExtKind;
+ if (DU.NeverNegative) {
+ WideExpr = SE->getSignExtendExpr(NarrowExpr, WideType);
+ if (isa<SCEVAddRecExpr>(WideExpr))
+ ExtKind = SignExtended;
+ else {
+ WideExpr = SE->getZeroExtendExpr(NarrowExpr, WideType);
+ ExtKind = ZeroExtended;
+ }
+ } else if (getExtendKind(DU.NarrowDef) == SignExtended) {
+ WideExpr = SE->getSignExtendExpr(NarrowExpr, WideType);
+ ExtKind = SignExtended;
+ } else {
+ WideExpr = SE->getZeroExtendExpr(NarrowExpr, WideType);
+ ExtKind = ZeroExtended;
+ }
const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(WideExpr);
if (!AddRec || AddRec->getLoop() != L)
- return nullptr;
- return AddRec;
+ return {nullptr, Unknown};
+ return {AddRec, ExtKind};
}
/// This IV user cannot be widen. Replace this use of the original narrow IV
@@ -1233,7 +1305,7 @@ bool WidenIV::widenLoopCompare(NarrowIVDefUse DU) {
//
// (A) == icmp slt i32 sext(%narrow), sext(%val)
// == icmp slt i32 zext(%narrow), sext(%val)
-
+ bool IsSigned = getExtendKind(DU.NarrowDef) == SignExtended;
if (!(DU.NeverNegative || IsSigned == Cmp->isSigned()))
return false;
@@ -1258,6 +1330,8 @@ bool WidenIV::widenLoopCompare(NarrowIVDefUse DU) {
/// Determine whether an individual user of the narrow IV can be widened. If so,
/// return the wide clone of the user.
Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
+ assert(ExtendKindMap.count(DU.NarrowDef) &&
+ "Should already know the kind of extension used to widen NarrowDef");
// Stop traversing the def-use chain at inner-loop phis or post-loop phis.
if (PHINode *UsePhi = dyn_cast<PHINode>(DU.NarrowUse)) {
@@ -1288,8 +1362,19 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
return nullptr;
}
}
+
+ // This narrow use can be widened by a sext if it's non-negative or its narrow
+ // def was widened by a sext. Same for zext.
+ auto canWidenBySExt = [&]() {
+ return DU.NeverNegative || getExtendKind(DU.NarrowDef) == SignExtended;
+ };
+ auto canWidenByZExt = [&]() {
+ return DU.NeverNegative || getExtendKind(DU.NarrowDef) == ZeroExtended;
+ };
+
// Our raison d'etre! Eliminate sign and zero extension.
- if (IsSigned ? isa<SExtInst>(DU.NarrowUse) : isa<ZExtInst>(DU.NarrowUse)) {
+ if ((isa<SExtInst>(DU.NarrowUse) && canWidenBySExt()) ||
+ (isa<ZExtInst>(DU.NarrowUse) && canWidenByZExt())) {
Value *NewDef = DU.WideDef;
if (DU.NarrowUse->getType() != WideType) {
unsigned CastWidth = SE->getTypeSizeInBits(DU.NarrowUse->getType());
@@ -1327,17 +1412,18 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
}
// Does this user itself evaluate to a recurrence after widening?
- const SCEVAddRecExpr *WideAddRec = getWideRecurrence(DU.NarrowUse);
- if (!WideAddRec)
- WideAddRec = getExtendedOperandRecurrence(DU);
+ WidenedRecTy WideAddRec = getExtendedOperandRecurrence(DU);
+ if (!WideAddRec.first)
+ WideAddRec = getWideRecurrence(DU);
- if (!WideAddRec) {
+ assert((WideAddRec.first == nullptr) == (WideAddRec.second == Unknown));
+ if (!WideAddRec.first) {
// If use is a loop condition, try to promote the condition instead of
// truncating the IV first.
if (widenLoopCompare(DU))
return nullptr;
- // This user does not evaluate to a recurence after widening, so don't
+ // This user does not evaluate to a recurrence after widening, so don't
// follow it. Instead insert a Trunc to kill off the original use,
// eventually isolating the original narrow IV so it can be removed.
truncateIVUse(DU, DT, LI);
@@ -1351,10 +1437,11 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
// Reuse the IV increment that SCEVExpander created as long as it dominates
// NarrowUse.
Instruction *WideUse = nullptr;
- if (WideAddRec == WideIncExpr && Rewriter.hoistIVInc(WideInc, DU.NarrowUse))
+ if (WideAddRec.first == WideIncExpr &&
+ Rewriter.hoistIVInc(WideInc, DU.NarrowUse))
WideUse = WideInc;
else {
- WideUse = cloneIVUser(DU, WideAddRec);
+ WideUse = cloneIVUser(DU, WideAddRec.first);
if (!WideUse)
return nullptr;
}
@@ -1363,13 +1450,14 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
// evaluates to the same expression as the extended narrow use, but doesn't
// absolutely guarantee it. Hence the following failsafe check. In rare cases
// where it fails, we simply throw away the newly created wide use.
- if (WideAddRec != SE->getSCEV(WideUse)) {
+ if (WideAddRec.first != SE->getSCEV(WideUse)) {
DEBUG(dbgs() << "Wide use expression mismatch: " << *WideUse
- << ": " << *SE->getSCEV(WideUse) << " != " << *WideAddRec << "\n");
+ << ": " << *SE->getSCEV(WideUse) << " != " << *WideAddRec.first << "\n");
DeadInsts.emplace_back(WideUse);
return nullptr;
}
+ ExtendKindMap[DU.NarrowUse] = WideAddRec.second;
// Returning WideUse pushes it on the worklist.
return WideUse;
}
@@ -1378,7 +1466,7 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
///
void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) {
const SCEV *NarrowSCEV = SE->getSCEV(NarrowDef);
- bool NeverNegative =
+ bool NonNegativeDef =
SE->isKnownPredicate(ICmpInst::ICMP_SGE, NarrowSCEV,
SE->getConstant(NarrowSCEV->getType(), 0));
for (User *U : NarrowDef->users()) {
@@ -1388,7 +1476,15 @@ void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) {
if (!Widened.insert(NarrowUser).second)
continue;
- NarrowIVUsers.emplace_back(NarrowDef, NarrowUser, WideDef, NeverNegative);
+ bool NonNegativeUse = false;
+ if (!NonNegativeDef) {
+ // We might have a control-dependent range information for this context.
+ if (auto RangeInfo = getPostIncRangeInfo(NarrowDef, NarrowUser))
+ NonNegativeUse = RangeInfo->getSignedMin().isNonNegative();
+ }
+
+ NarrowIVUsers.emplace_back(NarrowDef, NarrowUser, WideDef,
+ NonNegativeDef || NonNegativeUse);
}
}
@@ -1408,9 +1504,9 @@ PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) {
return nullptr;
// Widen the induction variable expression.
- const SCEV *WideIVExpr = IsSigned ?
- SE->getSignExtendExpr(AddRec, WideType) :
- SE->getZeroExtendExpr(AddRec, WideType);
+ const SCEV *WideIVExpr = getExtendKind(OrigPhi) == SignExtended
+ ? SE->getSignExtendExpr(AddRec, WideType)
+ : SE->getZeroExtendExpr(AddRec, WideType);
assert(SE->getEffectiveSCEVType(WideIVExpr->getType()) == WideType &&
"Expect the new IV expression to preserve its type");
@@ -1428,6 +1524,19 @@ PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) {
SE->properlyDominates(AddRec->getStepRecurrence(*SE), L->getHeader()) &&
"Loop header phi recurrence inputs do not dominate the loop");
+ // Iterate over IV uses (including transitive ones) looking for IV increments
+ // of the form 'add nsw %iv, <const>'. For each increment and each use of
+ // the increment, calculate control-dependent range information based on
+ // dominating conditions inside the loop (e.g. a range check inside the
+ // loop). Calculated ranges are stored in the PostIncRangeInfos map.
+ //
+ // Control-dependent range information is later used to prove that a narrow
+ // definition is not negative (see pushNarrowIVUsers). It's difficult to do
+ // this on demand because when pushNarrowIVUsers needs this information some
+ // of the dominating conditions might be already widened.
+ if (UsePostIncrementRanges)
+ calculatePostIncRanges(OrigPhi);
+
// The rewriter provides a value for the desired IV expression. This may
// either find an existing phi or materialize a new one. Either way, we
// expect a well-formed cyclic phi-with-increments. i.e. any operand not part
@@ -1443,6 +1552,11 @@ PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) {
WideInc =
cast<Instruction>(WidePhi->getIncomingValueForBlock(LatchBlock));
WideIncExpr = SE->getSCEV(WideInc);
+ // Propagate the debug location associated with the original loop increment
+ // to the new (widened) increment.
+ auto *OrigInc =
+ cast<Instruction>(OrigPhi->getIncomingValueForBlock(LatchBlock));
+ WideInc->setDebugLoc(OrigInc->getDebugLoc());
}
DEBUG(dbgs() << "Wide IV: " << *WidePhi << "\n");
@@ -1472,6 +1586,114 @@ PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) {
return WidePhi;
}
+/// Calculates the control-dependent range for the given def at the given
+/// context by looking at dominating conditions inside the loop.
+void WidenIV::calculatePostIncRange(Instruction *NarrowDef,
+ Instruction *NarrowUser) {
+ using namespace llvm::PatternMatch;
+
+ Value *NarrowDefLHS;
+ const APInt *NarrowDefRHS;
+ if (!match(NarrowDef, m_NSWAdd(m_Value(NarrowDefLHS),
+ m_APInt(NarrowDefRHS))) ||
+ !NarrowDefRHS->isNonNegative())
+ return;
+
+ auto UpdateRangeFromCondition = [&] (Value *Condition,
+ bool TrueDest) {
+ CmpInst::Predicate Pred;
+ Value *CmpRHS;
+ if (!match(Condition, m_ICmp(Pred, m_Specific(NarrowDefLHS),
+ m_Value(CmpRHS))))
+ return;
+
+ CmpInst::Predicate P =
+ TrueDest ? Pred : CmpInst::getInversePredicate(Pred);
+
+ auto CmpRHSRange = SE->getSignedRange(SE->getSCEV(CmpRHS));
+ auto CmpConstrainedLHSRange =
+ ConstantRange::makeAllowedICmpRegion(P, CmpRHSRange);
+ auto NarrowDefRange =
+ CmpConstrainedLHSRange.addWithNoSignedWrap(*NarrowDefRHS);
+
+ updatePostIncRangeInfo(NarrowDef, NarrowUser, NarrowDefRange);
+ };
+
+ auto UpdateRangeFromGuards = [&](Instruction *Ctx) {
+ if (!HasGuards)
+ return;
+
+ for (Instruction &I : make_range(Ctx->getIterator().getReverse(),
+ Ctx->getParent()->rend())) {
+ Value *C = nullptr;
+ if (match(&I, m_Intrinsic<Intrinsic::experimental_guard>(m_Value(C))))
+ UpdateRangeFromCondition(C, /*TrueDest=*/true);
+ }
+ };
+
+ UpdateRangeFromGuards(NarrowUser);
+
+ BasicBlock *NarrowUserBB = NarrowUser->getParent();
+ // If NarrowUserBB is statically unreachable asking dominator queries may
+ // yield surprising results. (e.g. the block may not have a dom tree node)
+ if (!DT->isReachableFromEntry(NarrowUserBB))
+ return;
+
+ for (auto *DTB = (*DT)[NarrowUserBB]->getIDom();
+ L->contains(DTB->getBlock());
+ DTB = DTB->getIDom()) {
+ auto *BB = DTB->getBlock();
+ auto *TI = BB->getTerminator();
+ UpdateRangeFromGuards(TI);
+
+ auto *BI = dyn_cast<BranchInst>(TI);
+ if (!BI || !BI->isConditional())
+ continue;
+
+ auto *TrueSuccessor = BI->getSuccessor(0);
+ auto *FalseSuccessor = BI->getSuccessor(1);
+
+ auto DominatesNarrowUser = [this, NarrowUser] (BasicBlockEdge BBE) {
+ return BBE.isSingleEdge() &&
+ DT->dominates(BBE, NarrowUser->getParent());
+ };
+
+ if (DominatesNarrowUser(BasicBlockEdge(BB, TrueSuccessor)))
+ UpdateRangeFromCondition(BI->getCondition(), /*TrueDest=*/true);
+
+ if (DominatesNarrowUser(BasicBlockEdge(BB, FalseSuccessor)))
+ UpdateRangeFromCondition(BI->getCondition(), /*TrueDest=*/false);
+ }
+}
+
+/// Calculates PostIncRangeInfos map for the given IV
+void WidenIV::calculatePostIncRanges(PHINode *OrigPhi) {
+ SmallPtrSet<Instruction *, 16> Visited;
+ SmallVector<Instruction *, 6> Worklist;
+ Worklist.push_back(OrigPhi);
+ Visited.insert(OrigPhi);
+
+ while (!Worklist.empty()) {
+ Instruction *NarrowDef = Worklist.pop_back_val();
+
+ for (Use &U : NarrowDef->uses()) {
+ auto *NarrowUser = cast<Instruction>(U.getUser());
+
+ // Don't go looking outside the current loop.
+ auto *NarrowUserLoop = (*LI)[NarrowUser->getParent()];
+ if (!NarrowUserLoop || !L->contains(NarrowUserLoop))
+ continue;
+
+ if (!Visited.insert(NarrowUser).second)
+ continue;
+
+ Worklist.push_back(NarrowUser);
+
+ calculatePostIncRange(NarrowDef, NarrowUser);
+ }
+ }
+}
+
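To make the range arithmetic in calculatePostIncRange concrete, here is a small worked example under assumed conditions: the increment is "add nsw i32 %i, 1" and the use is dominated by the true edge of "icmp sge i32 %i, 0". This is a sketch of the same ConstantRange calls the patch uses, not the patch itself.

  #include "llvm/ADT/APInt.h"
  #include "llvm/IR/ConstantRange.h"
  #include "llvm/IR/InstrTypes.h"
  using namespace llvm;

  static bool postIncUseIsNonNegativeExample() {
    // The compare RHS is the constant 0.
    ConstantRange CmpRHSRange(APInt(32, 0));
    // Values of %i allowed on the true edge of "icmp sge %i, 0": [0, INT_MAX].
    ConstantRange IRange =
        ConstantRange::makeAllowedICmpRegion(CmpInst::ICMP_SGE, CmpRHSRange);
    // Add the step with no signed wrap: the post-increment value lies in
    // [1, INT_MAX], so its signed minimum is non-negative and the use can be
    // widened with a sign extend.
    ConstantRange NextRange = IRange.addWithNoSignedWrap(APInt(32, 1));
    return NextRange.getSignedMin().isNonNegative(); // true
  }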
//===----------------------------------------------------------------------===//
// Live IV Reduction - Minimize IVs live across the loop.
//===----------------------------------------------------------------------===//
@@ -1514,6 +1736,10 @@ void IndVarSimplify::simplifyAndExtend(Loop *L,
LoopInfo *LI) {
SmallVector<WideIVInfo, 8> WideIVs;
+ auto *GuardDecl = L->getBlocks()[0]->getModule()->getFunction(
+ Intrinsic::getName(Intrinsic::experimental_guard));
+ bool HasGuards = GuardDecl && !GuardDecl->use_empty();
+
SmallVector<PHINode*, 8> LoopPhis;
for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
LoopPhis.push_back(cast<PHINode>(I));
@@ -1543,7 +1769,7 @@ void IndVarSimplify::simplifyAndExtend(Loop *L,
} while(!LoopPhis.empty());
for (; !WideIVs.empty(); WideIVs.pop_back()) {
- WidenIV Widener(WideIVs.back(), LI, SE, DT, DeadInsts);
+ WidenIV Widener(WideIVs.back(), LI, SE, DT, DeadInsts, HasGuards);
if (PHINode *WidePhi = Widener.createWideIV(Rewriter)) {
Changed = true;
LoopPhis.push_back(WidePhi);
@@ -1870,7 +2096,7 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L,
return Builder.CreateGEP(nullptr, GEPBase, GEPOffset, "lftr.limit");
} else {
// In any other case, convert both IVInit and IVCount to integers before
- // comparing. This may result in SCEV expension of pointers, but in practice
+ // comparing. This may result in SCEV expansion of pointers, but in practice
// SCEV will fold the pointer arithmetic away as such:
// BECount = (IVEnd - IVInit - 1) => IVLimit = IVInit (postinc).
//
@@ -1963,6 +2189,11 @@ linearFunctionTestReplace(Loop *L,
IRBuilder<> Builder(BI);
+ // The new loop exit condition should reuse the debug location of the
+ // original loop exit condition.
+ if (auto *Cond = dyn_cast<Instruction>(BI->getCondition()))
+ Builder.SetCurrentDebugLocation(Cond->getDebugLoc());
+
// LFTR can ignore IV overflow and truncate to the width of
// BECount. This avoids materializing the add(zext(add)) expression.
unsigned CmpIndVarSize = SE->getTypeSizeInBits(CmpIndVar->getType());
@@ -1992,8 +2223,36 @@ linearFunctionTestReplace(Loop *L,
DEBUG(dbgs() << " Widen RHS:\t" << *ExitCnt << "\n");
} else {
- CmpIndVar = Builder.CreateTrunc(CmpIndVar, ExitCnt->getType(),
- "lftr.wideiv");
+ // We try to extend the trip count first. If that doesn't work, we truncate
+ // the IV. Zext(trunc(IV)) == IV implies that Trunc(IV) == ExitCnt and
+ // IV == zext(ExitCnt) are equivalent; similarly for sext. If either
+ // equivalence holds, extend the trip count, otherwise truncate the IV.
+ bool Extended = false;
+ const SCEV *IV = SE->getSCEV(CmpIndVar);
+ const SCEV *ZExtTrunc =
+ SE->getZeroExtendExpr(SE->getTruncateExpr(SE->getSCEV(CmpIndVar),
+ ExitCnt->getType()),
+ CmpIndVar->getType());
+
+ if (ZExtTrunc == IV) {
+ Extended = true;
+ ExitCnt = Builder.CreateZExt(ExitCnt, IndVar->getType(),
+ "wide.trip.count");
+ } else {
+ const SCEV *SExtTrunc =
+ SE->getSignExtendExpr(SE->getTruncateExpr(SE->getSCEV(CmpIndVar),
+ ExitCnt->getType()),
+ CmpIndVar->getType());
+ if (SExtTrunc == IV) {
+ Extended = true;
+ ExitCnt = Builder.CreateSExt(ExitCnt, IndVar->getType(),
+ "wide.trip.count");
+ }
+ }
+
+ if (!Extended)
+ CmpIndVar = Builder.CreateTrunc(CmpIndVar, ExitCnt->getType(),
+ "lftr.wideiv");
}
}
Value *Cond = Builder.CreateICmp(P, CmpIndVar, ExitCnt, "exitcond");
@@ -2025,7 +2284,7 @@ void IndVarSimplify::sinkUnusedInvariants(Loop *L) {
BasicBlock *Preheader = L->getLoopPreheader();
if (!Preheader) return;
- Instruction *InsertPt = &*ExitBlock->getFirstInsertionPt();
+ BasicBlock::iterator InsertPt = ExitBlock->getFirstInsertionPt();
BasicBlock::iterator I(Preheader->getTerminator());
while (I != Preheader->begin()) {
--I;
@@ -2094,9 +2353,9 @@ void IndVarSimplify::sinkUnusedInvariants(Loop *L) {
Done = true;
}
- ToMove->moveBefore(InsertPt);
+ ToMove->moveBefore(*ExitBlock, InsertPt);
if (Done) break;
- InsertPt = ToMove;
+ InsertPt = ToMove->getIterator();
}
}
@@ -2106,7 +2365,8 @@ void IndVarSimplify::sinkUnusedInvariants(Loop *L) {
bool IndVarSimplify::run(Loop *L) {
// We need (and expect!) the incoming loop to be in LCSSA.
- assert(L->isRecursivelyLCSSAForm(*DT) && "LCSSA required to run indvars!");
+ assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
+ "LCSSA required to run indvars!");
// If LoopSimplify form is not available, stay out of trouble. Some notes:
// - LSR currently only supports LoopSimplify-form loops. Indvars'
@@ -2199,7 +2459,8 @@ bool IndVarSimplify::run(Loop *L) {
Changed |= DeleteDeadPHIs(L->getHeader(), TLI);
// Check a post-condition.
- assert(L->isRecursivelyLCSSAForm(*DT) && "Indvars did not preserve LCSSA!");
+ assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
+ "Indvars did not preserve LCSSA!");
// Verify that LFTR, and any other change have not interfered with SCEV's
// ability to compute trip count.
@@ -2221,23 +2482,13 @@ bool IndVarSimplify::run(Loop *L) {
return Changed;
}
-PreservedAnalyses IndVarSimplifyPass::run(Loop &L, AnalysisManager<Loop> &AM) {
- auto &FAM = AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager();
+PreservedAnalyses IndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
Function *F = L.getHeader()->getParent();
const DataLayout &DL = F->getParent()->getDataLayout();
- auto *LI = FAM.getCachedResult<LoopAnalysis>(*F);
- auto *SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(*F);
- auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F);
-
- assert((LI && SE && DT) &&
- "Analyses required for indvarsimplify not available!");
-
- // Optional analyses.
- auto *TTI = FAM.getCachedResult<TargetIRAnalysis>(*F);
- auto *TLI = FAM.getCachedResult<TargetLibraryAnalysis>(*F);
-
- IndVarSimplify IVS(LI, SE, DT, DL, TLI, TTI);
+ IndVarSimplify IVS(&AR.LI, &AR.SE, &AR.DT, DL, &AR.TLI, &AR.TTI);
if (!IVS.run(&L))
return PreservedAnalyses::all();
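The hunk above moves IndVarSimplify onto the new loop pass manager interface, where the standard analyses arrive pre-computed in LoopStandardAnalysisResults instead of being fetched (and possibly missing) from a cached FunctionAnalysisManager. A generic sketch of a loop pass written against that interface; MyLoopPass is a placeholder and the preserved-set helper is assumed to come from LoopUtils.h:

  #include "llvm/Analysis/LoopInfo.h"
  #include "llvm/IR/PassManager.h"
  #include "llvm/Transforms/Scalar/LoopPassManager.h"
  #include "llvm/Transforms/Utils/LoopUtils.h"
  using namespace llvm;

  struct MyLoopPass : PassInfoMixin<MyLoopPass> {
    PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
                          LoopStandardAnalysisResults &AR, LPMUpdater &) {
      // AR.LI, AR.DT, AR.SE, AR.AA, AR.TLI and AR.TTI are all valid here; the
      // pass no longer has to assert on missing cached results.
      bool Changed = false;
      // ... transform the loop here using AR.SE, AR.DT, etc. ...
      if (!Changed)
        return PreservedAnalyses::all();
      return getLoopPassPreservedAnalyses();
    }
  };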
diff --git a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
index ec7f09a..8e81541 100644
--- a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -43,21 +43,16 @@
#include "llvm/ADT/Optional.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -65,8 +60,7 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/SimplifyIndVar.h"
-#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
using namespace llvm;
@@ -82,6 +76,11 @@ static cl::opt<bool> PrintRangeChecks("irce-print-range-checks", cl::Hidden,
static cl::opt<int> MaxExitProbReciprocal("irce-max-exit-prob-reciprocal",
cl::Hidden, cl::init(10));
+static cl::opt<bool> SkipProfitabilityChecks("irce-skip-profitability-checks",
+ cl::Hidden, cl::init(false));
+
+static const char *ClonedLoopTag = "irce.loop.clone";
+
#define DEBUG_TYPE "irce"
namespace {
@@ -152,11 +151,10 @@ public:
OS << " Operand: " << getCheckUse()->getOperandNo() << "\n";
}
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ LLVM_DUMP_METHOD
void dump() {
print(dbgs());
}
-#endif
Use *getCheckUse() const { return CheckUse; }
@@ -276,7 +274,7 @@ InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
case ICmpInst::ICMP_SLE:
std::swap(LHS, RHS);
- // fallthrough
+ LLVM_FALLTHROUGH;
case ICmpInst::ICMP_SGE:
if (match(RHS, m_ConstantInt<0>())) {
Index = LHS;
@@ -286,7 +284,7 @@ InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
case ICmpInst::ICMP_SLT:
std::swap(LHS, RHS);
- // fallthrough
+ LLVM_FALLTHROUGH;
case ICmpInst::ICMP_SGT:
if (match(RHS, m_ConstantInt<-1>())) {
Index = LHS;
@@ -302,7 +300,7 @@ InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
case ICmpInst::ICMP_ULT:
std::swap(LHS, RHS);
- // fallthrough
+ LLVM_FALLTHROUGH;
case ICmpInst::ICMP_UGT:
if (IsNonNegativeAndNotLoopVarying(LHS)) {
Index = RHS;
@@ -392,7 +390,8 @@ void InductiveRangeCheck::extractRangeChecksFromBranch(
BranchProbability LikelyTaken(15, 16);
- if (BPI.getEdgeProbability(BI->getParent(), (unsigned)0) < LikelyTaken)
+ if (!SkipProfitabilityChecks &&
+ BPI.getEdgeProbability(BI->getParent(), (unsigned)0) < LikelyTaken)
return;
SmallPtrSet<Value *, 8> Visited;
@@ -400,6 +399,34 @@ void InductiveRangeCheck::extractRangeChecksFromBranch(
Checks, Visited);
}
+// Add metadata to the loop L to disable loop optimizations. Callers need to
+// confirm that optimizing loop L is not beneficial.
+static void DisableAllLoopOptsOnLoop(Loop &L) {
+ // We do not care about any existing loopID related metadata for L, since we
+ // are setting all loop metadata to false.
+ LLVMContext &Context = L.getHeader()->getContext();
+ // Reserve first location for self reference to the LoopID metadata node.
+ MDNode *Dummy = MDNode::get(Context, {});
+ MDNode *DisableUnroll = MDNode::get(
+ Context, {MDString::get(Context, "llvm.loop.unroll.disable")});
+ Metadata *FalseVal =
+ ConstantAsMetadata::get(ConstantInt::get(Type::getInt1Ty(Context), 0));
+ MDNode *DisableVectorize = MDNode::get(
+ Context,
+ {MDString::get(Context, "llvm.loop.vectorize.enable"), FalseVal});
+ MDNode *DisableLICMVersioning = MDNode::get(
+ Context, {MDString::get(Context, "llvm.loop.licm_versioning.disable")});
+ MDNode *DisableDistribution= MDNode::get(
+ Context,
+ {MDString::get(Context, "llvm.loop.distribute.enable"), FalseVal});
+ MDNode *NewLoopID =
+ MDNode::get(Context, {Dummy, DisableUnroll, DisableVectorize,
+ DisableLICMVersioning, DisableDistribution});
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+ L.setLoopID(NewLoopID);
+}
+
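DisableAllLoopOptsOnLoop above builds a self-referential loop ID whose trailing operands are the hint nodes. A small sketch of how a consumer is assumed to read such a hint back; the helper is illustrative and not part of the patch:

  #include "llvm/ADT/StringRef.h"
  #include "llvm/Analysis/LoopInfo.h"
  #include "llvm/IR/Metadata.h"
  using namespace llvm;

  static bool hasLoopHint(const Loop &L, StringRef Name) {
    MDNode *LoopID = L.getLoopID();
    if (!LoopID)
      return false;
    // Operand 0 is the required self-reference; the hints follow.
    for (unsigned i = 1, e = LoopID->getNumOperands(); i != e; ++i) {
      auto *Hint = dyn_cast<MDNode>(LoopID->getOperand(i));
      if (!Hint || Hint->getNumOperands() == 0)
        continue;
      if (auto *S = dyn_cast<MDString>(Hint->getOperand(0)))
        if (S->getString() == Name)
          return true;
    }
    return false;
  }

  // e.g. hasLoopHint(L, "llvm.loop.unroll.disable") would return true for the
  // pre- and post-loops created by IRCE after this change.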
namespace {
// Keeps track of the structure of a loop. This is similar to llvm::Loop,
@@ -515,6 +542,11 @@ class LoopConstrainer {
//
void cloneLoop(ClonedLoop &CLResult, const char *Tag) const;
+ // Create the appropriate loop structure needed to describe a cloned copy of
+ // `Original`. The clone is described by `VM`.
+ Loop *createClonedLoopStructure(Loop *Original, Loop *Parent,
+ ValueToValueMapTy &VM);
+
// Rewrite the iteration space of the loop denoted by (LS, Preheader). The
// iteration space of the rewritten loop ends at ExitLoopAt. The start of the
// iteration space is not changed. `ExitLoopAt' is assumed to be slt
@@ -566,10 +598,12 @@ class LoopConstrainer {
Function &F;
LLVMContext &Ctx;
ScalarEvolution &SE;
+ DominatorTree &DT;
+ LPPassManager &LPM;
+ LoopInfo &LI;
// Information about the original loop we started out with.
Loop &OriginalLoop;
- LoopInfo &OriginalLoopInfo;
const SCEV *LatchTakenCount;
BasicBlock *OriginalPreheader;
@@ -585,12 +619,13 @@ class LoopConstrainer {
LoopStructure MainLoopStructure;
public:
- LoopConstrainer(Loop &L, LoopInfo &LI, const LoopStructure &LS,
- ScalarEvolution &SE, InductiveRangeCheck::Range R)
+ LoopConstrainer(Loop &L, LoopInfo &LI, LPPassManager &LPM,
+ const LoopStructure &LS, ScalarEvolution &SE,
+ DominatorTree &DT, InductiveRangeCheck::Range R)
: F(*L.getHeader()->getParent()), Ctx(L.getHeader()->getContext()),
- SE(SE), OriginalLoop(L), OriginalLoopInfo(LI), LatchTakenCount(nullptr),
- OriginalPreheader(nullptr), MainLoopPreheader(nullptr), Range(R),
- MainLoopStructure(LS) {}
+ SE(SE), DT(DT), LPM(LPM), LI(LI), OriginalLoop(L),
+ LatchTakenCount(nullptr), OriginalPreheader(nullptr),
+ MainLoopPreheader(nullptr), Range(R), MainLoopStructure(LS) {}
// Entry point for the algorithm. Returns true on success.
bool run();
@@ -622,9 +657,19 @@ static bool CanBeSMin(ScalarEvolution &SE, const SCEV *S) {
Optional<LoopStructure>
LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BPI,
Loop &L, const char *&FailureReason) {
- assert(L.isLoopSimplifyForm() && "should follow from addRequired<>");
+ if (!L.isLoopSimplifyForm()) {
+ FailureReason = "loop not in LoopSimplify form";
+ return None;
+ }
BasicBlock *Latch = L.getLoopLatch();
+ assert(Latch && "Simplified loops only have one latch!");
+
+ if (Latch->getTerminator()->getMetadata(ClonedLoopTag)) {
+ FailureReason = "loop has already been cloned";
+ return None;
+ }
+
if (!L.isLoopExiting(Latch)) {
FailureReason = "no loop latch";
return None;
@@ -648,7 +693,8 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
BranchProbability ExitProbability =
BPI.getEdgeProbability(LatchBr->getParent(), LatchBrExitIdx);
- if (ExitProbability > BranchProbability(1, MaxExitProbReciprocal)) {
+ if (!SkipProfitabilityChecks &&
+ ExitProbability > BranchProbability(1, MaxExitProbReciprocal)) {
FailureReason = "short running loop, not profitable";
return None;
}
@@ -907,6 +953,11 @@ void LoopConstrainer::cloneLoop(LoopConstrainer::ClonedLoop &Result,
return static_cast<Value *>(It->second);
};
+ auto *ClonedLatch =
+ cast<BasicBlock>(GetClonedValue(OriginalLoop.getLoopLatch()));
+ ClonedLatch->getTerminator()->setMetadata(ClonedLoopTag,
+ MDNode::get(Ctx, {}));
+
Result.Structure = MainLoopStructure.map(GetClonedValue);
Result.Structure.Tag = Tag;
@@ -924,17 +975,15 @@ void LoopConstrainer::cloneLoop(LoopConstrainer::ClonedLoop &Result,
// to be edited to reflect that. No phi nodes need to be introduced because
// the loop is in LCSSA.
- for (auto SBBI = succ_begin(OriginalBB), SBBE = succ_end(OriginalBB);
- SBBI != SBBE; ++SBBI) {
-
- if (OriginalLoop.contains(*SBBI))
+ for (auto *SBB : successors(OriginalBB)) {
+ if (OriginalLoop.contains(SBB))
continue; // not an exit block
- for (Instruction &I : **SBBI) {
- if (!isa<PHINode>(&I))
+ for (Instruction &I : *SBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
break;
- PHINode *PN = cast<PHINode>(&I);
Value *OldIncoming = PN->getIncomingValueForBlock(OriginalBB);
PN->addIncoming(GetClonedValue(OldIncoming), ClonedBB);
}
@@ -1020,11 +1069,11 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd(
RewrittenRangeInfo RRI;
- auto BBInsertLocation = std::next(Function::iterator(LS.Latch));
+ BasicBlock *BBInsertLocation = LS.Latch->getNextNode();
RRI.ExitSelector = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".exit.selector",
- &F, &*BBInsertLocation);
+ &F, BBInsertLocation);
RRI.PseudoExit = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".pseudo.exit", &F,
- &*BBInsertLocation);
+ BBInsertLocation);
BranchInst *PreheaderJump = cast<BranchInst>(Preheader->getTerminator());
bool Increasing = LS.IndVarIncreasing;
@@ -1067,11 +1116,10 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd(
// each of the PHI nodes in the loop header. This feeds into the initial
// value of the same PHI nodes if/when we continue execution.
for (Instruction &I : *LS.Header) {
- if (!isa<PHINode>(&I))
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
break;
- PHINode *PN = cast<PHINode>(&I);
-
PHINode *NewPHI = PHINode::Create(PN->getType(), 2, PN->getName() + ".copy",
BranchToContinuation);
@@ -1104,11 +1152,10 @@ void LoopConstrainer::rewriteIncomingValuesForPHIs(
unsigned PHIIndex = 0;
for (Instruction &I : *LS.Header) {
- if (!isa<PHINode>(&I))
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
break;
- PHINode *PN = cast<PHINode>(&I);
-
for (unsigned i = 0, e = PN->getNumIncomingValues(); i < e; ++i)
if (PN->getIncomingBlock(i) == ContinuationBlock)
PN->setIncomingValue(i, RRI.PHIValuesAtPseudoExit[PHIIndex++]);
@@ -1125,10 +1172,10 @@ BasicBlock *LoopConstrainer::createPreheader(const LoopStructure &LS,
BranchInst::Create(LS.Header, Preheader);
for (Instruction &I : *LS.Header) {
- if (!isa<PHINode>(&I))
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
break;
- PHINode *PN = cast<PHINode>(&I);
for (unsigned i = 0, e = PN->getNumIncomingValues(); i < e; ++i)
replacePHIBlock(PN, OldPreheader, Preheader);
}
@@ -1142,7 +1189,23 @@ void LoopConstrainer::addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs) {
return;
for (BasicBlock *BB : BBs)
- ParentLoop->addBasicBlockToLoop(BB, OriginalLoopInfo);
+ ParentLoop->addBasicBlockToLoop(BB, LI);
+}
+
+Loop *LoopConstrainer::createClonedLoopStructure(Loop *Original, Loop *Parent,
+ ValueToValueMapTy &VM) {
+ Loop &New = LPM.addLoop(Parent);
+
+ // Add all of the blocks in Original to the new loop.
+ for (auto *BB : Original->blocks())
+ if (LI.getLoopFor(BB) == Original)
+ New.addBasicBlockToLoop(cast<BasicBlock>(VM[BB]), LI);
+
+ // Add all of the subloops to the new loop.
+ for (Loop *SubLoop : *Original)
+ createClonedLoopStructure(SubLoop, &New, VM);
+
+ return &New;
}
bool LoopConstrainer::run() {
@@ -1266,8 +1329,31 @@ bool LoopConstrainer::run() {
std::remove(std::begin(NewBlocks), std::end(NewBlocks), nullptr);
addToParentLoopIfNeeded(makeArrayRef(std::begin(NewBlocks), NewBlocksEnd));
- addToParentLoopIfNeeded(PreLoop.Blocks);
- addToParentLoopIfNeeded(PostLoop.Blocks);
+
+ DT.recalculate(F);
+
+ if (!PreLoop.Blocks.empty()) {
+ auto *L = createClonedLoopStructure(
+ &OriginalLoop, OriginalLoop.getParentLoop(), PreLoop.Map);
+ formLCSSARecursively(*L, DT, &LI, &SE);
+ simplifyLoop(L, &DT, &LI, &SE, nullptr, true);
+ // Pre loops are slow paths, we do not need to perform any loop
+ // optimizations on them.
+ DisableAllLoopOptsOnLoop(*L);
+ }
+
+ if (!PostLoop.Blocks.empty()) {
+ auto *L = createClonedLoopStructure(
+ &OriginalLoop, OriginalLoop.getParentLoop(), PostLoop.Map);
+ formLCSSARecursively(*L, DT, &LI, &SE);
+ simplifyLoop(L, &DT, &LI, &SE, nullptr, true);
+ // Post loops are slow paths, we do not need to perform any loop
+ // optimizations on them.
+ DisableAllLoopOptsOnLoop(*L);
+ }
+
+ formLCSSARecursively(OriginalLoop, DT, &LI, &SE);
+ simplifyLoop(&OriginalLoop, &DT, &LI, &SE, nullptr, true);
return true;
}
@@ -1439,8 +1525,9 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
if (!SafeIterRange.hasValue())
return false;
- LoopConstrainer LC(*L, getAnalysis<LoopInfoWrapperPass>().getLoopInfo(), LS,
- SE, SafeIterRange.getValue());
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopConstrainer LC(*L, getAnalysis<LoopInfoWrapperPass>().getLoopInfo(), LPM,
+ LS, SE, DT, SafeIterRange.getValue());
bool Changed = LC.run();
if (Changed) {
diff --git a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 55ffc23..1870c3d 100644
--- a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -134,7 +134,7 @@ bool JumpThreading::runOnFunction(Function &F) {
}
PreservedAnalyses JumpThreadingPass::run(Function &F,
- AnalysisManager<Function> &AM) {
+ FunctionAnalysisManager &AM) {
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &LVI = AM.getResult<LazyValueAnalysis>(F);
@@ -951,12 +951,17 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
// Scan a few instructions up from the load, to see if it is obviously live at
// the entry to its block.
BasicBlock::iterator BBIt(LI);
-
+ bool IsLoadCSE;
if (Value *AvailableVal =
- FindAvailableLoadedValue(LI, LoadBB, BBIt, DefMaxInstsToScan)) {
+ FindAvailableLoadedValue(LI, LoadBB, BBIt, DefMaxInstsToScan, nullptr, &IsLoadCSE)) {
// If the value of the load is locally available within the block, just use
// it. This frequently occurs for reg2mem'd allocas.
+ if (IsLoadCSE) {
+ LoadInst *NLI = cast<LoadInst>(AvailableVal);
+ combineMetadataForCSE(NLI, LI);
+ }
+
// If the returned value is the load itself, replace with an undef. This can
// only happen in dead loops.
if (AvailableVal == LI) AvailableVal = UndefValue::get(LI->getType());
@@ -983,6 +988,7 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
typedef SmallVector<std::pair<BasicBlock*, Value*>, 8> AvailablePredsTy;
AvailablePredsTy AvailablePreds;
BasicBlock *OneUnavailablePred = nullptr;
+ SmallVector<LoadInst*, 8> CSELoads;
// If we got here, the loaded value is transparent through to the start of the
// block. Check to see if it is available in any of the predecessor blocks.
@@ -993,17 +999,17 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
// Scan the predecessor to see if the value is available in the pred.
BBIt = PredBB->end();
- AAMDNodes ThisAATags;
Value *PredAvailable = FindAvailableLoadedValue(LI, PredBB, BBIt,
DefMaxInstsToScan,
- nullptr, &ThisAATags);
+ nullptr,
+ &IsLoadCSE);
if (!PredAvailable) {
OneUnavailablePred = PredBB;
continue;
}
- // If AA tags disagree or are not present, forget about them.
- if (AATags != ThisAATags) AATags = AAMDNodes();
+ if (IsLoadCSE)
+ CSELoads.push_back(cast<LoadInst>(PredAvailable));
// If so, this load is partially redundant. Remember this info so that we
// can create a PHI node.
@@ -1101,6 +1107,10 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
PN->addIncoming(PredV, I->first);
}
+ for (LoadInst *PredLI : CSELoads) {
+ combineMetadataForCSE(PredLI, LI);
+ }
+
LI->replaceAllUsesWith(PN);
LI->eraseFromParent();
@@ -1157,8 +1167,7 @@ FindMostPopularDest(BasicBlock *BB,
for (unsigned i = 0; ; ++i) {
assert(i != TI->getNumSuccessors() && "Didn't find any successor!");
- if (std::find(SamePopularity.begin(), SamePopularity.end(),
- TI->getSuccessor(i)) == SamePopularity.end())
+ if (!is_contained(SamePopularity, TI->getSuccessor(i)))
continue;
MostPopularDest = TI->getSuccessor(i);
@@ -1594,7 +1603,7 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
}
/// Create a new basic block that will be the predecessor of BB and successor of
-/// all blocks in Preds. When profile data is availble, update the frequency of
+/// all blocks in Preds. When profile data is available, update the frequency of
/// this new block.
BasicBlock *JumpThreadingPass::SplitBlockPreds(BasicBlock *BB,
ArrayRef<BasicBlock *> Preds,
@@ -1615,6 +1624,23 @@ BasicBlock *JumpThreadingPass::SplitBlockPreds(BasicBlock *BB,
return PredBB;
}
+bool JumpThreadingPass::doesBlockHaveProfileData(BasicBlock *BB) {
+ const TerminatorInst *TI = BB->getTerminator();
+ assert(TI->getNumSuccessors() > 1 && "not a split");
+
+ MDNode *WeightsNode = TI->getMetadata(LLVMContext::MD_prof);
+ if (!WeightsNode)
+ return false;
+
+ MDString *MDName = cast<MDString>(WeightsNode->getOperand(0));
+ if (MDName->getString() != "branch_weights")
+ return false;
+
+ // Ensure there are weights for all of the successors. Note that the first
+ // operand to the metadata node is a name, not a weight.
+ return WeightsNode->getNumOperands() == TI->getNumSuccessors() + 1;
+}
+
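doesBlockHaveProfileData checks for a "branch_weights" node carrying one weight per successor; operand 0 of the node is the name string, hence the "+ 1" in the operand count comparison. A minimal sketch of what creating such metadata looks like, purely for illustration:

  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/MDBuilder.h"
  using namespace llvm;

  static void attachExampleWeights(BasicBlock &BB) {
    // For a conditional branch with two successors the node ends up as
    // !{!"branch_weights", i32 90, i32 10}: three operands for two successors.
    MDBuilder MDB(BB.getContext());
    MDNode *Weights =
        MDB.createBranchWeights(/*TrueWeight=*/90, /*FalseWeight=*/10);
    BB.getTerminator()->setMetadata(LLVMContext::MD_prof, Weights);
  }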
/// Update the block frequency of BB and branch weight and the metadata on the
/// edge BB->SuccBB. This is done by scaling the weight of BB->SuccBB by 1 -
/// Freq(PredBB->BB) / Freq(BB->SuccBB).
@@ -1665,7 +1691,41 @@ void JumpThreadingPass::UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB,
for (int I = 0, E = BBSuccProbs.size(); I < E; I++)
BPI->setEdgeProbability(BB, I, BBSuccProbs[I]);
- if (BBSuccProbs.size() >= 2) {
+ // Update the profile metadata as well.
+ //
+ // Don't do this if the profile of the transformed blocks was statically
+ // estimated. (This could occur despite the function having an entry
+ // frequency in completely cold parts of the CFG.)
+ //
+ // In this case we don't want to suggest to subsequent passes that the
+ // calculated weights are fully consistent. Consider this graph:
+ //
+ // check_1
+ // 50% / |
+ // eq_1 | 50%
+ // \ |
+ // check_2
+ // 50% / |
+ // eq_2 | 50%
+ // \ |
+ // check_3
+ // 50% / |
+ // eq_3 | 50%
+ // \ |
+ //
+ // Assuming the blocks check_* all compare the same value against 1, 2 and 3,
+ // the overall probabilities are inconsistent; the total probability that the
+ // value is either 1, 2 or 3 is 150%.
+ //
+ // As a consequence if we thread eq_1 -> check_2 to check_3, check_2->check_3
+ // becomes 0%. This is even worse if the edge whose probability becomes 0% is
+ // the loop exit edge. Then based solely on static estimation we would assume
+ // the loop was extremely hot.
+ //
+ // FIXME: Fix this locally as well so that BPI and BFI stay consistent.
+ // We shouldn't make edges extremely likely or unlikely based solely on
+ // static estimation.
+ if (BBSuccProbs.size() >= 2 && doesBlockHaveProfileData(BB)) {
SmallVector<uint32_t, 4> Weights;
for (auto Prob : BBSuccProbs)
Weights.push_back(Prob.getNumerator());
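
The branch_weights check added in doesBlockHaveProfileData() only accepts profile metadata in the exact shape LLVM normally emits: an MDString name operand followed by one integer weight per successor. A minimal sketch of how such metadata is usually attached (the helper name and weights are illustrative, not part of this change):

  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/MDBuilder.h"
  using namespace llvm;

  // Attaches !prof !{!"branch_weights", i32 TakenW, i32 NotTakenW} to a
  // two-successor branch. doesBlockHaveProfileData() above checks for the
  // name operand plus one weight per successor before trusting the data.
  static void attachBranchWeights(BranchInst *BI, uint32_t TakenW,
                                  uint32_t NotTakenW) {
    MDBuilder MDB(BI->getContext());
    BI->setMetadata(LLVMContext::MD_prof,
                    MDB.createBranchWeights(TakenW, NotTakenW));
  }
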
diff --git a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
index cdd17fc..f51d11c 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -41,8 +41,8 @@
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/LoopPassManager.h"
#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
@@ -61,6 +61,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -84,14 +85,17 @@ static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
const LoopSafetyInfo *SafetyInfo);
static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
- const LoopSafetyInfo *SafetyInfo);
+ const LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE);
static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
const Loop *CurLoop, AliasSetTracker *CurAST,
- const LoopSafetyInfo *SafetyInfo);
-static bool isSafeToExecuteUnconditionally(const Instruction &Inst,
+ const LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE);
+static bool isSafeToExecuteUnconditionally(Instruction &Inst,
const DominatorTree *DT,
const Loop *CurLoop,
const LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE,
const Instruction *CtxI = nullptr);
static bool pointerInvalidatedByLoop(Value *V, uint64_t Size,
const AAMDNodes &AAInfo,
@@ -100,15 +104,12 @@ static Instruction *
CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
const LoopInfo *LI,
const LoopSafetyInfo *SafetyInfo);
-static bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA,
- DominatorTree *DT, TargetLibraryInfo *TLI,
- Loop *CurLoop, AliasSetTracker *CurAST,
- LoopSafetyInfo *SafetyInfo);
namespace {
struct LoopInvariantCodeMotion {
bool runOnLoop(Loop *L, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT,
- TargetLibraryInfo *TLI, ScalarEvolution *SE, bool DeleteAST);
+ TargetLibraryInfo *TLI, ScalarEvolution *SE,
+ OptimizationRemarkEmitter *ORE, bool DeleteAST);
DenseMap<Loop *, AliasSetTracker *> &getLoopToAliasSetMap() {
return LoopToAliasSetMap;
@@ -128,16 +129,27 @@ struct LegacyLICMPass : public LoopPass {
}
bool runOnLoop(Loop *L, LPPassManager &LPM) override {
- if (skipLoop(L))
+ if (skipLoop(L)) {
+ // If we have run LICM on a previous loop but now we are skipping
+ // (because we've hit the opt-bisect limit), we need to clear the
+ // loop alias information.
+ for (auto &LTAS : LICM.getLoopToAliasSetMap())
+ delete LTAS.second;
+ LICM.getLoopToAliasSetMap().clear();
return false;
+ }
auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
+ // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but ORE cannot be preserved (see comment before the pass definition).
+ OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
return LICM.runOnLoop(L,
&getAnalysis<AAResultsWrapperPass>().getAAResults(),
&getAnalysis<LoopInfoWrapperPass>().getLoopInfo(),
&getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
&getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
- SE ? &SE->getSE() : nullptr, false);
+ SE ? &SE->getSE() : nullptr, &ORE, false);
}
/// This transformation requires natural loop information & requires that
@@ -173,21 +185,20 @@ private:
};
}
-PreservedAnalyses LICMPass::run(Loop &L, AnalysisManager<Loop> &AM) {
+PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR, LPMUpdater &) {
const auto &FAM =
- AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager();
+ AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
Function *F = L.getHeader()->getParent();
- auto *AA = FAM.getCachedResult<AAManager>(*F);
- auto *LI = FAM.getCachedResult<LoopAnalysis>(*F);
- auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F);
- auto *TLI = FAM.getCachedResult<TargetLibraryAnalysis>(*F);
- auto *SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(*F);
- assert((AA && LI && DT && TLI && SE) && "Analyses for LICM not available");
+ auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F);
+ // FIXME: This should probably be optional rather than required.
+ if (!ORE)
+ report_fatal_error("LICM: OptimizationRemarkEmitterAnalysis not "
+ "cached at a higher level");
LoopInvariantCodeMotion LICM;
-
- if (!LICM.runOnLoop(&L, AA, LI, DT, TLI, SE, true))
+ if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.TLI, &AR.SE, ORE, true))
return PreservedAnalyses::all();
// FIXME: There is no setPreservesCFG in the new PM. When that becomes
@@ -214,7 +225,9 @@ Pass *llvm::createLICMPass() { return new LegacyLICMPass(); }
bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA,
LoopInfo *LI, DominatorTree *DT,
TargetLibraryInfo *TLI,
- ScalarEvolution *SE, bool DeleteAST) {
+ ScalarEvolution *SE,
+ OptimizationRemarkEmitter *ORE,
+ bool DeleteAST) {
bool Changed = false;
assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form.");
@@ -240,31 +253,54 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA,
//
if (L->hasDedicatedExits())
Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L,
- CurAST, &SafetyInfo);
+ CurAST, &SafetyInfo, ORE);
if (Preheader)
Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L,
- CurAST, &SafetyInfo);
+ CurAST, &SafetyInfo, ORE);
// Now that all loop invariants have been removed from the loop, promote any
// memory references to scalars that we can.
- if (!DisablePromotion && (Preheader || L->hasDedicatedExits())) {
+ // Don't sink stores from loops without dedicated block exits. Exits
+ // containing indirect branches are not transformed by loop simplify,
+ // so make sure we catch that. An additional load may be generated in the
+ // preheader for SSA updater, so also avoid sinking when no preheader
+ // is available.
+ if (!DisablePromotion && Preheader && L->hasDedicatedExits()) {
+ // Figure out the loop exits and their insertion points
SmallVector<BasicBlock *, 8> ExitBlocks;
- SmallVector<Instruction *, 8> InsertPts;
- PredIteratorCache PIC;
-
- // Loop over all of the alias sets in the tracker object.
- for (AliasSet &AS : *CurAST)
- Changed |= promoteLoopAccessesToScalars(
- AS, ExitBlocks, InsertPts, PIC, LI, DT, TLI, L, CurAST, &SafetyInfo);
-
- // Once we have promoted values across the loop body we have to recursively
- // reform LCSSA as any nested loop may now have values defined within the
- // loop used in the outer loop.
- // FIXME: This is really heavy handed. It would be a bit better to use an
- // SSAUpdater strategy during promotion that was LCSSA aware and reformed
- // it as it went.
- if (Changed) {
- formLCSSARecursively(*L, *DT, LI, SE);
+ L->getUniqueExitBlocks(ExitBlocks);
+
+ // We can't insert into a catchswitch.
+ bool HasCatchSwitch = llvm::any_of(ExitBlocks, [](BasicBlock *Exit) {
+ return isa<CatchSwitchInst>(Exit->getTerminator());
+ });
+
+ if (!HasCatchSwitch) {
+ SmallVector<Instruction *, 8> InsertPts;
+ InsertPts.reserve(ExitBlocks.size());
+ for (BasicBlock *ExitBlock : ExitBlocks)
+ InsertPts.push_back(&*ExitBlock->getFirstInsertionPt());
+
+ PredIteratorCache PIC;
+
+ bool Promoted = false;
+
+ // Loop over all of the alias sets in the tracker object.
+ for (AliasSet &AS : *CurAST)
+ Promoted |=
+ promoteLoopAccessesToScalars(AS, ExitBlocks, InsertPts, PIC, LI, DT,
+ TLI, L, CurAST, &SafetyInfo, ORE);
+
+ // Once we have promoted values across the loop body we have to
+ // recursively reform LCSSA as any nested loop may now have values defined
+ // within the loop used in the outer loop.
+ // FIXME: This is really heavy handed. It would be a bit better to use an
+ // SSAUpdater strategy during promotion that was LCSSA aware and reformed
+ // it as it went.
+ if (Promoted)
+ formLCSSARecursively(*L, *DT, LI, SE);
+
+ Changed |= Promoted;
}
}
@@ -294,7 +330,8 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA,
///
bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop,
- AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo) {
+ AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE) {
// Verify inputs.
assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr &&
@@ -310,7 +347,8 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
bool Changed = false;
const std::vector<DomTreeNode *> &Children = N->getChildren();
for (DomTreeNode *Child : Children)
- Changed |= sinkRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo);
+ Changed |=
+ sinkRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo, ORE);
// Only need to process the contents of this block if it is not part of a
// subloop (which would already have been processed).
@@ -337,9 +375,9 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
// operands of the instruction are loop invariant.
//
if (isNotUsedInLoop(I, CurLoop, SafetyInfo) &&
- canSinkOrHoistInst(I, AA, DT, TLI, CurLoop, CurAST, SafetyInfo)) {
+ canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE)) {
++II;
- Changed |= sink(I, LI, DT, CurLoop, CurAST, SafetyInfo);
+ Changed |= sink(I, LI, DT, CurLoop, CurAST, SafetyInfo, ORE);
}
}
return Changed;
@@ -352,7 +390,8 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
///
bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop,
- AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo) {
+ AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE) {
// Verify inputs.
assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr &&
CurLoop != nullptr && CurAST != nullptr && SafetyInfo != nullptr &&
@@ -382,6 +421,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
CurAST->deleteValue(&I);
I.eraseFromParent();
}
+ Changed = true;
continue;
}
@@ -390,16 +430,17 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
// is safe to hoist the instruction.
//
if (CurLoop->hasLoopInvariantOperands(&I) &&
- canSinkOrHoistInst(I, AA, DT, TLI, CurLoop, CurAST, SafetyInfo) &&
+ canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE) &&
isSafeToExecuteUnconditionally(
- I, DT, CurLoop, SafetyInfo,
+ I, DT, CurLoop, SafetyInfo, ORE,
CurLoop->getLoopPreheader()->getTerminator()))
- Changed |= hoist(I, DT, CurLoop, SafetyInfo);
+ Changed |= hoist(I, DT, CurLoop, SafetyInfo, ORE);
}
const std::vector<DomTreeNode *> &Children = N->getChildren();
for (DomTreeNode *Child : Children)
- Changed |= hoistRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo);
+ Changed |=
+ hoistRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo, ORE);
return Changed;
}
@@ -436,12 +477,10 @@ void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) {
SafetyInfo->BlockColors = colorEHFunclets(*Fn);
}
-/// canSinkOrHoistInst - Return true if the hoister and sinker can handle this
-/// instruction.
-///
-bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, DominatorTree *DT,
- TargetLibraryInfo *TLI, Loop *CurLoop,
- AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo) {
+bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
+ Loop *CurLoop, AliasSetTracker *CurAST,
+ LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE) {
// Loads have extra constraints we have to verify before we can hoist them.
if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
if (!LI->isUnordered())
@@ -462,7 +501,17 @@ bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, DominatorTree *DT,
AAMDNodes AAInfo;
LI->getAAMetadata(AAInfo);
- return !pointerInvalidatedByLoop(LI->getOperand(0), Size, AAInfo, CurAST);
+ bool Invalidated =
+ pointerInvalidatedByLoop(LI->getOperand(0), Size, AAInfo, CurAST);
+ // Check loop-invariant address because this may also be a sinkable load
+ // whose address is not necessarily loop-invariant.
+ if (ORE && Invalidated && CurLoop->isLoopInvariant(LI->getPointerOperand()))
+ ORE->emit(OptimizationRemarkMissed(
+ DEBUG_TYPE, "LoadWithLoopInvariantAddressInvalidated", LI)
+ << "failed to move load with loop-invariant address "
+ "because the loop may invalidate its value");
+
+ return !Invalidated;
} else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
// Don't sink or hoist dbg info; it's legal, but not useful.
if (isa<DbgInfoIntrinsic>(I))
@@ -515,6 +564,11 @@ bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, DominatorTree *DT,
!isa<InsertValueInst>(I))
return false;
+ // SafetyInfo is nullptr if we are checking for sinking from preheader to
+ // loop body. It is always safe, as there is no speculative execution.
+ if (!SafetyInfo)
+ return true;
+
// TODO: Plumb the context instruction through to make hoisting and sinking
// more powerful. Hoisting of loads already works due to the special casing
// above.
@@ -651,8 +705,11 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
///
static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
const Loop *CurLoop, AliasSetTracker *CurAST,
- const LoopSafetyInfo *SafetyInfo) {
+ const LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE) {
DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
+ ORE->emit(OptimizationRemark(DEBUG_TYPE, "InstSunk", &I)
+ << "sinking " << ore::NV("Inst", &I));
bool Changed = false;
if (isa<LoadInst>(I))
++NumMovedLoads;
@@ -719,10 +776,13 @@ static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
/// is safe to hoist, this instruction is called to do the dirty work.
///
static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
- const LoopSafetyInfo *SafetyInfo) {
+ const LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE) {
auto *Preheader = CurLoop->getLoopPreheader();
DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I
<< "\n");
+ ORE->emit(OptimizationRemark(DEBUG_TYPE, "Hoisted", &I)
+ << "hosting " << ore::NV("Inst", &I));
// Metadata can be dependent on conditions we are hoisting above.
// Conservatively strip all metadata on the instruction unless we were
@@ -738,6 +798,14 @@ static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
// Move the new node to the Preheader, before its terminator.
I.moveBefore(Preheader->getTerminator());
+ // Do not retain debug locations when we are moving instructions to different
+ // basic blocks, because we want to avoid jumpy line tables. Calls, however,
+ // need to retain their debug locs because they may be inlined.
+ // FIXME: How do we retain source locations without causing poor debugging
+ // behavior?
+ if (!isa<CallInst>(I))
+ I.setDebugLoc(DebugLoc());
+
if (isa<LoadInst>(I))
++NumMovedLoads;
else if (isa<CallInst>(I))
@@ -749,15 +817,28 @@ static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
/// Only sink or hoist an instruction if it is not a trapping instruction,
/// or if the instruction is known not to trap when moved to the preheader.
/// or if it is a trapping instruction and is guaranteed to execute.
-static bool isSafeToExecuteUnconditionally(const Instruction &Inst,
+static bool isSafeToExecuteUnconditionally(Instruction &Inst,
const DominatorTree *DT,
const Loop *CurLoop,
const LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE,
const Instruction *CtxI) {
if (isSafeToSpeculativelyExecute(&Inst, CtxI, DT))
return true;
- return isGuaranteedToExecute(Inst, DT, CurLoop, SafetyInfo);
+ bool GuaranteedToExecute =
+ isGuaranteedToExecute(Inst, DT, CurLoop, SafetyInfo);
+
+ if (!GuaranteedToExecute) {
+ auto *LI = dyn_cast<LoadInst>(&Inst);
+ if (LI && CurLoop->isLoopInvariant(LI->getPointerOperand()))
+ ORE->emit(OptimizationRemarkMissed(
+ DEBUG_TYPE, "LoadWithLoopInvariantAddressCondExecuted", LI)
+ << "failed to hoist load with loop-invariant address "
+ "because load is conditionally executed");
+ }
+
+ return GuaranteedToExecute;
}
namespace {
@@ -845,7 +926,8 @@ bool llvm::promoteLoopAccessesToScalars(
AliasSet &AS, SmallVectorImpl<BasicBlock *> &ExitBlocks,
SmallVectorImpl<Instruction *> &InsertPts, PredIteratorCache &PIC,
LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI,
- Loop *CurLoop, AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo) {
+ Loop *CurLoop, AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE) {
// Verify inputs.
assert(LI != nullptr && DT != nullptr && CurLoop != nullptr &&
CurAST != nullptr && SafetyInfo != nullptr &&
@@ -876,23 +958,33 @@ bool llvm::promoteLoopAccessesToScalars(
// is not safe, because *P may only be valid to access if 'c' is true.
//
// The safety property divides into two parts:
- // 1) The memory may not be dereferenceable on entry to the loop. In this
+ // p1) The memory may not be dereferenceable on entry to the loop. In this
// case, we can't insert the required load in the preheader.
- // 2) The memory model does not allow us to insert a store along any dynamic
+ // p2) The memory model does not allow us to insert a store along any dynamic
// path which did not originally have one.
//
- // It is safe to promote P if all uses are direct load/stores and if at
- // least one is guaranteed to be executed.
- bool GuaranteedToExecute = false;
-
- // It is also safe to promote P if we can prove that speculating a load into
- // the preheader is safe (i.e. proving dereferenceability on all
- // paths through the loop), and that the memory can be proven thread local
- // (so that the memory model requirement doesn't apply.) We first establish
- // the former, and then run a capture analysis below to establish the later.
- // We can use any access within the alias set to prove dereferenceability
+ // If at least one store is guaranteed to execute, both properties are
+ // satisfied, and promotion is legal.
+ //
+ // This, however, is not a necessary condition. Even if no store/load is
+ // guaranteed to execute, we can still establish these properties.
+ // We can establish (p1) by proving that hoisting the load into the preheader
+ // is safe (i.e. proving dereferenceability on all paths through the loop). We
+ // can use any access within the alias set to prove dereferenceability,
// since they're all must alias.
- bool CanSpeculateLoad = false;
+ //
+ // There are two ways to establish (p2):
+ // a) Prove the location is thread-local. In this case the memory model
+ // requirement does not apply, and stores are safe to insert.
+ // b) Prove a store dominates every exit block. In this case, if an exit
+ // block is reached, the original dynamic path would have taken us through
+ // the store, so inserting a store into the exit block is safe. Note that this
+ // is different from the store being guaranteed to execute. For instance,
+ // if an exception is thrown on the first iteration of the loop, the original
+ // store is never executed, but the exit blocks are not executed either.
+
+ bool DereferenceableInPH = false;
+ bool SafeToInsertStore = false;
SmallVector<Instruction *, 64> LoopUses;
SmallPtrSet<Value *, 4> PointerMustAliases;
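
The properties (p1) and (p2) spelled out above map onto a familiar source-level picture. A hypothetical C++ illustration (function names are made up), assuming the loops reach LICM in the obvious IR form:

  // Promotable: P is loop-invariant and the load/store executes on every
  // iteration, so *P is dereferenceable in the preheader (p1) and a store on
  // the exit path only repeats a store the original program made (p2).
  void accumulate(int *P, int N) {
    for (int I = 0; I < N; ++I)
      *P += I;
  }

  // Needs the extra checks: the store is conditional, so an unconditional
  // store in the exit block could introduce a write the original program
  // never performed, unless *P is provably thread-local (e.g. a non-escaping
  // allocation) or some store dominates every exit.
  void accumulateSome(int *P, int N) {
    for (int I = 0; I < N; ++I)
      if (I % 3 == 0)
        *P += I;
  }
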
@@ -901,15 +993,6 @@ bool llvm::promoteLoopAccessesToScalars(
// us to prove better alignment.
unsigned Alignment = 1;
AAMDNodes AATags;
- bool HasDedicatedExits = CurLoop->hasDedicatedExits();
-
- // Don't sink stores from loops without dedicated block exits. Exits
- // containing indirect branches are not transformed by loop simplify,
- // make sure we catch that. An additional load may be generated in the
- // preheader for SSA updater, so also avoid sinking when no preheader
- // is available.
- if (!HasDedicatedExits || !Preheader)
- return false;
const DataLayout &MDL = Preheader->getModule()->getDataLayout();
@@ -926,7 +1009,6 @@ bool llvm::promoteLoopAccessesToScalars(
// Check that all of the pointers in the alias set have the same type. We
// cannot (yet) promote a memory location that is loaded and stored in
// different sizes. While we are at it, collect alignment and AA info.
- bool Changed = false;
for (const auto &ASI : AS) {
Value *ASIV = ASI.getValue();
PointerMustAliases.insert(ASIV);
@@ -935,7 +1017,7 @@ bool llvm::promoteLoopAccessesToScalars(
// cannot (yet) promote a memory location that is loaded and stored in
// different sizes.
if (SomePtr->getType() != ASIV->getType())
- return Changed;
+ return false;
for (User *U : ASIV->users()) {
// Ignore instructions that are outside the loop.
@@ -945,14 +1027,14 @@ bool llvm::promoteLoopAccessesToScalars(
// If there is a non-load/store instruction in the loop, we can't promote
// it.
- if (const LoadInst *Load = dyn_cast<LoadInst>(UI)) {
+ if (LoadInst *Load = dyn_cast<LoadInst>(UI)) {
assert(!Load->isVolatile() && "AST broken");
if (!Load->isSimple())
- return Changed;
+ return false;
- if (!GuaranteedToExecute && !CanSpeculateLoad)
- CanSpeculateLoad = isSafeToExecuteUnconditionally(
- *Load, DT, CurLoop, SafetyInfo, Preheader->getTerminator());
+ if (!DereferenceableInPH)
+ DereferenceableInPH = isSafeToExecuteUnconditionally(
+ *Load, DT, CurLoop, SafetyInfo, ORE, Preheader->getTerminator());
} else if (const StoreInst *Store = dyn_cast<StoreInst>(UI)) {
// Stores *of* the pointer are not interesting, only stores *to* the
// pointer.
@@ -960,35 +1042,47 @@ bool llvm::promoteLoopAccessesToScalars(
continue;
assert(!Store->isVolatile() && "AST broken");
if (!Store->isSimple())
- return Changed;
-
- // Note that we only check GuaranteedToExecute inside the store case
- // so that we do not introduce stores where they did not exist before
- // (which would break the LLVM concurrency model).
+ return false;
- // If the alignment of this instruction allows us to specify a more
- // restrictive (and performant) alignment and if we are sure this
- // instruction will be executed, update the alignment.
- // Larger is better, with the exception of 0 being the best alignment.
+ // If the store is guaranteed to execute, both properties are satisfied.
+ // We may want to check if a store is guaranteed to execute even if we
+ // already know that promotion is safe, since it may have higher
+ // alignment than any other guaranteed stores, in which case we can
+ // raise the alignment on the promoted store.
unsigned InstAlignment = Store->getAlignment();
- if ((InstAlignment > Alignment || InstAlignment == 0) &&
- Alignment != 0) {
+ if (!InstAlignment)
+ InstAlignment =
+ MDL.getABITypeAlignment(Store->getValueOperand()->getType());
+
+ if (!DereferenceableInPH || !SafeToInsertStore ||
+ (InstAlignment > Alignment)) {
if (isGuaranteedToExecute(*UI, DT, CurLoop, SafetyInfo)) {
- GuaranteedToExecute = true;
- Alignment = InstAlignment;
+ DereferenceableInPH = true;
+ SafeToInsertStore = true;
+ Alignment = std::max(Alignment, InstAlignment);
}
- } else if (!GuaranteedToExecute) {
- GuaranteedToExecute =
- isGuaranteedToExecute(*UI, DT, CurLoop, SafetyInfo);
}
- if (!GuaranteedToExecute && !CanSpeculateLoad) {
- CanSpeculateLoad = isDereferenceableAndAlignedPointer(
+ // If a store dominates all exit blocks, it is safe to sink.
+ // As explained above, if an exit block was executed, a dominating
+ // store must have been executed at least once, so we are not
+ // introducing stores on paths that did not have them.
+ // Note that this only looks at explicit exit blocks. If we ever
+ // start sinking stores into unwind edges (see above), this will break.
+ if (!SafeToInsertStore)
+ SafeToInsertStore = llvm::all_of(ExitBlocks, [&](BasicBlock *Exit) {
+ return DT->dominates(Store->getParent(), Exit);
+ });
+
+ // If the store is not guaranteed to execute, we may still get
+ // deref info through it.
+ if (!DereferenceableInPH) {
+ DereferenceableInPH = isDereferenceableAndAlignedPointer(
Store->getPointerOperand(), Store->getAlignment(), MDL,
Preheader->getTerminator(), DT);
}
} else
- return Changed; // Not a load or store.
+ return false; // Not a load or store.
// Merge the AA tags.
if (LoopUses.empty()) {
@@ -1002,38 +1096,32 @@ bool llvm::promoteLoopAccessesToScalars(
}
}
- // Check legality per comment above. Otherwise, we can't promote.
- bool PromotionIsLegal = GuaranteedToExecute;
- if (!PromotionIsLegal && CanSpeculateLoad) {
- // If this is a thread local location, then we can insert stores along
- // paths which originally didn't have them without violating the memory
- // model.
- Value *Object = GetUnderlyingObject(SomePtr, MDL);
- PromotionIsLegal =
- isAllocLikeFn(Object, TLI) && !PointerMayBeCaptured(Object, true, true);
- }
- if (!PromotionIsLegal)
- return Changed;
- // Figure out the loop exits and their insertion points, if this is the
- // first promotion.
- if (ExitBlocks.empty()) {
- CurLoop->getUniqueExitBlocks(ExitBlocks);
- InsertPts.clear();
- InsertPts.reserve(ExitBlocks.size());
- for (BasicBlock *ExitBlock : ExitBlocks)
- InsertPts.push_back(&*ExitBlock->getFirstInsertionPt());
+ // If we couldn't prove we can hoist the load, bail.
+ if (!DereferenceableInPH)
+ return false;
+
+ // We know we can hoist the load, but don't have a guaranteed store.
+ // Check whether the location is thread-local. If it is, then we can insert
+ // stores along paths which originally didn't have them without violating the
+ // memory model.
+ if (!SafeToInsertStore) {
+ Value *Object = GetUnderlyingObject(SomePtr, MDL);
+ SafeToInsertStore =
+ (isAllocLikeFn(Object, TLI) || isa<AllocaInst>(Object)) &&
+ !PointerMayBeCaptured(Object, true, true);
}
- // Can't insert into a catchswitch.
- for (BasicBlock *ExitBlock : ExitBlocks)
- if (isa<CatchSwitchInst>(ExitBlock->getTerminator()))
- return Changed;
+ // If we've still failed to prove we can sink the store, give up.
+ if (!SafeToInsertStore)
+ return false;
// Otherwise, this is safe to promote, lets do it!
DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " << *SomePtr
<< '\n');
- Changed = true;
+ ORE->emit(
+ OptimizationRemark(DEBUG_TYPE, "PromoteLoopAccessesToScalar", LoopUses[0])
+ << "Moving accesses to memory location out of the loop");
++NumPromoted;
// Grab a debug location for the inserted loads/stores; given that the
@@ -1066,13 +1154,13 @@ bool llvm::promoteLoopAccessesToScalars(
if (PreheaderLoad->use_empty())
PreheaderLoad->eraseFromParent();
- return Changed;
+ return true;
}
/// Returns an owning pointer to an alias set which incorporates aliasing info
/// from L and all subloops of L.
-/// FIXME: In new pass manager, there is no helper functions to handle loop
-/// analysis such as cloneBasicBlockAnalysis. So the AST needs to be recompute
+/// FIXME: In new pass manager, there is no helper function to handle loop
+/// analysis such as cloneBasicBlockAnalysis, so the AST needs to be recomputed
/// from scratch for every loop. Hook up with the helper functions when
/// available in the new pass manager to avoid redundant computation.
AliasSetTracker *
@@ -1108,10 +1196,7 @@ LoopInvariantCodeMotion::collectAliasInfoForLoop(Loop *L, LoopInfo *LI,
auto mergeLoop = [&](Loop *L) {
// Loop over the body of this loop, looking for calls, invokes, and stores.
- // Because subloops have already been incorporated into AST, we skip blocks
- // in subloops.
for (BasicBlock *BB : L->blocks())
- if (LI->getLoopFor(BB) == L) // Ignore blocks in subloops.
CurAST->add(*BB); // Incorporate the specified basic block
};
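
Because the rewritten LICMPass::run() calls report_fatal_error() when OptimizationRemarkEmitterAnalysis has not been cached at the function level, a hand-built new-PM pipeline has to require that analysis before entering the loop adaptor. A minimal sketch under that assumption (pipeline composition only, not code from this change):

  #include "llvm/Analysis/OptimizationDiagnosticInfo.h"
  #include "llvm/IR/PassManager.h"
  #include "llvm/Transforms/Scalar/LICM.h"
  #include "llvm/Transforms/Scalar/LoopPassManager.h"
  using namespace llvm;

  // Requiring ORE first guarantees that the cached-result lookup inside
  // LICMPass::run() succeeds when the loop adaptor runs LICM.
  FunctionPassManager buildLICMPipeline() {
    FunctionPassManager FPM;
    FPM.addPass(
        RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
    FPM.addPass(createFunctionToLoopPassAdaptor(LICMPass()));
    return FPM;
  }
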
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp b/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp
index dfe51a4..389f1c5 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp
@@ -44,9 +44,6 @@ struct PointerOffsetPair {
};
struct LoadPOPPair {
- LoadPOPPair() = default;
- LoadPOPPair(LoadInst *L, PointerOffsetPair P, unsigned O)
- : Load(L), POP(P), InsertOrder(O) {}
LoadInst *Load;
PointerOffsetPair POP;
/// \brief The new load needs to be created before the first load in IR order.
@@ -71,7 +68,7 @@ public:
AU.addPreserved<GlobalsAAWrapperPass>();
}
- const char *getPassName() const override { return LDCOMBINE_NAME; }
+ StringRef getPassName() const override { return LDCOMBINE_NAME; }
static char ID;
typedef IRBuilder<TargetFolder> BuilderTy;
@@ -264,7 +261,7 @@ bool LoadCombine::runOnBasicBlock(BasicBlock &BB) {
auto POP = getPointerOffsetPair(*LI);
if (!POP.Pointer)
continue;
- LoadMap[POP.Pointer].push_back(LoadPOPPair(LI, POP, Index++));
+ LoadMap[POP.Pointer].push_back({LI, std::move(POP), Index++});
AST.add(LI);
}
if (combineLoads(LoadMap))
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp
new file mode 100644
index 0000000..a64c991
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp
@@ -0,0 +1,25 @@
+//===- LoopAccessAnalysisPrinter.cpp - Loop Access Analysis Printer --------==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-accesses"
+
+PreservedAnalyses
+LoopAccessInfoPrinterPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR, LPMUpdater &) {
+ Function &F = *L.getHeader()->getParent();
+ auto &LAI = AM.getResult<LoopAccessAnalysis>(L, AR);
+ OS << "Loop access info in function '" << F.getName() << "':\n";
+ OS.indent(2) << L.getHeader()->getName() << ":\n";
+ LAI.print(OS, 4);
+ return PreservedAnalyses::all();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
index 66b59d2..d09af32 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
@@ -11,14 +11,16 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar/LoopDataPrefetch.h"
+
#define DEBUG_TYPE "loop-data-prefetch"
-#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
@@ -26,13 +28,13 @@
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CFG.h"
-#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
@@ -59,77 +61,89 @@ static cl::opt<unsigned> MaxPrefetchIterationsAhead(
STATISTIC(NumPrefetches, "Number of prefetches inserted");
-namespace llvm {
- void initializeLoopDataPrefetchPass(PassRegistry&);
-}
-
namespace {
- class LoopDataPrefetch : public FunctionPass {
- public:
- static char ID; // Pass ID, replacement for typeid
- LoopDataPrefetch() : FunctionPass(ID) {
- initializeLoopDataPrefetchPass(*PassRegistry::getPassRegistry());
- }
+/// Loop prefetch implementation class.
+class LoopDataPrefetch {
+public:
+ LoopDataPrefetch(AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE,
+ const TargetTransformInfo *TTI,
+ OptimizationRemarkEmitter *ORE)
+ : AC(AC), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {}
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- // FIXME: For some reason, preserving SE here breaks LSR (even if
- // this pass changes nothing).
- // AU.addPreserved<ScalarEvolutionWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- }
+ bool run();
- bool runOnFunction(Function &F) override;
+private:
+ bool runOnLoop(Loop *L);
- private:
- bool runOnLoop(Loop *L);
+ /// \brief Check if the stride of the accesses is large enough to
+ /// warrant a prefetch.
+ bool isStrideLargeEnough(const SCEVAddRecExpr *AR);
- /// \brief Check if the the stride of the accesses is large enough to
- /// warrant a prefetch.
- bool isStrideLargeEnough(const SCEVAddRecExpr *AR);
+ unsigned getMinPrefetchStride() {
+ if (MinPrefetchStride.getNumOccurrences() > 0)
+ return MinPrefetchStride;
+ return TTI->getMinPrefetchStride();
+ }
- unsigned getMinPrefetchStride() {
- if (MinPrefetchStride.getNumOccurrences() > 0)
- return MinPrefetchStride;
- return TTI->getMinPrefetchStride();
- }
+ unsigned getPrefetchDistance() {
+ if (PrefetchDistance.getNumOccurrences() > 0)
+ return PrefetchDistance;
+ return TTI->getPrefetchDistance();
+ }
- unsigned getPrefetchDistance() {
- if (PrefetchDistance.getNumOccurrences() > 0)
- return PrefetchDistance;
- return TTI->getPrefetchDistance();
- }
+ unsigned getMaxPrefetchIterationsAhead() {
+ if (MaxPrefetchIterationsAhead.getNumOccurrences() > 0)
+ return MaxPrefetchIterationsAhead;
+ return TTI->getMaxPrefetchIterationsAhead();
+ }
- unsigned getMaxPrefetchIterationsAhead() {
- if (MaxPrefetchIterationsAhead.getNumOccurrences() > 0)
- return MaxPrefetchIterationsAhead;
- return TTI->getMaxPrefetchIterationsAhead();
- }
+ AssumptionCache *AC;
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+ const TargetTransformInfo *TTI;
+ OptimizationRemarkEmitter *ORE;
+};
+
+/// Legacy class for inserting loop data prefetches.
+class LoopDataPrefetchLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopDataPrefetchLegacyPass() : FunctionPass(ID) {
+ initializeLoopDataPrefetchLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ // FIXME: For some reason, preserving SE here breaks LSR (even if
+ // this pass changes nothing).
+ // AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
- AssumptionCache *AC;
- LoopInfo *LI;
- ScalarEvolution *SE;
- const TargetTransformInfo *TTI;
- const DataLayout *DL;
+ bool runOnFunction(Function &F) override;
};
}
-char LoopDataPrefetch::ID = 0;
-INITIALIZE_PASS_BEGIN(LoopDataPrefetch, "loop-data-prefetch",
+char LoopDataPrefetchLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopDataPrefetchLegacyPass, "loop-data-prefetch",
"Loop Data Prefetch", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_END(LoopDataPrefetch, "loop-data-prefetch",
+INITIALIZE_PASS_END(LoopDataPrefetchLegacyPass, "loop-data-prefetch",
"Loop Data Prefetch", false, false)
-FunctionPass *llvm::createLoopDataPrefetchPass() { return new LoopDataPrefetch(); }
+FunctionPass *llvm::createLoopDataPrefetchPass() {
+ return new LoopDataPrefetchLegacyPass();
+}
bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR) {
unsigned TargetMinStride = getMinPrefetchStride();
@@ -147,16 +161,46 @@ bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR) {
return TargetMinStride <= AbsStride;
}
-bool LoopDataPrefetch::runOnFunction(Function &F) {
+PreservedAnalyses LoopDataPrefetchPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ LoopInfo *LI = &AM.getResult<LoopAnalysis>(F);
+ ScalarEvolution *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
+ AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F);
+ OptimizationRemarkEmitter *ORE =
+ &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ const TargetTransformInfo *TTI = &AM.getResult<TargetIRAnalysis>(F);
+
+ LoopDataPrefetch LDP(AC, LI, SE, TTI, ORE);
+ bool Changed = LDP.run();
+
+ if (Changed) {
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LoopAnalysis>();
+ return PA;
+ }
+
+ return PreservedAnalyses::all();
+}
+
+bool LoopDataPrefetchLegacyPass::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- DL = &F.getParent()->getDataLayout();
- AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ AssumptionCache *AC =
+ &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ OptimizationRemarkEmitter *ORE =
+ &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ const TargetTransformInfo *TTI =
+ &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+ LoopDataPrefetch LDP(AC, LI, SE, TTI, ORE);
+ return LDP.run();
+}
+bool LoopDataPrefetch::run() {
// If PrefetchDistance is not set, don't run the pass. This gives an
// opportunity for targets to run this pass for selected subtargets only
// (whose TTI sets PrefetchDistance).
@@ -185,19 +229,16 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
// Calculate the number of iterations ahead to prefetch
CodeMetrics Metrics;
- for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
- I != IE; ++I) {
-
+ for (const auto BB : L->blocks()) {
// If the loop already has prefetches, then assume that the user knows
// what they are doing and don't add any more.
- for (BasicBlock::iterator J = (*I)->begin(), JE = (*I)->end();
- J != JE; ++J)
- if (CallInst *CI = dyn_cast<CallInst>(J))
+ for (auto &I : *BB)
+ if (CallInst *CI = dyn_cast<CallInst>(&I))
if (Function *F = CI->getCalledFunction())
if (F->getIntrinsicID() == Intrinsic::prefetch)
return MadeChange;
- Metrics.analyzeBasicBlock(*I, *TTI, EphValues);
+ Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
}
unsigned LoopSize = Metrics.NumInsts;
if (!LoopSize)
@@ -210,23 +251,20 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
if (ItersAhead > getMaxPrefetchIterationsAhead())
return MadeChange;
- Function *F = L->getHeader()->getParent();
DEBUG(dbgs() << "Prefetching " << ItersAhead
<< " iterations ahead (loop size: " << LoopSize << ") in "
- << F->getName() << ": " << *L);
+ << L->getHeader()->getParent()->getName() << ": " << *L);
SmallVector<std::pair<Instruction *, const SCEVAddRecExpr *>, 16> PrefLoads;
- for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
- I != IE; ++I) {
- for (BasicBlock::iterator J = (*I)->begin(), JE = (*I)->end();
- J != JE; ++J) {
+ for (const auto BB : L->blocks()) {
+ for (auto &I : *BB) {
Value *PtrValue;
Instruction *MemI;
- if (LoadInst *LMemI = dyn_cast<LoadInst>(J)) {
+ if (LoadInst *LMemI = dyn_cast<LoadInst>(&I)) {
MemI = LMemI;
PtrValue = LMemI->getPointerOperand();
- } else if (StoreInst *SMemI = dyn_cast<StoreInst>(J)) {
+ } else if (StoreInst *SMemI = dyn_cast<StoreInst>(&I)) {
if (!PrefetchWrites) continue;
MemI = SMemI;
PtrValue = SMemI->getPointerOperand();
@@ -275,13 +313,13 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
PrefLoads.push_back(std::make_pair(MemI, LSCEVAddRec));
- Type *I8Ptr = Type::getInt8PtrTy((*I)->getContext(), PtrAddrSpace);
- SCEVExpander SCEVE(*SE, J->getModule()->getDataLayout(), "prefaddr");
+ Type *I8Ptr = Type::getInt8PtrTy(BB->getContext(), PtrAddrSpace);
+ SCEVExpander SCEVE(*SE, I.getModule()->getDataLayout(), "prefaddr");
Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, MemI);
IRBuilder<> Builder(MemI);
- Module *M = (*I)->getParent()->getParent();
- Type *I32 = Type::getInt32Ty((*I)->getContext());
+ Module *M = BB->getParent()->getParent();
+ Type *I32 = Type::getInt32Ty(BB->getContext());
Value *PrefetchFunc = Intrinsic::getDeclaration(M, Intrinsic::prefetch);
Builder.CreateCall(
PrefetchFunc,
@@ -291,9 +329,8 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
++NumPrefetches;
DEBUG(dbgs() << " Access: " << *PtrValue << ", SCEV: " << *LSCEV
<< "\n");
- emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F,
- MemI->getDebugLoc(), "prefetched memory access");
-
+ ORE->emit(OptimizationRemark(DEBUG_TYPE, "Prefetched", MemI)
+ << "prefetched memory access");
MadeChange = true;
}
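
The ItersAhead value computed above decides how far in advance the pass issues llvm.prefetch calls. At the source level the transformation amounts to the following sketch (plain C++ using Clang/GCC's __builtin_prefetch, which lowers to the same llvm.prefetch intrinsic; the constant 8 is only illustrative, the pass derives the distance from TTI and loop size):

  #include <cstddef>

  // Read-prefetch A[I + ItersAhead] while summing A[I]; rw=0 requests a read
  // prefetch, locality=3 asks to keep the line in all cache levels.
  double sumWithPrefetch(const double *A, std::size_t N) {
    const std::size_t ItersAhead = 8;
    double Sum = 0.0;
    for (std::size_t I = 0; I < N; ++I) {
      if (I + ItersAhead < N)
        __builtin_prefetch(&A[I + ItersAhead], /*rw=*/0, /*locality=*/3);
      Sum += A[I];
    }
    return Sum;
  }
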
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
index 19b2f89..cca75a3 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -19,9 +19,9 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/LoopPassManager.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;
@@ -215,15 +215,10 @@ bool LoopDeletionPass::runImpl(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
return Changed;
}
-PreservedAnalyses LoopDeletionPass::run(Loop &L, AnalysisManager<Loop> &AM) {
- auto &FAM = AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager();
- Function *F = L.getHeader()->getParent();
-
- auto &DT = *FAM.getCachedResult<DominatorTreeAnalysis>(*F);
- auto &SE = *FAM.getCachedResult<ScalarEvolutionAnalysis>(*F);
- auto &LI = *FAM.getCachedResult<LoopAnalysis>(*F);
-
- bool Changed = runImpl(&L, DT, SE, LI);
+PreservedAnalyses LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
+ bool Changed = runImpl(&L, AR.DT, AR.SE, AR.LI);
if (!Changed)
return PreservedAnalyses::all();
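
LoopDeletionPass::run() above shows the new loop-pass signature that all the loop passes in this change migrate to: the standard analyses arrive pre-computed in LoopStandardAnalysisResults instead of being fished out of a cached FunctionAnalysisManager. A skeleton of that interface for a hypothetical pass (MyLoopPass and its body are made up):

  #include "llvm/Analysis/LoopInfo.h"
  #include "llvm/Analysis/ScalarEvolution.h"
  #include "llvm/IR/Dominators.h"
  #include "llvm/IR/PassManager.h"
  #include "llvm/Transforms/Scalar/LoopPassManager.h"
  using namespace llvm;

  struct MyLoopPass : PassInfoMixin<MyLoopPass> {
    PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
                          LoopStandardAnalysisResults &AR, LPMUpdater &) {
      // AR hands us AA, AC, DT, LI, SE, TLI and TTI directly.
      if (!AR.SE.hasLoopInvariantBackedgeTakenCount(&L))
        return PreservedAnalyses::all();

      // ... transform the loop here ...

      PreservedAnalyses PA;
      PA.preserve<DominatorTreeAnalysis>();
      PA.preserve<LoopAnalysis>();
      return PA;
    }
  };
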
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
index 7eca28e..19716b2 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -28,15 +28,16 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPassManager.h"
#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
@@ -72,11 +73,10 @@ static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
"The maximum number of SCEV checks allowed for Loop "
"Distribution for loop marked with #pragma loop distribute(enable)"));
-// Note that the initial value for this depends on whether the pass is invoked
-// directly or from the optimization pipeline.
static cl::opt<bool> EnableLoopDistribute(
"enable-loop-distribute", cl::Hidden,
- cl::desc("Enable the new, experimental LoopDistribution Pass"));
+ cl::desc("Enable the new, experimental LoopDistribution Pass"),
+ cl::init(false));
STATISTIC(NumLoopsDistributed, "Number of loops distributed");
@@ -605,11 +605,13 @@ public:
DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName()
<< "\" checking " << *L << "\n");
- BasicBlock *PH = L->getLoopPreheader();
- if (!PH)
- return fail("no preheader");
if (!L->getExitBlock())
- return fail("multiple exit blocks");
+ return fail("MultipleExitBlocks", "multiple exit blocks");
+ if (!L->isLoopSimplifyForm())
+ return fail("NotLoopSimplifyForm",
+ "loop is not in loop-simplify form");
+
+ BasicBlock *PH = L->getLoopPreheader();
// LAA will check that we only have a single exiting block.
LAI = &GetLAA(*L);
@@ -617,11 +619,12 @@ public:
// Currently, we only distribute to isolate the part of the loop with
// dependence cycles to enable partial vectorization.
if (LAI->canVectorizeMemory())
- return fail("memory operations are safe for vectorization");
+ return fail("MemOpsCanBeVectorized",
+ "memory operations are safe for vectorization");
auto *Dependences = LAI->getDepChecker().getDependences();
if (!Dependences || Dependences->empty())
- return fail("no unsafe dependences to isolate");
+ return fail("NoUnsafeDeps", "no unsafe dependences to isolate");
InstPartitionContainer Partitions(L, LI, DT);
@@ -674,14 +677,16 @@ public:
DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
if (Partitions.getSize() < 2)
- return fail("cannot isolate unsafe dependencies");
+ return fail("CantIsolateUnsafeDeps",
+ "cannot isolate unsafe dependencies");
// Run the merge heuristics: Merge non-cyclic adjacent partitions since we
// should be able to vectorize these together.
Partitions.mergeBeforePopulating();
DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
if (Partitions.getSize() < 2)
- return fail("cannot isolate unsafe dependencies");
+ return fail("CantIsolateUnsafeDeps",
+ "cannot isolate unsafe dependencies");
// Now, populate the partitions with non-memory operations.
Partitions.populateUsedSet();
@@ -693,7 +698,8 @@ public:
DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
<< Partitions);
if (Partitions.getSize() < 2)
- return fail("cannot isolate unsafe dependencies");
+ return fail("CantIsolateUnsafeDeps",
+ "cannot isolate unsafe dependencies");
}
// Don't distribute the loop if we need too many SCEV run-time checks.
@@ -701,7 +707,8 @@ public:
if (Pred.getComplexity() > (IsForced.getValueOr(false)
? PragmaDistributeSCEVCheckThreshold
: DistributeSCEVCheckThreshold))
- return fail("too many SCEV run-time checks needed.\n");
+ return fail("TooManySCEVRuntimeChecks",
+ "too many SCEV run-time checks needed.\n");
DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
// We're done forming the partitions set up the reverse mapping from
@@ -742,36 +749,38 @@ public:
DEBUG(Partitions.printBlocks());
if (LDistVerify) {
- LI->verify();
+ LI->verify(*DT);
DT->verifyDomTree();
}
++NumLoopsDistributed;
// Report the success.
- emitOptimizationRemark(F->getContext(), LDIST_NAME, *F, L->getStartLoc(),
- "distributed loop");
+ ORE->emit(OptimizationRemark(LDIST_NAME, "Distribute", L->getStartLoc(),
+ L->getHeader())
+ << "distributed loop");
return true;
}
/// \brief Provide diagnostics then \return with false.
- bool fail(llvm::StringRef Message) {
+ bool fail(StringRef RemarkName, StringRef Message) {
LLVMContext &Ctx = F->getContext();
bool Forced = isForced().getValueOr(false);
DEBUG(dbgs() << "Skipping; " << Message << "\n");
// With Rpass-missed report that distribution failed.
- ORE->emitOptimizationRemarkMissed(
- LDIST_NAME, L,
- "loop not distributed: use -Rpass-analysis=loop-distribute for more "
- "info");
+ ORE->emit(
+ OptimizationRemarkMissed(LDIST_NAME, "NotDistributed", L->getStartLoc(),
+ L->getHeader())
+ << "loop not distributed: use -Rpass-analysis=loop-distribute for more "
+ "info");
// With Rpass-analysis report why. This is on by default if distribution
// was requested explicitly.
- emitOptimizationRemarkAnalysis(
- Ctx, Forced ? DiagnosticInfoOptimizationRemarkAnalysis::AlwaysPrint
- : LDIST_NAME,
- *F, L->getStartLoc(), Twine("loop not distributed: ") + Message);
+ ORE->emit(OptimizationRemarkAnalysis(
+ Forced ? OptimizationRemarkAnalysis::AlwaysPrint : LDIST_NAME,
+ RemarkName, L->getStartLoc(), L->getHeader())
+ << "loop not distributed: " << Message);
// Also issue a warning if distribution was requested explicitly but it
// failed.
@@ -865,8 +874,7 @@ private:
/// Shared implementation between new and old PMs.
static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT,
ScalarEvolution *SE, OptimizationRemarkEmitter *ORE,
- std::function<const LoopAccessInfo &(Loop &)> &GetLAA,
- bool ProcessAllLoops) {
+ std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {
// Build up a worklist of inner-loops to vectorize. This is necessary as the
// act of distributing a loop creates new loops and can invalidate iterators
// across the loops.
@@ -885,7 +893,7 @@ static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT,
// If distribution was forced for the specific loop to be
// enabled/disabled, follow that. Otherwise use the global flag.
- if (LDL.isForced().getValueOr(ProcessAllLoops))
+ if (LDL.isForced().getValueOr(EnableLoopDistribute))
Changed |= LDL.processLoop(GetLAA);
}
@@ -896,15 +904,8 @@ static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT,
/// \brief The pass class.
class LoopDistributeLegacy : public FunctionPass {
public:
- /// \p ProcessAllLoopsByDefault specifies whether loop distribution should be
- /// performed by default. Pass -enable-loop-distribute={0,1} overrides this
- /// default. We use this to keep LoopDistribution off by default when invoked
- /// from the optimization pipeline but on when invoked explicitly from opt.
- LoopDistributeLegacy(bool ProcessAllLoopsByDefault = true)
- : FunctionPass(ID), ProcessAllLoops(ProcessAllLoopsByDefault) {
+ LoopDistributeLegacy() : FunctionPass(ID) {
// The default is set by the caller.
- if (EnableLoopDistribute.getNumOccurrences() > 0)
- ProcessAllLoops = EnableLoopDistribute;
initializeLoopDistributeLegacyPass(*PassRegistry::getPassRegistry());
}
@@ -920,7 +921,7 @@ public:
std::function<const LoopAccessInfo &(Loop &)> GetLAA =
[&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
- return runImpl(F, LI, DT, SE, ORE, GetLAA, ProcessAllLoops);
+ return runImpl(F, LI, DT, SE, ORE, GetLAA);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -931,48 +932,46 @@ public:
AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
}
static char ID;
-
-private:
- /// \brief Whether distribution should be on in this function. The per-loop
- /// pragma can override this.
- bool ProcessAllLoops;
};
} // anonymous namespace
PreservedAnalyses LoopDistributePass::run(Function &F,
FunctionAnalysisManager &AM) {
- // FIXME: This does not currently match the behavior from the old PM.
- // ProcessAllLoops with the old PM defaults to true when invoked from opt and
- // false when invoked from the optimization pipeline.
- bool ProcessAllLoops = false;
- if (EnableLoopDistribute.getNumOccurrences() > 0)
- ProcessAllLoops = EnableLoopDistribute;
-
auto &LI = AM.getResult<LoopAnalysis>(F);
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ // We don't directly need these analyses but they're required for loop
+ // analyses so provide them below.
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+
auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
std::function<const LoopAccessInfo &(Loop &)> GetLAA =
[&](Loop &L) -> const LoopAccessInfo & {
- return LAM.getResult<LoopAccessAnalysis>(L);
+ LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI};
+ return LAM.getResult<LoopAccessAnalysis>(L, AR);
};
- bool Changed = runImpl(F, &LI, &DT, &SE, &ORE, GetLAA, ProcessAllLoops);
+ bool Changed = runImpl(F, &LI, &DT, &SE, &ORE, GetLAA);
if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserve<LoopAnalysis>();
PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<GlobalsAA>();
return PA;
}
char LoopDistributeLegacy::ID;
-static const char ldist_name[] = "Loop Distribition";
+static const char ldist_name[] = "Loop Distribution";
INITIALIZE_PASS_BEGIN(LoopDistributeLegacy, LDIST_NAME, ldist_name, false,
false)
@@ -984,7 +983,5 @@ INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, false)
namespace llvm {
-FunctionPass *createLoopDistributePass(bool ProcessAllLoopsByDefault) {
- return new LoopDistributeLegacy(ProcessAllLoopsByDefault);
-}
+FunctionPass *createLoopDistributePass() { return new LoopDistributeLegacy(); }
}
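
The isForced() check reads per-loop metadata, which is how a single loop can still opt in now that -enable-loop-distribute defaults to false. Assuming Clang's loop-distribute pragma, user-side code would look like this (not part of this change):

  // The pragma attaches llvm.loop.distribute.enable metadata, so isForced()
  // returns true for this loop even with the global flag off.
  void compute(int *a, const int *b, const int *c, int *d, const int *e,
               int n) {
  #pragma clang loop distribute(enable)
    for (int i = 1; i < n; ++i) {
      a[i] = b[i] + c[i];     // no loop-carried dependence; vectorizable
      d[i] = d[i - 1] * e[i]; // dependence cycle; isolated into its own loop
    }
  }
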
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 1468676..5fec51c 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -11,6 +11,12 @@
// non-loop form. In cases that this kicks in, it can be a significant
// performance win.
//
+// If compiling for code size we avoid idiom recognition if the resulting
+// code could be larger than the code for the original loop. One way this could
+// happen is if the loop is not removable after idiom recognition due to the
+// presence of non-idiom instructions. The initial implementation of the
+// heuristics applies to idioms in multi-block loops.
+//
//===----------------------------------------------------------------------===//
//
// TODO List:
@@ -40,7 +46,6 @@
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/LoopPassManager.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
@@ -55,6 +60,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
@@ -65,6 +71,12 @@ using namespace llvm;
STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
+static cl::opt<bool> UseLIRCodeSizeHeurs(
+ "use-lir-code-size-heurs",
+ cl::desc("Use loop idiom recognition code size heuristics when compiling"
+ "with -Os/-Oz"),
+ cl::init(true), cl::Hidden);
+
namespace {
class LoopIdiomRecognize {
@@ -76,6 +88,7 @@ class LoopIdiomRecognize {
TargetLibraryInfo *TLI;
const TargetTransformInfo *TTI;
const DataLayout *DL;
+ bool ApplyCodeSizeHeuristics;
public:
explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
@@ -117,8 +130,10 @@ private:
Instruction *TheStore,
SmallPtrSetImpl<Instruction *> &Stores,
const SCEVAddRecExpr *Ev, const SCEV *BECount,
- bool NegStride);
+ bool NegStride, bool IsLoopMemset = false);
bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
+ bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
+ bool IsLoopMemset = false);
/// @}
/// \name Noncountable Loop Idiom Handling
@@ -171,24 +186,12 @@ public:
};
} // End anonymous namespace.
-PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L,
- AnalysisManager<Loop> &AM) {
- const auto &FAM =
- AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager();
- Function *F = L.getHeader()->getParent();
-
- // Use getCachedResult because Loop pass cannot trigger a function analysis.
- auto *AA = FAM.getCachedResult<AAManager>(*F);
- auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F);
- auto *LI = FAM.getCachedResult<LoopAnalysis>(*F);
- auto *SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(*F);
- auto *TLI = FAM.getCachedResult<TargetLibraryAnalysis>(*F);
- const auto *TTI = FAM.getCachedResult<TargetIRAnalysis>(*F);
+PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
const auto *DL = &L.getHeader()->getModule()->getDataLayout();
- assert((AA && DT && LI && SE && TLI && TTI && DL) &&
- "Analyses for Loop Idiom Recognition not available");
- LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL);
+ LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, DL);
if (!LIR.runOnLoop(&L))
return PreservedAnalyses::all();
@@ -229,6 +232,10 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) {
if (Name == "memset" || Name == "memcpy")
return false;
+ // Determine if code size heuristics need to be applied.
+ ApplyCodeSizeHeuristics =
+ L->getHeader()->getParent()->optForSize() && UseLIRCodeSizeHeurs;
+
HasMemset = TLI->has(LibFunc::memset);
HasMemsetPattern = TLI->has(LibFunc::memset_pattern16);
HasMemcpy = TLI->has(LibFunc::memcpy);
@@ -689,7 +696,7 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
bool NegStride = SizeInBytes == -Stride;
return processLoopStridedStore(Pointer, (unsigned)SizeInBytes,
MSI->getAlignment(), SplatValue, MSI, MSIs, Ev,
- BECount, NegStride);
+ BECount, NegStride, /*IsLoopMemset=*/true);
}
/// mayLoopAccessLocation - Return true if the specified loop might access the
@@ -745,7 +752,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment,
Value *StoredVal, Instruction *TheStore,
SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
- const SCEV *BECount, bool NegStride) {
+ const SCEV *BECount, bool NegStride, bool IsLoopMemset) {
Value *SplatValue = isBytewiseValue(StoredVal);
Constant *PatternValue = nullptr;
@@ -786,6 +793,9 @@ bool LoopIdiomRecognize::processLoopStridedStore(
return false;
}
+ if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
+ return false;
+
// Okay, everything looks good, insert the memset.
// The # stored bytes is (BECount+1)*Size. Expand the trip count out to
@@ -917,6 +927,9 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
return false;
}
+ if (avoidLIRForMultiBlockLoop())
+ return false;
+
// Okay, everything is safe, we can transform this!
// The # stored bytes is (BECount+1)*Size. Expand the trip count out to
@@ -948,6 +961,23 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
return true;
}
+// When compiling for code size we avoid idiom recognition for a multi-block
+// loop unless it is a loop_memset idiom or a memset/memcpy idiom in a nested
+// loop.
+//
+bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
+ bool IsLoopMemset) {
+ if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
+ if (!CurLoop->getParentLoop() && (!IsMemset || !IsLoopMemset)) {
+ DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
+ << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
+ << " avoided: multi-block top-level loop\n");
+ return true;
+ }
+ }
+
+ return false;
+}
+
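As an illustration of the heuristic above, here is a minimal sketch of the kind of source loop it targets; the helper keep_alive and the exact block layout are assumptions for the example, not taken from this change.

void keep_alive(char *p); // assumed external call that cannot be removed

// The store of 0 is a memset idiom, but the conditional call keeps the loop
// alive and multi-block, so under -Os/-Oz forming the memset would only add
// code; avoidLIRForMultiBlockLoop() makes the pass skip it.
void init(char *buf, int n) {
  for (int i = 0; i < n; ++i) {
    buf[i] = 0;
    if ((i & 63) == 0)
      keep_alive(buf + i);
  }
}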
bool LoopIdiomRecognize::runOnNoncountableLoop() {
return recognizePopcount();
}
@@ -955,7 +985,7 @@ bool LoopIdiomRecognize::runOnNoncountableLoop() {
/// Check if the given conditional branch is based on the comparison between
/// a variable and zero, and if the variable is non-zero, the control yields to
/// the loop entry. If the branch matches the behavior, the variable involved
-/// in the comparion is returned. This function will be called to see if the
+/// in the comparison is returned. This function will be called to see if the
/// precondition and postcondition of the loop are in desirable form.
static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry) {
if (!BI || !BI->isConditional())
@@ -1139,9 +1169,7 @@ bool LoopIdiomRecognize::recognizePopcount() {
// It should have a preheader containing nothing but an unconditional branch.
BasicBlock *PH = CurLoop->getLoopPreheader();
- if (!PH)
- return false;
- if (&PH->front() != PH->getTerminator())
+ if (!PH || &PH->front() != PH->getTerminator())
return false;
auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
if (!EntryBI || EntryBI->isConditional())
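For context, recognizePopcount() and matchCondition() look for the classic count-set-bits loop; a minimal sketch of such an input (illustrative, not taken from this change):

int popcount(unsigned v) {
  int c = 0;
  while (v) {       // precondition branch: only enter the loop when v != 0
    v &= v - 1;     // clear the lowest set bit each iteration
    ++c;
  }
  return c;         // trip count equals the number of set bits -> ctpop(v)
}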
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
index 629cb87..69102d1 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -18,7 +18,6 @@
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/LoopPassManager.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/DataLayout.h"
@@ -26,6 +25,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;
@@ -183,20 +183,10 @@ public:
};
}
-PreservedAnalyses LoopInstSimplifyPass::run(Loop &L,
- AnalysisManager<Loop> &AM) {
- const auto &FAM =
- AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager();
- Function *F = L.getHeader()->getParent();
-
- // Use getCachedResult because Loop pass cannot trigger a function analysis.
- auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F);
- auto *LI = FAM.getCachedResult<LoopAnalysis>(*F);
- auto *AC = FAM.getCachedResult<AssumptionAnalysis>(*F);
- const auto *TLI = FAM.getCachedResult<TargetLibraryAnalysis>(*F);
- assert((LI && AC && TLI) && "Analyses for Loop Inst Simplify not available");
-
- if (!SimplifyLoopInst(&L, DT, LI, AC, TLI))
+PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
+ if (!SimplifyLoopInst(&L, &AR.DT, &AR.LI, &AR.AC, &AR.TLI))
return PreservedAnalyses::all();
return getLoopPassPreservedAnalyses();
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 9241ec3..e9f84ed 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -44,6 +44,10 @@ using namespace llvm;
#define DEBUG_TYPE "loop-interchange"
+static cl::opt<int> LoopInterchangeCostThreshold(
+ "loop-interchange-threshold", cl::init(0), cl::Hidden,
+ cl::desc("Interchange if you gain more than this number"));
+
namespace {
typedef SmallVector<Loop *, 8> LoopVector;
@@ -75,30 +79,23 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
typedef SmallVector<Value *, 16> ValueVector;
ValueVector MemInstr;
- if (Level > MaxLoopNestDepth) {
- DEBUG(dbgs() << "Cannot handle loops of depth greater than "
- << MaxLoopNestDepth << "\n");
- return false;
- }
-
// For each block.
for (Loop::block_iterator BB = L->block_begin(), BE = L->block_end();
BB != BE; ++BB) {
// Scan the BB and collect legal loads and stores.
for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E;
++I) {
- Instruction *Ins = dyn_cast<Instruction>(I);
- if (!Ins)
- return false;
- LoadInst *Ld = dyn_cast<LoadInst>(I);
- StoreInst *St = dyn_cast<StoreInst>(I);
- if (!St && !Ld)
- continue;
- if (Ld && !Ld->isSimple())
- return false;
- if (St && !St->isSimple())
+ if (!isa<Instruction>(I))
return false;
- MemInstr.push_back(&*I);
+ if (LoadInst *Ld = dyn_cast<LoadInst>(I)) {
+ if (!Ld->isSimple())
+ return false;
+ MemInstr.push_back(&*I);
+ } else if (StoreInst *St = dyn_cast<StoreInst>(I)) {
+ if (!St->isSimple())
+ return false;
+ MemInstr.push_back(&*I);
+ }
}
}
@@ -110,66 +107,63 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
for (J = I, JE = MemInstr.end(); J != JE; ++J) {
std::vector<char> Dep;
- Instruction *Src = dyn_cast<Instruction>(*I);
- Instruction *Des = dyn_cast<Instruction>(*J);
- if (Src == Des)
+ Instruction *Src = cast<Instruction>(*I);
+ Instruction *Dst = cast<Instruction>(*J);
+ if (Src == Dst)
continue;
- if (isa<LoadInst>(Src) && isa<LoadInst>(Des))
+ // Ignore Input dependencies.
+ if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
continue;
- if (auto D = DI->depends(Src, Des, true)) {
- DEBUG(dbgs() << "Found Dependency between Src=" << Src << " Des=" << Des
- << "\n");
- if (D->isFlow()) {
- // TODO: Handle Flow dependence.Check if it is sufficient to populate
- // the Dependence Matrix with the direction reversed.
- DEBUG(dbgs() << "Flow dependence not handled");
- return false;
- }
- if (D->isAnti()) {
- DEBUG(dbgs() << "Found Anti dependence \n");
- unsigned Levels = D->getLevels();
- char Direction;
- for (unsigned II = 1; II <= Levels; ++II) {
- const SCEV *Distance = D->getDistance(II);
- const SCEVConstant *SCEVConst =
- dyn_cast_or_null<SCEVConstant>(Distance);
- if (SCEVConst) {
- const ConstantInt *CI = SCEVConst->getValue();
- if (CI->isNegative())
- Direction = '<';
- else if (CI->isZero())
- Direction = '=';
- else
- Direction = '>';
- Dep.push_back(Direction);
- } else if (D->isScalar(II)) {
- Direction = 'S';
- Dep.push_back(Direction);
- } else {
- unsigned Dir = D->getDirection(II);
- if (Dir == Dependence::DVEntry::LT ||
- Dir == Dependence::DVEntry::LE)
- Direction = '<';
- else if (Dir == Dependence::DVEntry::GT ||
- Dir == Dependence::DVEntry::GE)
- Direction = '>';
- else if (Dir == Dependence::DVEntry::EQ)
- Direction = '=';
- else
- Direction = '*';
- Dep.push_back(Direction);
- }
- }
- while (Dep.size() != Level) {
- Dep.push_back('I');
+ // Track Output, Flow, and Anti dependencies.
+ if (auto D = DI->depends(Src, Dst, true)) {
+ assert(D->isOrdered() && "Expected an output, flow or anti dep.");
+ DEBUG(StringRef DepType =
+ D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
+ dbgs() << "Found " << DepType
+ << " dependency between Src and Dst\n"
+ << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
+ unsigned Levels = D->getLevels();
+ char Direction;
+ for (unsigned II = 1; II <= Levels; ++II) {
+ const SCEV *Distance = D->getDistance(II);
+ const SCEVConstant *SCEVConst =
+ dyn_cast_or_null<SCEVConstant>(Distance);
+ if (SCEVConst) {
+ const ConstantInt *CI = SCEVConst->getValue();
+ if (CI->isNegative())
+ Direction = '<';
+ else if (CI->isZero())
+ Direction = '=';
+ else
+ Direction = '>';
+ Dep.push_back(Direction);
+ } else if (D->isScalar(II)) {
+ Direction = 'S';
+ Dep.push_back(Direction);
+ } else {
+ unsigned Dir = D->getDirection(II);
+ if (Dir == Dependence::DVEntry::LT ||
+ Dir == Dependence::DVEntry::LE)
+ Direction = '<';
+ else if (Dir == Dependence::DVEntry::GT ||
+ Dir == Dependence::DVEntry::GE)
+ Direction = '>';
+ else if (Dir == Dependence::DVEntry::EQ)
+ Direction = '=';
+ else
+ Direction = '*';
+ Dep.push_back(Direction);
}
+ }
+ while (Dep.size() != Level) {
+ Dep.push_back('I');
+ }
- DepMatrix.push_back(Dep);
- if (DepMatrix.size() > MaxMemInstrCount) {
- DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
- << " dependencies inside loop\n");
- return false;
- }
+ DepMatrix.push_back(Dep);
+ if (DepMatrix.size() > MaxMemInstrCount) {
+ DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
+ << " dependencies inside loop\n");
+ return false;
}
}
}
@@ -183,8 +177,8 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
// A loop is moved from index 'from' to an index 'to'. Update the Dependence
// matrix by exchanging the two columns.
-static void interChangeDepedencies(CharMatrix &DepMatrix, unsigned FromIndx,
- unsigned ToIndx) {
+static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
+ unsigned ToIndx) {
unsigned numRows = DepMatrix.size();
for (unsigned i = 0; i < numRows; ++i) {
char TmpVal = DepMatrix[i][ToIndx];
@@ -211,7 +205,7 @@ static bool isOuterMostDepPositive(CharMatrix &DepMatrix, unsigned Row,
static bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row,
unsigned Column) {
for (unsigned i = 0; i < Column; ++i) {
- if (DepMatrix[Row][i] != '=' || DepMatrix[Row][i] != 'S' ||
+ if (DepMatrix[Row][i] != '=' && DepMatrix[Row][i] != 'S' &&
DepMatrix[Row][i] != 'I')
return false;
}
@@ -255,9 +249,8 @@ static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row,
// Checks if it is legal to interchange 2 loops.
// [Theorem] A permutation of the loops in a perfect nest is legal if and only
-// if
-// the direction matrix, after the same permutation is applied to its columns,
-// has no ">" direction as the leftmost non-"=" direction in any row.
+// if the direction matrix, after the same permutation is applied to its
+// columns, has no ">" direction as the leftmost non-"=" direction in any row.
static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
unsigned InnerLoopId,
unsigned OuterLoopId) {
@@ -269,8 +262,7 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
char OuterDep = DepMatrix[Row][OuterLoopId];
if (InnerDep == '*' || OuterDep == '*')
return false;
- else if (!validDepInterchange(DepMatrix, Row, OuterLoopId, InnerDep,
- OuterDep))
+ if (!validDepInterchange(DepMatrix, Row, OuterLoopId, InnerDep, OuterDep))
return false;
}
return true;
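A small worked instance of the theorem above (illustrative, for a depth-2 nest with columns ordered {outer, inner}):

//   row {'<', '>'} -> after swapping the columns: {'>', '<'}
//                     leftmost non-'=' is '>'  => interchange is illegal
//   row {'=', '<'} -> after swapping the columns: {'<', '='}
//                     leftmost non-'=' is '<'  => interchange stays legal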
@@ -278,7 +270,9 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
static void populateWorklist(Loop &L, SmallVector<LoopVector, 8> &V) {
- DEBUG(dbgs() << "Calling populateWorklist called\n");
+ DEBUG(dbgs() << "Calling populateWorklist on Func: "
+ << L.getHeader()->getParent()->getName() << " Loop: %"
+ << L.getHeader()->getName() << '\n');
LoopVector LoopList;
Loop *CurrentLoop = &L;
const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
@@ -315,8 +309,7 @@ static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) {
if (!AddRec || !AddRec->isAffine())
continue;
const SCEV *Step = AddRec->getStepRecurrence(*SE);
- const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
- if (!C)
+ if (!isa<SCEVConstant>(Step))
continue;
// Found the induction variable.
// FIXME: Handle loops with more than one induction variable. Note that,
@@ -474,7 +467,7 @@ struct LoopInterchange : public FunctionPass {
for (Loop *L : LoopList) {
const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
if (ExitCountOuter == SE->getCouldNotCompute()) {
- DEBUG(dbgs() << "Couldn't compute Backedge count\n");
+ DEBUG(dbgs() << "Couldn't compute backedge count\n");
return false;
}
if (L->getNumBackEdges() != 1) {
@@ -482,7 +475,7 @@ struct LoopInterchange : public FunctionPass {
return false;
}
if (!L->getExitingBlock()) {
- DEBUG(dbgs() << "Loop Doesn't have unique exit block\n");
+ DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
return false;
}
}
@@ -498,27 +491,32 @@ struct LoopInterchange : public FunctionPass {
bool processLoopList(LoopVector LoopList, Function &F) {
bool Changed = false;
- CharMatrix DependencyMatrix;
- if (LoopList.size() < 2) {
+ unsigned LoopNestDepth = LoopList.size();
+ if (LoopNestDepth < 2) {
DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
return false;
}
+ if (LoopNestDepth > MaxLoopNestDepth) {
+ DEBUG(dbgs() << "Cannot handle loops of depth greater than "
+ << MaxLoopNestDepth << "\n");
+ return false;
+ }
if (!isComputableLoopNest(LoopList)) {
- DEBUG(dbgs() << "Not vaild loop candidate for interchange\n");
+ DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
return false;
}
- Loop *OuterMostLoop = *(LoopList.begin());
- DEBUG(dbgs() << "Processing LoopList of size = " << LoopList.size()
- << "\n");
+ DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth << "\n");
- if (!populateDependencyMatrix(DependencyMatrix, LoopList.size(),
+ CharMatrix DependencyMatrix;
+ Loop *OuterMostLoop = *(LoopList.begin());
+ if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
OuterMostLoop, DI)) {
- DEBUG(dbgs() << "Populating Dependency matrix failed\n");
+ DEBUG(dbgs() << "Populating dependency matrix failed\n");
return false;
}
#ifdef DUMP_DEP_MATRICIES
- DEBUG(dbgs() << "Dependence before inter change \n");
+ DEBUG(dbgs() << "Dependence before interchange\n");
printDepMatrix(DependencyMatrix);
#endif
@@ -556,10 +554,10 @@ struct LoopInterchange : public FunctionPass {
std::swap(LoopList[i - 1], LoopList[i]);
// Update the DependencyMatrix
- interChangeDepedencies(DependencyMatrix, i, i - 1);
+ interChangeDependencies(DependencyMatrix, i, i - 1);
DT->recalculate(F);
#ifdef DUMP_DEP_MATRICIES
- DEBUG(dbgs() << "Dependence after inter change \n");
+ DEBUG(dbgs() << "Dependence after interchange\n");
printDepMatrix(DependencyMatrix);
#endif
Changed |= Interchanged;
@@ -571,7 +569,7 @@ struct LoopInterchange : public FunctionPass {
unsigned OuterLoopId, BasicBlock *LoopNestExit,
std::vector<std::vector<char>> &DependencyMatrix) {
- DEBUG(dbgs() << "Processing Innder Loop Id = " << InnerLoopId
+ DEBUG(dbgs() << "Processing Inner Loop Id = " << InnerLoopId
<< " and OuterLoopId = " << OuterLoopId << "\n");
Loop *InnerLoop = LoopList[InnerLoopId];
Loop *OuterLoop = LoopList[OuterLoopId];
@@ -585,7 +583,7 @@ struct LoopInterchange : public FunctionPass {
DEBUG(dbgs() << "Loops are legal to interchange\n");
LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE);
if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
- DEBUG(dbgs() << "Interchanging Loops not profitable\n");
+ DEBUG(dbgs() << "Interchanging loops not profitable\n");
return false;
}
@@ -599,8 +597,8 @@ struct LoopInterchange : public FunctionPass {
} // end of namespace
bool LoopInterchangeLegality::areAllUsesReductions(Instruction *Ins, Loop *L) {
- return !std::any_of(Ins->user_begin(), Ins->user_end(), [=](User *U) -> bool {
- PHINode *UserIns = dyn_cast<PHINode>(U);
+ return none_of(Ins->users(), [=](User *U) -> bool {
+ auto *UserIns = dyn_cast<PHINode>(U);
RecurrenceDescriptor RD;
return !UserIns || !RecurrenceDescriptor::isReductionPHI(UserIns, L, RD);
});
@@ -626,8 +624,7 @@ bool LoopInterchangeLegality::containsUnsafeInstructionsInLatch(
// Stores corresponding to reductions are safe while concluding if tightly
// nested.
if (StoreInst *L = dyn_cast<StoreInst>(I)) {
- PHINode *PHI = dyn_cast<PHINode>(L->getOperand(0));
- if (!PHI)
+ if (!isa<PHINode>(L->getOperand(0)))
return true;
} else if (I->mayHaveSideEffects() || I->mayReadFromMemory())
return true;
@@ -640,30 +637,30 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
- DEBUG(dbgs() << "Checking if Loops are Tightly Nested\n");
+ DEBUG(dbgs() << "Checking if loops are tightly nested\n");
// A perfectly nested loop will not have any branch in between the outer and
// inner block i.e. outer header will branch to either inner preheader and
// outerloop latch.
- BranchInst *outerLoopHeaderBI =
+ BranchInst *OuterLoopHeaderBI =
dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
- if (!outerLoopHeaderBI)
+ if (!OuterLoopHeaderBI)
return false;
- unsigned num = outerLoopHeaderBI->getNumSuccessors();
- for (unsigned i = 0; i < num; i++) {
- if (outerLoopHeaderBI->getSuccessor(i) != InnerLoopPreHeader &&
- outerLoopHeaderBI->getSuccessor(i) != OuterLoopLatch)
+
+ for (unsigned i = 0, e = OuterLoopHeaderBI->getNumSuccessors(); i < e; ++i) {
+ if (OuterLoopHeaderBI->getSuccessor(i) != InnerLoopPreHeader &&
+ OuterLoopHeaderBI->getSuccessor(i) != OuterLoopLatch)
return false;
}
- DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch \n");
+ DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
// We do not have any basic block in between now make sure the outer header
// and outer loop latch doesn't contain any unsafe instructions.
if (containsUnsafeInstructionsInHeader(OuterLoopHeader) ||
containsUnsafeInstructionsInLatch(OuterLoopLatch))
return false;
- DEBUG(dbgs() << "Loops are perfectly nested \n");
+ DEBUG(dbgs() << "Loops are perfectly nested\n");
// We have a perfect loop nest.
return true;
}
@@ -703,7 +700,7 @@ bool LoopInterchangeLegality::findInductionAndReductions(
RecurrenceDescriptor RD;
InductionDescriptor ID;
PHINode *PHI = cast<PHINode>(I);
- if (InductionDescriptor::isInductionPHI(PHI, SE, ID))
+ if (InductionDescriptor::isInductionPHI(PHI, L, SE, ID))
Inductions.push_back(PHI);
else if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD))
Reductions.push_back(PHI);
@@ -852,8 +849,8 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
- << "and OuterLoopId = " << OuterLoopId
- << "due to dependence\n");
+ << " and OuterLoopId = " << OuterLoopId
+ << " due to dependence\n");
return false;
}
@@ -946,9 +943,9 @@ int LoopInterchangeProfitability::getInstrOrderCost() {
return GoodOrder - BadOrder;
}
-static bool isProfitabileForVectorization(unsigned InnerLoopId,
- unsigned OuterLoopId,
- CharMatrix &DepMatrix) {
+static bool isProfitableForVectorization(unsigned InnerLoopId,
+ unsigned OuterLoopId,
+ CharMatrix &DepMatrix) {
// TODO: Improve this heuristic to catch more cases.
// If the inner loop is loop independent or doesn't carry any dependency it is
// profitable to move this to outer position.
@@ -977,16 +974,15 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
// This is rough cost estimation algorithm. It counts the good and bad order
// of induction variables in the instruction and allows reordering if number
// of bad orders is more than good.
- int Cost = 0;
- Cost += getInstrOrderCost();
+ int Cost = getInstrOrderCost();
DEBUG(dbgs() << "Cost = " << Cost << "\n");
- if (Cost < 0)
+ if (Cost < -LoopInterchangeCostThreshold)
return true;
// It is not profitable as per current cache profitability model. But check if
// we can move this loop outside to improve parallelism.
bool ImprovesPar =
- isProfitabileForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
+ isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
return ImprovesPar;
}
@@ -1022,8 +1018,6 @@ void LoopInterchangeTransform::restructureLoops(Loop *InnerLoop,
}
bool LoopInterchangeTransform::transform() {
-
- DEBUG(dbgs() << "transform\n");
bool Transformed = false;
Instruction *InnerIndexVar;
@@ -1046,16 +1040,16 @@ bool LoopInterchangeTransform::transform() {
// incremented/decremented.
// TODO: This splitting logic may not work always. Fix this.
splitInnerLoopLatch(InnerIndexVar);
- DEBUG(dbgs() << "splitInnerLoopLatch Done\n");
+ DEBUG(dbgs() << "splitInnerLoopLatch done\n");
// Splits the inner loops phi nodes out into a separate basic block.
splitInnerLoopHeader();
- DEBUG(dbgs() << "splitInnerLoopHeader Done\n");
+ DEBUG(dbgs() << "splitInnerLoopHeader done\n");
}
Transformed |= adjustLoopLinks();
if (!Transformed) {
- DEBUG(dbgs() << "adjustLoopLinks Failed\n");
+ DEBUG(dbgs() << "adjustLoopLinks failed\n");
return false;
}
@@ -1099,7 +1093,7 @@ void LoopInterchangeTransform::splitInnerLoopHeader() {
}
DEBUG(dbgs() << "Output of splitInnerLoopHeader InnerLoopHeaderSucc & "
- "InnerLoopHeader \n");
+ "InnerLoopHeader\n");
}
/// \brief Move all instructions except the terminator from FromBB right before
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
index f29228c..8fb5801 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -20,17 +20,37 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include <forward_list>
+#include <cassert>
+#include <algorithm>
+#include <set>
+#include <tuple>
+#include <utility>
#define LLE_OPTION "loop-load-elim"
#define DEBUG_TYPE LLE_OPTION
@@ -47,7 +67,6 @@ static cl::opt<unsigned> LoadElimSCEVCheckThreshold(
cl::desc("The maximum number of SCEV checks allowed for Loop "
"Load Elimination"));
-
STATISTIC(NumLoopLoadEliminted, "Number of loads eliminated by LLE");
namespace {
@@ -113,10 +132,9 @@ bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L,
DominatorTree *DT) {
SmallVector<BasicBlock *, 8> Latches;
L->getLoopLatches(Latches);
- return std::all_of(Latches.begin(), Latches.end(),
- [&](const BasicBlock *Latch) {
- return DT->dominates(StoreBlock, Latch);
- });
+ return llvm::all_of(Latches, [&](const BasicBlock *Latch) {
+ return DT->dominates(StoreBlock, Latch);
+ });
}
/// \brief Return true if the load is not executed on all paths in the loop.
@@ -348,7 +366,7 @@ public:
// Collect the pointers of the candidate loads.
// FIXME: SmallSet does not work with std::inserter.
std::set<Value *> CandLoadPtrs;
- std::transform(Candidates.begin(), Candidates.end(),
+ transform(Candidates,
std::inserter(CandLoadPtrs, CandLoadPtrs.begin()),
std::mem_fn(&StoreToLoadForwardingCandidate::getLoadPtr));
@@ -397,7 +415,9 @@ public:
Value *InitialPtr = SEE.expandCodeFor(PtrSCEV->getStart(), Ptr->getType(),
PH->getTerminator());
Value *Initial =
- new LoadInst(InitialPtr, "load_initial", PH->getTerminator());
+ new LoadInst(InitialPtr, "load_initial", /* isVolatile */ false,
+ Cand.Load->getAlignment(), PH->getTerminator());
+
PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded",
&L->getHeader()->front());
PHI->addIncoming(Initial, PH);
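A minimal sketch of a loop this transformation targets (illustrative, not from this change): the load in iteration i reads the value stored in iteration i-1, so it is replaced by a PHI fed by an initial load in the preheader and the forwarded store value.

void smooth(int *A, int n) {
  for (int i = 1; i < n; ++i)
    A[i] = A[i - 1] + 1; // A[i-1] was stored one iteration earlier; the load
                         // becomes the "store_forwarded" PHI, seeded by the
                         // "load_initial" of A[0] in the preheader
}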
@@ -499,6 +519,11 @@ public:
return false;
}
+ if (!L->isLoopSimplifyForm()) {
+    DEBUG(dbgs() << "Loop is not in loop-simplify form\n");
+ return false;
+ }
+
// Point of no-return, start the transformation. First, version the loop
// if necessary.
@@ -581,11 +606,13 @@ public:
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
}
static char ID;
};
-}
+
+} // end anonymous namespace
char LoopLoadElimination::ID;
static const char LLE_name[] = "Loop Load Elimination";
@@ -599,7 +626,9 @@ INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(LoopLoadElimination, LLE_OPTION, LLE_name, false, false)
namespace llvm {
+
FunctionPass *createLoopLoadEliminationPass() {
return new LoopLoadElimination();
}
-}
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
new file mode 100644
index 0000000..028f4bb
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
@@ -0,0 +1,85 @@
+//===- LoopPassManager.cpp - Loop pass management -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+
+using namespace llvm;
+
+// Explicit template instantiations and specialization definitions for core
+// template typedefs.
+namespace llvm {
+template class PassManager<Loop, LoopAnalysisManager,
+ LoopStandardAnalysisResults &, LPMUpdater &>;
+
+/// Explicitly specialize the pass manager's run method to handle loop nest
+/// structure updates.
+template <>
+PreservedAnalyses
+PassManager<Loop, LoopAnalysisManager, LoopStandardAnalysisResults &,
+ LPMUpdater &>::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR, LPMUpdater &U) {
+ PreservedAnalyses PA = PreservedAnalyses::all();
+
+ if (DebugLogging)
+ dbgs() << "Starting Loop pass manager run.\n";
+
+ for (auto &Pass : Passes) {
+ if (DebugLogging)
+ dbgs() << "Running pass: " << Pass->name() << " on " << L;
+
+ PreservedAnalyses PassPA = Pass->run(L, AM, AR, U);
+
+ // If the loop was deleted, abort the run and return to the outer walk.
+ if (U.skipCurrentLoop()) {
+ PA.intersect(std::move(PassPA));
+ break;
+ }
+
+ // Update the analysis manager as each pass runs and potentially
+ // invalidates analyses.
+ AM.invalidate(L, PassPA);
+
+ // Finally, we intersect the final preserved analyses to compute the
+ // aggregate preserved set for this pass manager.
+ PA.intersect(std::move(PassPA));
+
+ // FIXME: Historically, the pass managers all called the LLVM context's
+ // yield function here. We don't have a generic way to acquire the
+ // context and it isn't yet clear what the right pattern is for yielding
+ // in the new pass manager so it is currently omitted.
+ // ...getContext().yield();
+ }
+
+ // Invalidation for the current loop should be handled above, and other loop
+ // analysis results shouldn't be impacted by runs over this loop. Therefore,
+ // the remaining analysis results in the AnalysisManager are preserved. We
+ // mark this with a set so that we don't need to inspect each one
+ // individually.
+ // FIXME: This isn't correct! This loop and all nested loops' analyses should
+ // be preserved, but unrolling should invalidate the parent loop's analyses.
+ PA.preserveSet<AllAnalysesOn<Loop>>();
+
+ if (DebugLogging)
+ dbgs() << "Finished Loop pass manager run.\n";
+
+ return PA;
+}
+}
+
+PrintLoopPass::PrintLoopPass() : OS(dbgs()) {}
+PrintLoopPass::PrintLoopPass(raw_ostream &OS, const std::string &Banner)
+ : OS(OS), Banner(Banner) {}
+
+PreservedAnalyses PrintLoopPass::run(Loop &L, LoopAnalysisManager &,
+ LoopStandardAnalysisResults &,
+ LPMUpdater &) {
+ printLoop(L, OS, Banner);
+ return PreservedAnalyses::all();
+}
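A rough sketch of how this new loop pass manager is meant to be used (the API shape is assumed from the declarations above; the particular passes are only an example, not part of this change):

#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Scalar/LoopRotation.h"
#include <utility>
using namespace llvm;

void addLoopPasses(FunctionPassManager &FPM) {
  LoopPassManager LPM; // PassManager<Loop, LoopAnalysisManager, ...>
  LPM.addPass(LoopRotatePass());
  LPM.addPass(LoopIdiomRecognizePass());
  // The adaptor runs the loop pipeline over every loop in the function and
  // supplies the LoopStandardAnalysisResults/LPMUpdater arguments seen above.
  FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM)));
}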
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
index d2f1b66..86058fe 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
@@ -371,11 +371,12 @@ namespace {
protected:
typedef MapVector<Instruction*, BitVector> UsesTy;
- bool findRootsRecursive(Instruction *IVU,
+ void findRootsRecursive(Instruction *IVU,
SmallInstructionSet SubsumedInsts);
bool findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts);
bool collectPossibleRoots(Instruction *Base,
std::map<int64_t,Instruction*> &Roots);
+ bool validateRootSet(DAGRootSet &DRS);
bool collectUsedInstructions(SmallInstructionSet &PossibleRedSet);
void collectInLoopUserSet(const SmallInstructionVector &Roots,
@@ -739,11 +740,11 @@ void LoopReroll::DAGRootTracker::collectInLoopUserSet(
collectInLoopUserSet(Root, Exclude, Final, Users);
}
-static bool isSimpleLoadStore(Instruction *I) {
+static bool isUnorderedLoadStore(Instruction *I) {
if (LoadInst *LI = dyn_cast<LoadInst>(I))
- return LI->isSimple();
+ return LI->isUnordered();
if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->isSimple();
+ return SI->isUnordered();
if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
return !MI->isVolatile();
return false;
@@ -827,7 +828,8 @@ collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) {
Roots[V] = cast<Instruction>(I);
}
- if (Roots.empty())
+ // Make sure we have at least two roots.
+ if (Roots.empty() || (Roots.size() == 1 && BaseUsers.empty()))
return false;
// If we found non-loop-inc, non-root users of Base, assume they are
@@ -861,40 +863,61 @@ collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) {
return true;
}
-bool LoopReroll::DAGRootTracker::
+void LoopReroll::DAGRootTracker::
findRootsRecursive(Instruction *I, SmallInstructionSet SubsumedInsts) {
// Does the user look like it could be part of a root set?
// All its users must be simple arithmetic ops.
if (I->getNumUses() > IL_MaxRerollIterations)
- return false;
+ return;
- if ((I->getOpcode() == Instruction::Mul ||
- I->getOpcode() == Instruction::PHI) &&
- I != IV &&
- findRootsBase(I, SubsumedInsts))
- return true;
+ if (I != IV && findRootsBase(I, SubsumedInsts))
+ return;
SubsumedInsts.insert(I);
for (User *V : I->users()) {
- Instruction *I = dyn_cast<Instruction>(V);
- if (std::find(LoopIncs.begin(), LoopIncs.end(), I) != LoopIncs.end())
+ Instruction *I = cast<Instruction>(V);
+ if (is_contained(LoopIncs, I))
continue;
- if (!I || !isSimpleArithmeticOp(I) ||
- !findRootsRecursive(I, SubsumedInsts))
- return false;
+ if (!isSimpleArithmeticOp(I))
+ continue;
+
+ // The recursive call makes a copy of SubsumedInsts.
+ findRootsRecursive(I, SubsumedInsts);
}
+}
+
+bool LoopReroll::DAGRootTracker::validateRootSet(DAGRootSet &DRS) {
+ if (DRS.Roots.empty())
+ return false;
+
+ // Consider a DAGRootSet with N-1 roots (so N different values including
+ // BaseInst).
+ // Define d = Roots[0] - BaseInst, which should be the same as
+ // Roots[I] - Roots[I-1] for all I in [1..N).
+ // Define D = BaseInst@J - BaseInst@J-1, where "@J" means the value at the
+ // loop iteration J.
+ //
+  // Now, for the loop iterations to be consecutive:
+ // D = d * N
+ const auto *ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst));
+ if (!ADR)
+ return false;
+ unsigned N = DRS.Roots.size() + 1;
+ const SCEV *StepSCEV = SE->getMinusSCEV(SE->getSCEV(DRS.Roots[0]), ADR);
+ const SCEV *ScaleSCEV = SE->getConstant(StepSCEV->getType(), N);
+ if (ADR->getStepRecurrence(*SE) != SE->getMulExpr(StepSCEV, ScaleSCEV))
+ return false;
+
return true;
}
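A worked instance of the D = d * N check above (illustrative source, not taken from this change):

void scale(int *x, int n) {
  for (int i = 0; i < n; i += 3) {
    x[i]     *= 2;   // BaseInst index: AddRec {0,+,3}, so the step D is 3
    x[i + 1] *= 2;   // root at offset d = 1
    x[i + 2] *= 2;   // root at offset 2; N = 2 roots + 1 = 3
  }                  // D = 3 = d * N, so validateRootSet() accepts the set
}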
bool LoopReroll::DAGRootTracker::
findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) {
-
- // The base instruction needs to be a multiply so
- // that we can erase it.
- if (IVU->getOpcode() != Instruction::Mul &&
- IVU->getOpcode() != Instruction::PHI)
+ // The base of a RootSet must be an AddRec, so it can be erased.
+ const auto *IVU_ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IVU));
+ if (!IVU_ADR || IVU_ADR->getLoop() != L)
return false;
std::map<int64_t, Instruction*> V;
@@ -910,6 +933,8 @@ findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) {
DAGRootSet DRS;
DRS.BaseInst = nullptr;
+ SmallVector<DAGRootSet, 16> PotentialRootSets;
+
for (auto &KV : V) {
if (!DRS.BaseInst) {
DRS.BaseInst = KV.second;
@@ -920,13 +945,22 @@ findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) {
DRS.Roots.push_back(KV.second);
} else {
// Linear sequence terminated.
- RootSets.push_back(DRS);
+ if (!validateRootSet(DRS))
+ return false;
+
+ // Construct a new DAGRootSet with the next sequence.
+ PotentialRootSets.push_back(DRS);
DRS.BaseInst = KV.second;
- DRS.SubsumedInsts = SubsumedInsts;
DRS.Roots.clear();
}
}
- RootSets.push_back(DRS);
+
+ if (!validateRootSet(DRS))
+ return false;
+
+ PotentialRootSets.push_back(DRS);
+
+ RootSets.append(PotentialRootSets.begin(), PotentialRootSets.end());
return true;
}
@@ -940,8 +974,7 @@ bool LoopReroll::DAGRootTracker::findRoots() {
if (isLoopIncrement(IVU, IV))
LoopIncs.push_back(cast<Instruction>(IVU));
}
- if (!findRootsRecursive(IV, SmallInstructionSet()))
- return false;
+ findRootsRecursive(IV, SmallInstructionSet());
LoopIncs.push_back(IV);
} else {
if (!findRootsBase(IV, SmallInstructionSet()))
@@ -961,31 +994,6 @@ bool LoopReroll::DAGRootTracker::findRoots() {
}
}
- // And ensure all loop iterations are consecutive. We rely on std::map
- // providing ordered traversal.
- for (auto &V : RootSets) {
- const auto *ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(V.BaseInst));
- if (!ADR)
- return false;
-
- // Consider a DAGRootSet with N-1 roots (so N different values including
- // BaseInst).
- // Define d = Roots[0] - BaseInst, which should be the same as
- // Roots[I] - Roots[I-1] for all I in [1..N).
- // Define D = BaseInst@J - BaseInst@J-1, where "@J" means the value at the
- // loop iteration J.
- //
- // Now, For the loop iterations to be consecutive:
- // D = d * N
-
- unsigned N = V.Roots.size() + 1;
- const SCEV *StepSCEV = SE->getMinusSCEV(SE->getSCEV(V.Roots[0]), ADR);
- const SCEV *ScaleSCEV = SE->getConstant(StepSCEV->getType(), N);
- if (ADR->getStepRecurrence(*SE) != SE->getMulExpr(StepSCEV, ScaleSCEV)) {
- DEBUG(dbgs() << "LRR: Aborting because iterations are not consecutive\n");
- return false;
- }
- }
Scale = RootSets[0].Roots.size() + 1;
if (Scale > IL_MaxRerollIterations) {
@@ -1088,7 +1096,7 @@ bool LoopReroll::DAGRootTracker::isBaseInst(Instruction *I) {
bool LoopReroll::DAGRootTracker::isRootInst(Instruction *I) {
for (auto &DRS : RootSets) {
- if (std::find(DRS.Roots.begin(), DRS.Roots.end(), I) != DRS.Roots.end())
+ if (is_contained(DRS.Roots, I))
return true;
}
return false;
@@ -1283,7 +1291,7 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
// which while a valid (somewhat arbitrary) micro-optimization, is
// needed because otherwise isSafeToSpeculativelyExecute returns
// false on PHI nodes.
- if (!isa<PHINode>(I) && !isSimpleLoadStore(I) &&
+ if (!isa<PHINode>(I) && !isUnorderedLoadStore(I) &&
!isSafeToSpeculativelyExecute(I))
// Intervening instructions cause side effects.
FutureSideEffects = true;
@@ -1313,10 +1321,10 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
// If we've past an instruction from a future iteration that may have
// side effects, and this instruction might also, then we can't reorder
// them, and this matching fails. As an exception, we allow the alias
- // set tracker to handle regular (simple) load/store dependencies.
- if (FutureSideEffects && ((!isSimpleLoadStore(BaseInst) &&
+ // set tracker to handle regular (unordered) load/store dependencies.
+ if (FutureSideEffects && ((!isUnorderedLoadStore(BaseInst) &&
!isSafeToSpeculativelyExecute(BaseInst)) ||
- (!isSimpleLoadStore(RootInst) &&
+ (!isUnorderedLoadStore(RootInst) &&
!isSafeToSpeculativelyExecute(RootInst)))) {
DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
" vs. " << *RootInst <<
@@ -1412,13 +1420,12 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) {
BasicBlock *Header = L->getHeader();
// Remove instructions associated with non-base iterations.
- for (BasicBlock::reverse_iterator J = Header->rbegin();
- J != Header->rend();) {
+ for (BasicBlock::reverse_iterator J = Header->rbegin(), JE = Header->rend();
+ J != JE;) {
unsigned I = Uses[&*J].find_first();
if (I > 0 && I < IL_All) {
- Instruction *D = &*J;
- DEBUG(dbgs() << "LRR: removing: " << *D << "\n");
- D->eraseFromParent();
+ DEBUG(dbgs() << "LRR: removing: " << *J << "\n");
+ J++->eraseFromParent();
continue;
}
@@ -1499,8 +1506,8 @@ void LoopReroll::DAGRootTracker::replaceIV(Instruction *Inst,
{ // Limit the lifetime of SCEVExpander.
const DataLayout &DL = Header->getModule()->getDataLayout();
SCEVExpander Expander(*SE, DL, "reroll");
- Value *NewIV =
- Expander.expandCodeFor(NewIVSCEV, InstIV->getType(), &Header->front());
+ Value *NewIV = Expander.expandCodeFor(NewIVSCEV, Inst->getType(),
+ Header->getFirstNonPHIOrDbg());
for (auto &KV : Uses)
if (KV.second.find_first() == 0)
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
index 7a06a25..cc83069 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
@@ -14,13 +14,12 @@
#include "llvm/Transforms/Scalar/LoopRotation.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/LoopPassManager.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -34,6 +33,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
@@ -326,6 +326,10 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
// Otherwise, stick the new instruction into the new block!
C->setName(Inst->getName());
C->insertBefore(LoopEntryBranch);
+
+ if (auto *II = dyn_cast<IntrinsicInst>(C))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
}
}
@@ -501,7 +505,8 @@ static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
// GEPs are cheap if all indices are constant.
if (!cast<GEPOperator>(I)->hasAllConstantIndices())
return false;
- // fall-thru to increment case
+ // fall-thru to increment case
+ LLVM_FALLTHROUGH;
case Instruction::Add:
case Instruction::Sub:
case Instruction::And:
@@ -617,21 +622,14 @@ bool LoopRotate::processLoop(Loop *L) {
return MadeChange;
}
-LoopRotatePass::LoopRotatePass() {}
-
-PreservedAnalyses LoopRotatePass::run(Loop &L, AnalysisManager<Loop> &AM) {
- auto &FAM = AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager();
- Function *F = L.getHeader()->getParent();
-
- auto *LI = FAM.getCachedResult<LoopAnalysis>(*F);
- const auto *TTI = FAM.getCachedResult<TargetIRAnalysis>(*F);
- auto *AC = FAM.getCachedResult<AssumptionAnalysis>(*F);
- assert((LI && TTI && AC) && "Analyses for loop rotation not available");
+LoopRotatePass::LoopRotatePass(bool EnableHeaderDuplication)
+ : EnableHeaderDuplication(EnableHeaderDuplication) {}
- // Optional analyses.
- auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F);
- auto *SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(*F);
- LoopRotate LR(DefaultRotationThreshold, LI, TTI, AC, DT, SE);
+PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
+ int Threshold = EnableHeaderDuplication ? DefaultRotationThreshold : 0;
+ LoopRotate LR(Threshold, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE);
bool Changed = LR.processLoop(&L);
if (!Changed)
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index ec22793..1606121 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -18,18 +18,18 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/DependenceAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/LoopPassManager.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;
@@ -64,16 +64,10 @@ static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI) {
return Changed;
}
-PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, AnalysisManager<Loop> &AM) {
- const auto &FAM =
- AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager();
- Function *F = L.getHeader()->getParent();
-
- auto *LI = FAM.getCachedResult<LoopAnalysis>(*F);
- auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F);
- assert((LI && DT) && "Analyses for LoopSimplifyCFG not available");
-
- if (!simplifyLoopCFG(L, *DT, *LI))
+PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
+ if (!simplifyLoopCFG(L, AR.DT, AR.LI))
return PreservedAnalyses::all();
return getLoopPassPreservedAnalyses();
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp
new file mode 100644
index 0000000..f3f4152
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp
@@ -0,0 +1,335 @@
+//===-- LoopSink.cpp - Loop Sink Pass ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass does the inverse transformation of what LICM does.
+// It traverses all of the instructions in the loop's preheader and sinks
+// them into blocks in the loop body whose frequency is lower than the
+// preheader's. It differs from the Sink pass in the following ways:
+//
+// * It only handles sinking of instructions from the loop's preheader to the
+// loop's body
+// * It uses alias set tracker to get more accurate alias info
+// * It uses block frequency info to find the optimal sinking locations
+//
+// Overall algorithm:
+//
+// For I in Preheader:
+//   InsertBBs = BBs that use I
+// For BB in sorted(LoopBBs):
+// DomBBs = BBs in InsertBBs that are dominated by BB
+// if freq(DomBBs) > freq(BB)
+//       InsertBBs = InsertBBs - DomBBs + BB
+// For BB in InsertBBs:
+// Insert I at BB's beginning
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loopsink"
+
+STATISTIC(NumLoopSunk, "Number of instructions sunk into loop");
+STATISTIC(NumLoopSunkCloned, "Number of cloned instructions sunk into loop");
+
+static cl::opt<unsigned> SinkFrequencyPercentThreshold(
+ "sink-freq-percent-threshold", cl::Hidden, cl::init(90),
+ cl::desc("Do not sink instructions that require cloning unless they "
+ "execute less than this percent of the time."));
+
+static cl::opt<unsigned> MaxNumberOfUseBBsForSinking(
+ "max-uses-for-sinking", cl::Hidden, cl::init(30),
+ cl::desc("Do not sink instructions that have too many uses."));
+
+/// Return adjusted total frequency of \p BBs.
+///
+///  * If there is only one BB, sinking the instruction will not introduce code
+///    size increase. Thus there is no need to adjust the frequency.
+///  * If there is more than one BB, sinking would lead to code size increase.
+/// In this case, we add some "tax" to the total frequency to make it harder
+/// to sink. E.g.
+/// Freq(Preheader) = 100
+/// Freq(BBs) = sum(50, 49) = 99
+///    Even if Freq(BBs) < Freq(Preheader), we will not sink from Preheader to
+/// BBs as the difference is too small to justify the code size increase.
+///    To model this, the adjusted Freq(BBs) will be:
+/// AdjustedFreq(BBs) = 99 / SinkFrequencyPercentThreshold%
+static BlockFrequency adjustedSumFreq(SmallPtrSetImpl<BasicBlock *> &BBs,
+ BlockFrequencyInfo &BFI) {
+ BlockFrequency T = 0;
+ for (BasicBlock *B : BBs)
+ T += BFI.getBlockFreq(B);
+ if (BBs.size() > 1)
+ T /= BranchProbability(SinkFrequencyPercentThreshold, 100);
+ return T;
+}
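A quick worked instance of the adjustment above, using the default threshold of 90: two candidate blocks with frequencies 50 and 49 sum to 99, which is taxed to 99 / 0.90 = 110; against a preheader frequency of 100, sinking is rejected, whereas a single block of frequency 99 would not be taxed and would be accepted.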
+
+/// Return a set of basic blocks in which to insert the sunk instructions.
+///
+/// The returned set of basic blocks (BBsToSinkInto) should satisfy:
+///
+/// * Inside the loop \p L
+/// * For each UseBB in \p UseBBs, there is at least one BB in BBsToSinkInto
+///    that dominates the UseBB
+/// * Has minimum total frequency that is no greater than preheader frequency
+///
+/// The purpose of the function is to find the optimal sinking points to
+/// minimize execution cost, which is defined as "sum of frequency of
+/// BBsToSinkInto".
+/// As a result, the returned BBsToSinkInto needs to have minimum total
+/// frequency.
+/// Additionally, if the total frequency of BBsToSinkInto exceeds preheader
+/// frequency, the optimal solution is not sinking (return empty set).
+///
+/// \p ColdLoopBBs is used to help find the optimal sinking locations.
+/// It stores a list of BBs that is:
+///
+/// * Inside the loop \p L
+/// * Has a frequency no larger than the loop's preheader
+/// * Sorted by BB frequency
+///
+/// The complexity of the function is O(UseBBs.size() * ColdLoopBBs.size()).
+/// To avoid expensive computation, we cap the maximum UseBBs.size() in its
+/// caller.
+static SmallPtrSet<BasicBlock *, 2>
+findBBsToSinkInto(const Loop &L, const SmallPtrSetImpl<BasicBlock *> &UseBBs,
+ const SmallVectorImpl<BasicBlock *> &ColdLoopBBs,
+ DominatorTree &DT, BlockFrequencyInfo &BFI) {
+ SmallPtrSet<BasicBlock *, 2> BBsToSinkInto;
+ if (UseBBs.size() == 0)
+ return BBsToSinkInto;
+
+ BBsToSinkInto.insert(UseBBs.begin(), UseBBs.end());
+ SmallPtrSet<BasicBlock *, 2> BBsDominatedByColdestBB;
+
+ // For every iteration:
+ // * Pick the ColdestBB from ColdLoopBBs
+ // * Find the set BBsDominatedByColdestBB that satisfy:
+ // - BBsDominatedByColdestBB is a subset of BBsToSinkInto
+ // - Every BB in BBsDominatedByColdestBB is dominated by ColdestBB
+ // * If Freq(ColdestBB) < Freq(BBsDominatedByColdestBB), remove
+ // BBsDominatedByColdestBB from BBsToSinkInto, add ColdestBB to
+ // BBsToSinkInto
+ for (BasicBlock *ColdestBB : ColdLoopBBs) {
+ BBsDominatedByColdestBB.clear();
+ for (BasicBlock *SinkedBB : BBsToSinkInto)
+ if (DT.dominates(ColdestBB, SinkedBB))
+ BBsDominatedByColdestBB.insert(SinkedBB);
+ if (BBsDominatedByColdestBB.size() == 0)
+ continue;
+ if (adjustedSumFreq(BBsDominatedByColdestBB, BFI) >
+ BFI.getBlockFreq(ColdestBB)) {
+ for (BasicBlock *DominatedBB : BBsDominatedByColdestBB) {
+ BBsToSinkInto.erase(DominatedBB);
+ }
+ BBsToSinkInto.insert(ColdestBB);
+ }
+ }
+
+ // If the total frequency of BBsToSinkInto is larger than preheader frequency,
+ // do not sink.
+ if (adjustedSumFreq(BBsToSinkInto, BFI) >
+ BFI.getBlockFreq(L.getLoopPreheader()))
+ BBsToSinkInto.clear();
+ return BBsToSinkInto;
+}
+
+// Sinks \p I from the loop \p L's preheader to its uses. Returns true if
+// sinking is successful.
+// \p LoopBlockNumber is used to sort the insertion blocks to ensure
+// determinism.
+static bool sinkInstruction(Loop &L, Instruction &I,
+ const SmallVectorImpl<BasicBlock *> &ColdLoopBBs,
+ const SmallDenseMap<BasicBlock *, int, 16> &LoopBlockNumber,
+ LoopInfo &LI, DominatorTree &DT,
+ BlockFrequencyInfo &BFI) {
+ // Compute the set of blocks in loop L which contain a use of I.
+ SmallPtrSet<BasicBlock *, 2> BBs;
+ for (auto &U : I.uses()) {
+ Instruction *UI = cast<Instruction>(U.getUser());
+ // We cannot sink I to PHI-uses.
+ if (dyn_cast<PHINode>(UI))
+ return false;
+ // We cannot sink I if it has uses outside of the loop.
+ if (!L.contains(LI.getLoopFor(UI->getParent())))
+ return false;
+ BBs.insert(UI->getParent());
+ }
+
+ // findBBsToSinkInto is O(BBs.size() * ColdLoopBBs.size()). We cap the max
+ // BBs.size() to avoid expensive computation.
+ // FIXME: Handle code size growth for min_size and opt_size.
+ if (BBs.size() > MaxNumberOfUseBBsForSinking)
+ return false;
+
+  // Find the set of BBs into which we should insert a copy of I.
+ SmallPtrSet<BasicBlock *, 2> BBsToSinkInto =
+ findBBsToSinkInto(L, BBs, ColdLoopBBs, DT, BFI);
+ if (BBsToSinkInto.empty())
+ return false;
+
+ // Copy the final BBs into a vector and sort them using the total ordering
+ // of the loop block numbers as iterating the set doesn't give a useful
+ // order. No need to stable sort as the block numbers are a total ordering.
+ SmallVector<BasicBlock *, 2> SortedBBsToSinkInto;
+ SortedBBsToSinkInto.insert(SortedBBsToSinkInto.begin(), BBsToSinkInto.begin(),
+ BBsToSinkInto.end());
+ std::sort(SortedBBsToSinkInto.begin(), SortedBBsToSinkInto.end(),
+ [&](BasicBlock *A, BasicBlock *B) {
+              return LoopBlockNumber.find(A)->second <
+                     LoopBlockNumber.find(B)->second;
+ });
+
+ BasicBlock *MoveBB = *SortedBBsToSinkInto.begin();
+ // FIXME: Optimize the efficiency for cloned value replacement. The current
+ // implementation is O(SortedBBsToSinkInto.size() * I.num_uses()).
+ for (BasicBlock *N : SortedBBsToSinkInto) {
+ if (N == MoveBB)
+ continue;
+ // Clone I and replace its uses.
+ Instruction *IC = I.clone();
+ IC->setName(I.getName());
+ IC->insertBefore(&*N->getFirstInsertionPt());
+ // Replaces uses of I with IC in N
+ for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;) {
+ Use &U = *UI++;
+ auto *I = cast<Instruction>(U.getUser());
+ if (I->getParent() == N)
+ U.set(IC);
+ }
+ // Replaces uses of I with IC in blocks dominated by N
+ replaceDominatedUsesWith(&I, IC, DT, N);
+ DEBUG(dbgs() << "Sinking a clone of " << I << " To: " << N->getName()
+ << '\n');
+ NumLoopSunkCloned++;
+ }
+ DEBUG(dbgs() << "Sinking " << I << " To: " << MoveBB->getName() << '\n');
+ NumLoopSunk++;
+ I.moveBefore(&*MoveBB->getFirstInsertionPt());
+
+ return true;
+}
+
+/// Sinks instructions from the loop's preheader to the loop body if the
+/// total frequency of the inserted copies is smaller than the preheader's
+/// frequency.
+static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI,
+ DominatorTree &DT,
+ BlockFrequencyInfo &BFI,
+ ScalarEvolution *SE) {
+ BasicBlock *Preheader = L.getLoopPreheader();
+ if (!Preheader)
+ return false;
+
+ // Enable LoopSink only when runtime profile is available.
+ // With static profile, the sinking decision may be sub-optimal.
+ if (!Preheader->getParent()->getEntryCount())
+ return false;
+
+ const BlockFrequency PreheaderFreq = BFI.getBlockFreq(Preheader);
+ // If there are no basic blocks with lower frequency than the preheader then
+ // we can avoid the detailed analysis as we will never find profitable sinking
+ // opportunities.
+ if (all_of(L.blocks(), [&](const BasicBlock *BB) {
+ return BFI.getBlockFreq(BB) > PreheaderFreq;
+ }))
+ return false;
+
+ bool Changed = false;
+ AliasSetTracker CurAST(AA);
+
+ // Compute alias set.
+ for (BasicBlock *BB : L.blocks())
+ CurAST.add(*BB);
+
+ // Sort loop's basic blocks by frequency
+ SmallVector<BasicBlock *, 10> ColdLoopBBs;
+ SmallDenseMap<BasicBlock *, int, 16> LoopBlockNumber;
+ int i = 0;
+ for (BasicBlock *B : L.blocks())
+ if (BFI.getBlockFreq(B) < BFI.getBlockFreq(L.getLoopPreheader())) {
+ ColdLoopBBs.push_back(B);
+ LoopBlockNumber[B] = ++i;
+ }
+ std::stable_sort(ColdLoopBBs.begin(), ColdLoopBBs.end(),
+ [&](BasicBlock *A, BasicBlock *B) {
+ return BFI.getBlockFreq(A) < BFI.getBlockFreq(B);
+ });
+
+  // Traverse the preheader's instructions in reverse order because if A
+  // depends on B (A appears after B), A needs to be sunk first before B can
+  // be sunk.
+ for (auto II = Preheader->rbegin(), E = Preheader->rend(); II != E;) {
+ Instruction *I = &*II++;
+    // No need to check that the instruction's operands are loop invariant.
+ assert(L.hasLoopInvariantOperands(I) &&
+ "Insts in a loop's preheader should have loop invariant operands!");
+ if (!canSinkOrHoistInst(*I, &AA, &DT, &L, &CurAST, nullptr))
+ continue;
+ if (sinkInstruction(L, *I, ColdLoopBBs, LoopBlockNumber, LI, DT, BFI))
+ Changed = true;
+ }
+
+ if (Changed && SE)
+ SE->forgetLoopDispositions(&L);
+ return Changed;
+}
+
+namespace {
+struct LegacyLoopSinkPass : public LoopPass {
+ static char ID;
+ LegacyLoopSinkPass() : LoopPass(ID) {
+ initializeLegacyLoopSinkPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+
+ auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
+ return sinkLoopInvariantInstructions(
+ *L, getAnalysis<AAResultsWrapperPass>().getAAResults(),
+ getAnalysis<LoopInfoWrapperPass>().getLoopInfo(),
+ getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(),
+ SE ? &SE->getSE() : nullptr);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+};
+}
+
+char LegacyLoopSinkPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LegacyLoopSinkPass, "loop-sink", "Loop Sink", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_END(LegacyLoopSinkPass, "loop-sink", "Loop Sink", false, false)
+
+Pass *llvm::createLoopSinkPass() { return new LegacyLoopSinkPass(); }
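The profitability rule implemented above is purely frequency-driven: an instruction is sunk only when the combined frequency of the blocks receiving a copy is lower than the preheader's frequency, and only when a runtime profile (entry count) is present. A small numeric sketch of that condition, with made-up block names and frequencies:

// Preheader frequency = 100 (runtime profile available).
// The instruction's only in-loop users live in two cold blocks:
//   %cold1: freq = 30    %cold2: freq = 45
// Sum over copies = 30 + 45 = 75 < 100, so sinking pays off: the loop over
// SortedBBsToSinkInto above clones the instruction into one target block and
// moves the original into the other (NumLoopSunkCloned / NumLoopSunk).
// Had the only user been a block with freq = 120, 120 > 100 and the
// instruction would stay in the preheader.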
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 70bd9d3..194587a 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -53,29 +53,64 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopStrengthReduce.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ScalarEvolutionNormalization.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/OperandTraits.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <iterator>
+#include <map>
+#include <tuple>
+#include <utility>
+
using namespace llvm;
#define DEBUG_TYPE "loop-reduce"
@@ -123,8 +158,9 @@ struct MemAccessTy {
bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
- static MemAccessTy getUnknown(LLVMContext &Ctx) {
- return MemAccessTy(Type::getVoidTy(Ctx), UnknownAddressSpace);
+ static MemAccessTy getUnknown(LLVMContext &Ctx,
+ unsigned AS = UnknownAddressSpace) {
+ return MemAccessTy(Type::getVoidTy(Ctx), AS);
}
};
@@ -139,7 +175,7 @@ public:
void dump() const;
};
-}
+} // end anonymous namespace
void RegSortData::print(raw_ostream &OS) const {
OS << "[NumUses=" << UsedByIndices.count() << ']';
@@ -178,7 +214,7 @@ public:
const_iterator end() const { return RegSequence.end(); }
};
-}
+} // end anonymous namespace
void
RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
@@ -210,7 +246,7 @@ RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
if (LUIdx < UsedByIndices.size())
UsedByIndices[LUIdx] =
- LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : 0;
+ LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
}
}
@@ -301,7 +337,7 @@ struct Formula {
void dump() const;
};
-}
+} // end anonymous namespace
/// Recursion helper for initialMatch.
static void DoInitialMatch(const SCEV *S, Loop *L,
@@ -323,7 +359,7 @@ static void DoInitialMatch(const SCEV *S, Loop *L,
// Look at addrec operands.
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
- if (!AR->getStart()->isZero()) {
+ if (!AR->getStart()->isZero() && AR->isAffine()) {
DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
AR->getStepRecurrence(SE),
@@ -446,8 +482,7 @@ void Formula::deleteBaseReg(const SCEV *&S) {
/// Test if this formula references the given register.
bool Formula::referencesReg(const SCEV *S) const {
- return S == ScaledReg ||
- std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end();
+ return S == ScaledReg || is_contained(BaseRegs, S);
}
/// Test whether this formula uses registers which are used by uses other than
@@ -567,7 +602,7 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
// Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
- if (IgnoreSignificantBits || isAddRecSExtable(AR, SE)) {
+ if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) {
const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
IgnoreSignificantBits);
if (!Step) return nullptr;
@@ -822,8 +857,10 @@ DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
}
namespace {
+
class LSRUse;
-}
+
+} // end anonymous namespace
/// \brief Check if the addressing mode defined by \p F is completely
/// folded in \p LU at isel time.
@@ -883,7 +920,6 @@ public:
SmallPtrSetImpl<const SCEV *> &Regs,
const DenseSet<const SCEV *> &VisitedRegs,
const Loop *L,
- const SmallVectorImpl<int64_t> &Offsets,
ScalarEvolution &SE, DominatorTree &DT,
const LSRUse &LU,
SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
@@ -902,8 +938,144 @@ private:
ScalarEvolution &SE, DominatorTree &DT,
SmallPtrSetImpl<const SCEV *> *LoserRegs);
};
+
+/// An operand value in an instruction which is to be replaced with some
+/// equivalent, possibly strength-reduced, replacement.
+struct LSRFixup {
+ /// The instruction which will be updated.
+ Instruction *UserInst;
-}
+ /// The operand of the instruction which will be replaced. The operand may be
+ /// used more than once; every instance will be replaced.
+ Value *OperandValToReplace;
+
+ /// If this user is to use the post-incremented value of an induction
+ /// variable, this variable is non-null and holds the loop associated with the
+ /// induction variable.
+ PostIncLoopSet PostIncLoops;
+
+ /// A constant offset to be added to the LSRUse expression. This allows
+ /// multiple fixups to share the same LSRUse with different offsets, for
+ /// example in an unrolled loop.
+ int64_t Offset;
+
+ bool isUseFullyOutsideLoop(const Loop *L) const;
+
+ LSRFixup();
+
+ void print(raw_ostream &OS) const;
+ void dump() const;
+};
+
+/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted
+/// SmallVectors of const SCEV*.
+struct UniquifierDenseMapInfo {
+ static SmallVector<const SCEV *, 4> getEmptyKey() {
+ SmallVector<const SCEV *, 4> V;
+ V.push_back(reinterpret_cast<const SCEV *>(-1));
+ return V;
+ }
+
+ static SmallVector<const SCEV *, 4> getTombstoneKey() {
+ SmallVector<const SCEV *, 4> V;
+ V.push_back(reinterpret_cast<const SCEV *>(-2));
+ return V;
+ }
+
+ static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
+ return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
+ }
+
+ static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
+ const SmallVector<const SCEV *, 4> &RHS) {
+ return LHS == RHS;
+ }
+};
+
+/// This class holds the state that LSR keeps for each use in IVUsers, as well
+/// as uses invented by LSR itself. It includes information about what kinds of
+/// things can be folded into the user, information about the user itself, and
+/// information about how the use may be satisfied. TODO: Represent multiple
+/// users of the same expression in common?
+class LSRUse {
+ DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
+
+public:
+ /// An enum for a kind of use, indicating what types of scaled and immediate
+ /// operands it might support.
+ enum KindType {
+ Basic, ///< A normal use, with no folding.
+ Special, ///< A special case of basic, allowing -1 scales.
+ Address, ///< An address use; folding according to TargetLowering
+ ICmpZero ///< An equality icmp with both operands folded into one.
+ // TODO: Add a generic icmp too?
+ };
+
+ typedef PointerIntPair<const SCEV *, 2, KindType> SCEVUseKindPair;
+
+ KindType Kind;
+ MemAccessTy AccessTy;
+
+ /// The list of operands which are to be replaced.
+ SmallVector<LSRFixup, 8> Fixups;
+
+ /// Keep track of the min and max offsets of the fixups.
+ int64_t MinOffset;
+ int64_t MaxOffset;
+
+ /// This records whether all of the fixups using this LSRUse are outside of
+ /// the loop, in which case some special-case heuristics may be used.
+ bool AllFixupsOutsideLoop;
+
+ /// RigidFormula is set to true to guarantee that this use will be associated
+ /// with a single formula--the one that initially matched. Some SCEV
+ /// expressions cannot be expanded. This allows LSR to consider the registers
+ /// used by those expressions without the need to expand them later after
+ /// changing the formula.
+ bool RigidFormula;
+
+ /// This records the widest use type for any fixup using this
+ /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
+ /// fixup widths to be equivalent, because the narrower one may be relying on
+ /// the implicit truncation to truncate away bogus bits.
+ Type *WidestFixupType;
+
+ /// A list of ways to build a value that can satisfy this user. After the
+ /// list is populated, one of these is selected heuristically and used to
+ /// formulate a replacement for OperandValToReplace in UserInst.
+ SmallVector<Formula, 12> Formulae;
+
+ /// The set of register candidates used by all formulae in this LSRUse.
+ SmallPtrSet<const SCEV *, 4> Regs;
+
+ LSRUse(KindType K, MemAccessTy AT)
+ : Kind(K), AccessTy(AT), MinOffset(INT64_MAX), MaxOffset(INT64_MIN),
+ AllFixupsOutsideLoop(true), RigidFormula(false),
+ WidestFixupType(nullptr) {}
+
+ LSRFixup &getNewFixup() {
+ Fixups.push_back(LSRFixup());
+ return Fixups.back();
+ }
+
+ void pushFixup(LSRFixup &f) {
+ Fixups.push_back(f);
+ if (f.Offset > MaxOffset)
+ MaxOffset = f.Offset;
+ if (f.Offset < MinOffset)
+ MinOffset = f.Offset;
+ }
+
+ bool HasFormulaWithSameRegs(const Formula &F) const;
+ bool InsertFormula(const Formula &F);
+ void DeleteFormula(Formula &F);
+ void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
+
+ void print(raw_ostream &OS) const;
+ void dump() const;
+};
+
+} // end anonymous namespace
/// Tally up interesting quantities from the given register.
void Cost::RateRegister(const SCEV *Reg,
@@ -975,7 +1147,6 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
SmallPtrSetImpl<const SCEV *> &Regs,
const DenseSet<const SCEV *> &VisitedRegs,
const Loop *L,
- const SmallVectorImpl<int64_t> &Offsets,
ScalarEvolution &SE, DominatorTree &DT,
const LSRUse &LU,
SmallPtrSetImpl<const SCEV *> *LoserRegs) {
@@ -1013,13 +1184,20 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
ScaleCost += getScalingFactorCost(TTI, LU, F);
// Tally up the non-zero immediates.
- for (int64_t O : Offsets) {
+ for (const LSRFixup &Fixup : LU.Fixups) {
+ int64_t O = Fixup.Offset;
int64_t Offset = (uint64_t)O + F.BaseOffset;
if (F.BaseGV)
ImmCost += 64; // Handle symbolic values conservatively.
// TODO: This should probably be the pointer size.
else if (Offset != 0)
ImmCost += APInt(64, Offset, true).getMinSignedBits();
+
+ // Check with the target whether this offset, used with this instruction,
+ // is specifically not supported.
+ if ((isa<LoadInst>(Fixup.UserInst) || isa<StoreInst>(Fixup.UserInst)) &&
+ !TTI.isFoldableMemAccessOffset(Fixup.UserInst, Offset))
+ NumBaseAdds++;
}
assert(isValid() && "invalid cost");
}
@@ -1066,44 +1244,8 @@ void Cost::dump() const {
print(errs()); errs() << '\n';
}
-namespace {
-
-/// An operand value in an instruction which is to be replaced with some
-/// equivalent, possibly strength-reduced, replacement.
-struct LSRFixup {
- /// The instruction which will be updated.
- Instruction *UserInst;
-
- /// The operand of the instruction which will be replaced. The operand may be
- /// used more than once; every instance will be replaced.
- Value *OperandValToReplace;
-
- /// If this user is to use the post-incremented value of an induction
- /// variable, this variable is non-null and holds the loop associated with the
- /// induction variable.
- PostIncLoopSet PostIncLoops;
-
- /// The index of the LSRUse describing the expression which this fixup needs,
- /// minus an offset (below).
- size_t LUIdx;
-
- /// A constant offset to be added to the LSRUse expression. This allows
- /// multiple fixups to share the same LSRUse with different offsets, for
- /// example in an unrolled loop.
- int64_t Offset;
-
- bool isUseFullyOutsideLoop(const Loop *L) const;
-
- LSRFixup();
-
- void print(raw_ostream &OS) const;
- void dump() const;
-};
-
-}
-
LSRFixup::LSRFixup()
- : UserInst(nullptr), OperandValToReplace(nullptr), LUIdx(~size_t(0)),
+ : UserInst(nullptr), OperandValToReplace(nullptr),
Offset(0) {}
/// Test whether this fixup always uses its value outside of the given loop.
@@ -1139,9 +1281,6 @@ void LSRFixup::print(raw_ostream &OS) const {
PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
}
- if (LUIdx != ~size_t(0))
- OS << ", LUIdx=" << LUIdx;
-
if (Offset != 0)
OS << ", Offset=" << Offset;
}
@@ -1151,102 +1290,6 @@ void LSRFixup::dump() const {
print(errs()); errs() << '\n';
}
-namespace {
-
-/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted
-/// SmallVectors of const SCEV*.
-struct UniquifierDenseMapInfo {
- static SmallVector<const SCEV *, 4> getEmptyKey() {
- SmallVector<const SCEV *, 4> V;
- V.push_back(reinterpret_cast<const SCEV *>(-1));
- return V;
- }
-
- static SmallVector<const SCEV *, 4> getTombstoneKey() {
- SmallVector<const SCEV *, 4> V;
- V.push_back(reinterpret_cast<const SCEV *>(-2));
- return V;
- }
-
- static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
- return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
- }
-
- static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
- const SmallVector<const SCEV *, 4> &RHS) {
- return LHS == RHS;
- }
-};
-
-/// This class holds the state that LSR keeps for each use in IVUsers, as well
-/// as uses invented by LSR itself. It includes information about what kinds of
-/// things can be folded into the user, information about the user itself, and
-/// information about how the use may be satisfied. TODO: Represent multiple
-/// users of the same expression in common?
-class LSRUse {
- DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
-
-public:
- /// An enum for a kind of use, indicating what types of scaled and immediate
- /// operands it might support.
- enum KindType {
- Basic, ///< A normal use, with no folding.
- Special, ///< A special case of basic, allowing -1 scales.
- Address, ///< An address use; folding according to TargetLowering
- ICmpZero ///< An equality icmp with both operands folded into one.
- // TODO: Add a generic icmp too?
- };
-
- typedef PointerIntPair<const SCEV *, 2, KindType> SCEVUseKindPair;
-
- KindType Kind;
- MemAccessTy AccessTy;
-
- SmallVector<int64_t, 8> Offsets;
- int64_t MinOffset;
- int64_t MaxOffset;
-
- /// This records whether all of the fixups using this LSRUse are outside of
- /// the loop, in which case some special-case heuristics may be used.
- bool AllFixupsOutsideLoop;
-
- /// RigidFormula is set to true to guarantee that this use will be associated
- /// with a single formula--the one that initially matched. Some SCEV
- /// expressions cannot be expanded. This allows LSR to consider the registers
- /// used by those expressions without the need to expand them later after
- /// changing the formula.
- bool RigidFormula;
-
- /// This records the widest use type for any fixup using this
- /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
- /// fixup widths to be equivalent, because the narrower one may be relying on
- /// the implicit truncation to truncate away bogus bits.
- Type *WidestFixupType;
-
- /// A list of ways to build a value that can satisfy this user. After the
- /// list is populated, one of these is selected heuristically and used to
- /// formulate a replacement for OperandValToReplace in UserInst.
- SmallVector<Formula, 12> Formulae;
-
- /// The set of register candidates used by all formulae in this LSRUse.
- SmallPtrSet<const SCEV *, 4> Regs;
-
- LSRUse(KindType K, MemAccessTy AT)
- : Kind(K), AccessTy(AT), MinOffset(INT64_MAX), MaxOffset(INT64_MIN),
- AllFixupsOutsideLoop(true), RigidFormula(false),
- WidestFixupType(nullptr) {}
-
- bool HasFormulaWithSameRegs(const Formula &F) const;
- bool InsertFormula(const Formula &F);
- void DeleteFormula(Formula &F);
- void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
-
- void print(raw_ostream &OS) const;
- void dump() const;
-};
-
-}
-
/// Test whether this use has a formula which has the same registers as the given
/// formula.
bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
@@ -1334,9 +1377,9 @@ void LSRUse::print(raw_ostream &OS) const {
OS << ", Offsets={";
bool NeedComma = false;
- for (int64_t O : Offsets) {
+ for (const LSRFixup &Fixup : Fixups) {
if (NeedComma) OS << ',';
- OS << O;
+ OS << Fixup.Offset;
NeedComma = true;
}
OS << '}';
@@ -1638,14 +1681,16 @@ class LSRInstance {
Instruction *IVIncInsertPos;
/// Interesting factors between use strides.
- SmallSetVector<int64_t, 8> Factors;
+ ///
+ /// We explicitly use a SetVector which contains a SmallSet, instead of the
+ /// default, a SmallDenseSet, because we need to use the full range of
+ /// int64_ts, and there's currently no good way of doing that with
+ /// SmallDenseSet.
+ SetVector<int64_t, SmallVector<int64_t, 8>, SmallSet<int64_t, 8>> Factors;
/// Interesting use types, to facilitate truncation reuse.
SmallSetVector<Type *, 4> Types;
- /// The list of operands which are to be replaced.
- SmallVector<LSRFixup, 16> Fixups;
-
/// The list of interesting uses.
SmallVector<LSRUse, 16> Uses;
@@ -1678,11 +1723,6 @@ class LSRInstance {
void CollectInterestingTypesAndFactors();
void CollectFixupsAndInitialFormulae();
- LSRFixup &getNewFixup() {
- Fixups.push_back(LSRFixup());
- return Fixups.back();
- }
-
// Support for sharing of LSRUses between LSRFixups.
typedef DenseMap<LSRUse::SCEVUseKindPair, size_t> UseMapTy;
UseMapTy UseMap;
@@ -1752,16 +1792,16 @@ class LSRInstance {
const LSRUse &LU,
SCEVExpander &Rewriter) const;
- Value *Expand(const LSRFixup &LF,
+ Value *Expand(const LSRUse &LU, const LSRFixup &LF,
const Formula &F,
BasicBlock::iterator IP,
SCEVExpander &Rewriter,
SmallVectorImpl<WeakVH> &DeadInsts) const;
- void RewriteForPHI(PHINode *PN, const LSRFixup &LF,
+ void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
const Formula &F,
SCEVExpander &Rewriter,
SmallVectorImpl<WeakVH> &DeadInsts) const;
- void Rewrite(const LSRFixup &LF,
+ void Rewrite(const LSRUse &LU, const LSRFixup &LF,
const Formula &F,
SCEVExpander &Rewriter,
SmallVectorImpl<WeakVH> &DeadInsts) const;
@@ -1780,7 +1820,7 @@ public:
void dump() const;
};
-}
+} // end anonymous namespace
/// If IV is used in a int-to-float cast inside the loop then try to eliminate
/// the cast operation.
@@ -2068,10 +2108,30 @@ void
LSRInstance::OptimizeLoopTermCond() {
SmallPtrSet<Instruction *, 4> PostIncs;
+ // We need a different set of heuristics for rotated and non-rotated loops.
+ // If a loop is rotated then the latch is also the backedge, so inserting
+ // post-inc expressions just before the latch is ideal. To reduce live ranges
+ // it also makes sense to rewrite terminating conditions to use post-inc
+ // expressions.
+ //
+ // If the loop is not rotated then the latch is not a backedge; the latch
+ // check is done in the loop head. Adding post-inc expressions before the
+ // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
+ // in the loop body. In this case we do *not* want to use post-inc expressions
+ // in the latch check, and we want to insert post-inc expressions before
+ // the backedge.
BasicBlock *LatchBlock = L->getLoopLatch();
SmallVector<BasicBlock*, 8> ExitingBlocks;
L->getExitingBlocks(ExitingBlocks);
+ if (llvm::all_of(ExitingBlocks, [&LatchBlock](const BasicBlock *BB) {
+ return LatchBlock != BB;
+ })) {
+ // The backedge doesn't exit the loop; treat this as a head-tested loop.
+ IVIncInsertPos = LatchBlock->getTerminator();
+ return;
+ }
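For illustration, the two loop shapes that the heuristic above distinguishes (C-style sketches, not taken from the source):

// Rotated loop: the latch is the exiting backedge, so the terminating
// condition can be rewritten to use the post-incremented IV and post-inc
// expressions are inserted just before the latch terminator.
//   do { body(i); ++i; } while (i < n);
//
// Head-tested (non-rotated) loop: the exit test sits in the header and the
// latch only branches back, so the code above pins IVIncInsertPos to the
// latch terminator and returns without rewriting the terminating condition.
//   while (i < n) { body(i); ++i; }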
+ // Otherwise treat this as a rotated loop.
for (BasicBlock *ExitingBlock : ExitingBlocks) {
// Get the terminating condition for the loop if possible. If we
@@ -2220,8 +2280,10 @@ bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
// TODO: Be less conservative when the type is similar and can use the same
// addressing modes.
if (Kind == LSRUse::Address) {
- if (AccessTy != LU.AccessTy)
- NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext());
+ if (AccessTy.MemTy != LU.AccessTy.MemTy) {
+ NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
+ AccessTy.AddrSpace);
+ }
}
// Conservatively assume HasBaseReg is true for now.
@@ -2241,8 +2303,6 @@ bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
LU.MinOffset = NewMinOffset;
LU.MaxOffset = NewMaxOffset;
LU.AccessTy = NewAccessTy;
- if (NewOffset != LU.Offsets.back())
- LU.Offsets.push_back(NewOffset);
return true;
}
@@ -2279,11 +2339,6 @@ std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr,
Uses.push_back(LSRUse(Kind, AccessTy));
LSRUse &LU = Uses[LUIdx];
- // We don't need to track redundant offsets, but we don't need to go out
- // of our way here to avoid them.
- if (LU.Offsets.empty() || Offset != LU.Offsets.back())
- LU.Offsets.push_back(Offset);
-
LU.MinOffset = Offset;
LU.MaxOffset = Offset;
return std::make_pair(LUIdx, Offset);
@@ -2500,7 +2555,7 @@ bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
if (!isa<SCEVConstant>(IncExpr)) {
const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
- return 0;
+ return false;
}
SmallPtrSet<const SCEV*, 8> Processed;
@@ -2797,9 +2852,8 @@ void LSRInstance::FinalizeChain(IVChain &Chain) {
DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
for (const IVInc &Inc : Chain) {
- DEBUG(dbgs() << " Inc: " << Inc.UserInst << "\n");
- auto UseI = std::find(Inc.UserInst->op_begin(), Inc.UserInst->op_end(),
- Inc.IVOperand);
+ DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
+ auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
IVIncSet.insert(UseI);
}
@@ -2932,39 +2986,34 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
for (const IVStrideUse &U : IU) {
Instruction *UserInst = U.getUser();
// Skip IV users that are part of profitable IV Chains.
- User::op_iterator UseI = std::find(UserInst->op_begin(), UserInst->op_end(),
- U.getOperandValToReplace());
+ User::op_iterator UseI =
+ find(UserInst->operands(), U.getOperandValToReplace());
assert(UseI != UserInst->op_end() && "cannot find IV operand");
if (IVIncSet.count(UseI))
continue;
- // Record the uses.
- LSRFixup &LF = getNewFixup();
- LF.UserInst = UserInst;
- LF.OperandValToReplace = U.getOperandValToReplace();
- LF.PostIncLoops = U.getPostIncLoops();
-
LSRUse::KindType Kind = LSRUse::Basic;
MemAccessTy AccessTy;
- if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) {
+ if (isAddressUse(UserInst, U.getOperandValToReplace())) {
Kind = LSRUse::Address;
- AccessTy = getAccessType(LF.UserInst);
+ AccessTy = getAccessType(UserInst);
}
const SCEV *S = IU.getExpr(U);
-
+ PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
+
// Equality (== and !=) ICmps are special. We can rewrite (i == N) as
// (N - i == 0), and this allows (N - i) to be the expression that we work
// with rather than just N or i, so we can consider the register
// requirements for both N and i at the same time. Limiting this code to
// equality icmps is not a problem because all interesting loops use
// equality icmps, thanks to IndVarSimplify.
- if (ICmpInst *CI = dyn_cast<ICmpInst>(LF.UserInst))
+ if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst))
if (CI->isEquality()) {
// Swap the operands if needed to put the OperandValToReplace on the
// left, for consistency.
Value *NV = CI->getOperand(1);
- if (NV == LF.OperandValToReplace) {
+ if (NV == U.getOperandValToReplace()) {
CI->setOperand(1, CI->getOperand(0));
CI->setOperand(0, NV);
NV = CI->getOperand(1);
@@ -2977,7 +3026,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
// S is normalized, so normalize N before folding it into S
// to keep the result normalized.
N = TransformForPostIncUse(Normalize, N, CI, nullptr,
- LF.PostIncLoops, SE, DT);
+ TmpPostIncLoops, SE, DT);
Kind = LSRUse::ICmpZero;
S = SE.getMinusSCEV(N, S);
}
@@ -2990,12 +3039,20 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
Factors.insert(-1);
}
- // Set up the initial formula for this use.
+ // Get or create an LSRUse.
std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
- LF.LUIdx = P.first;
- LF.Offset = P.second;
- LSRUse &LU = Uses[LF.LUIdx];
+ size_t LUIdx = P.first;
+ int64_t Offset = P.second;
+ LSRUse &LU = Uses[LUIdx];
+
+ // Record the fixup.
+ LSRFixup &LF = LU.getNewFixup();
+ LF.UserInst = UserInst;
+ LF.OperandValToReplace = U.getOperandValToReplace();
+ LF.PostIncLoops = TmpPostIncLoops;
+ LF.Offset = Offset;
LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+
if (!LU.WidestFixupType ||
SE.getTypeSizeInBits(LU.WidestFixupType) <
SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
@@ -3003,8 +3060,8 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
// If this is the first use of this LSRUse, give it a formula.
if (LU.Formulae.empty()) {
- InsertInitialFormula(S, LU, LF.LUIdx);
- CountRegisters(LU.Formulae.back(), LF.LUIdx);
+ InsertInitialFormula(S, LU, LUIdx);
+ CountRegisters(LU.Formulae.back(), LUIdx);
}
}
@@ -3109,6 +3166,9 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
// Don't bother if the instruction is in a BB which ends in an EHPad.
if (UseBB->getTerminator()->isEHPad())
continue;
+ // Don't bother rewriting PHIs in catchswitch blocks.
+ if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
+ continue;
// Ignore uses which are part of other SCEV expressions, to avoid
// analyzing them multiple times.
if (SE.isSCEVable(UserInst->getType())) {
@@ -3130,20 +3190,21 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
continue;
}
- LSRFixup &LF = getNewFixup();
- LF.UserInst = const_cast<Instruction *>(UserInst);
- LF.OperandValToReplace = U;
std::pair<size_t, int64_t> P = getUse(
S, LSRUse::Basic, MemAccessTy());
- LF.LUIdx = P.first;
- LF.Offset = P.second;
- LSRUse &LU = Uses[LF.LUIdx];
+ size_t LUIdx = P.first;
+ int64_t Offset = P.second;
+ LSRUse &LU = Uses[LUIdx];
+ LSRFixup &LF = LU.getNewFixup();
+ LF.UserInst = const_cast<Instruction *>(UserInst);
+ LF.OperandValToReplace = U;
+ LF.Offset = Offset;
LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
if (!LU.WidestFixupType ||
SE.getTypeSizeInBits(LU.WidestFixupType) <
SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
LU.WidestFixupType = LF.OperandValToReplace->getType();
- InsertSupplementalFormula(US, LU, LF.LUIdx);
+ InsertSupplementalFormula(US, LU, LUIdx);
CountRegisters(LU.Formulae.back(), Uses.size() - 1);
break;
}
@@ -3175,7 +3236,7 @@ static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
return nullptr;
} else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
// Split a non-zero base out of an addrec.
- if (AR->getStart()->isZero())
+ if (AR->getStart()->isZero() || !AR->isAffine())
return S;
const SCEV *Remainder = CollectSubexprs(AR->getStart(),
@@ -3629,7 +3690,7 @@ struct WorkItem {
void dump() const;
};
-}
+} // end anonymous namespace
void WorkItem::print(raw_ostream &OS) const {
OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
@@ -3872,8 +3933,7 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
// the corresponding bad register from the Regs set.
Cost CostF;
Regs.clear();
- CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, LU,
- &LoserRegs);
+ CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, SE, DT, LU, &LoserRegs);
if (CostF.isLoser()) {
// During initial formula generation, undesirable formulae are generated
// by uses within other loops that have some non-trivial address mode or
@@ -3906,8 +3966,7 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
Cost CostBest;
Regs.clear();
- CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, LU.Offsets, SE,
- DT, LU);
+ CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, SE, DT, LU);
if (CostF < CostBest)
std::swap(F, Best);
DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
@@ -4053,25 +4112,13 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
- // Update the relocs to reference the new use.
- for (LSRFixup &Fixup : Fixups) {
- if (Fixup.LUIdx == LUIdx) {
- Fixup.LUIdx = LUThatHas - &Uses.front();
- Fixup.Offset += F.BaseOffset;
- // Add the new offset to LUThatHas' offset list.
- if (LUThatHas->Offsets.back() != Fixup.Offset) {
- LUThatHas->Offsets.push_back(Fixup.Offset);
- if (Fixup.Offset > LUThatHas->MaxOffset)
- LUThatHas->MaxOffset = Fixup.Offset;
- if (Fixup.Offset < LUThatHas->MinOffset)
- LUThatHas->MinOffset = Fixup.Offset;
- }
- DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
- }
- if (Fixup.LUIdx == NumUses-1)
- Fixup.LUIdx = LUIdx;
+ // Transfer the fixups of LU to LUThatHas.
+ for (LSRFixup &Fixup : LU.Fixups) {
+ Fixup.Offset += F.BaseOffset;
+ LUThatHas->pushFixup(Fixup);
+ DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
}
-
+
// Delete formulae from the new use which are no longer legal.
bool Any = false;
for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
@@ -4137,9 +4184,10 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
for (const SCEV *Reg : RegUses) {
if (Taken.count(Reg))
continue;
- if (!Best)
+ if (!Best) {
Best = Reg;
- else {
+ BestNum = RegUses.getUsedByIndices(Reg).count();
+ } else {
unsigned Count = RegUses.getUsedByIndices(Reg).count();
if (Count > BestNum) {
Best = Reg;
@@ -4229,8 +4277,7 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
for (const SCEV *Reg : ReqRegs) {
if ((F.ScaledReg && F.ScaledReg == Reg) ||
- std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) !=
- F.BaseRegs.end()) {
+ is_contained(F.BaseRegs, Reg)) {
--NumReqRegsToFind;
if (NumReqRegsToFind == 0)
break;
@@ -4246,8 +4293,7 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
// the current best, prune the search at that point.
NewCost = CurCost;
NewRegs = CurRegs;
- NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT,
- LU);
+ NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, SE, DT, LU);
if (NewCost < SolutionCost) {
Workspace.push_back(&F);
if (Workspace.size() != Uses.size()) {
@@ -4313,7 +4359,7 @@ LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
const SmallVectorImpl<Instruction *> &Inputs)
const {
Instruction *Tentative = &*IP;
- for (;;) {
+ while (true) {
bool AllDominate = true;
Instruction *BetterPos = nullptr;
// Don't bother attempting to insert before a catchswitch, their basic block
@@ -4430,12 +4476,12 @@ LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP,
/// Emit instructions for the leading candidate expression for this LSRUse (this
/// is called "expanding").
-Value *LSRInstance::Expand(const LSRFixup &LF,
+Value *LSRInstance::Expand(const LSRUse &LU,
+ const LSRFixup &LF,
const Formula &F,
BasicBlock::iterator IP,
SCEVExpander &Rewriter,
SmallVectorImpl<WeakVH> &DeadInsts) const {
- const LSRUse &LU = Uses[LF.LUIdx];
if (LU.RigidFormula)
return LF.OperandValToReplace;
@@ -4617,6 +4663,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF,
/// effectively happens in their predecessor blocks, so the expression may need
/// to be expanded in multiple places.
void LSRInstance::RewriteForPHI(PHINode *PN,
+ const LSRUse &LU,
const LSRFixup &LF,
const Formula &F,
SCEVExpander &Rewriter,
@@ -4631,7 +4678,8 @@ void LSRInstance::RewriteForPHI(PHINode *PN,
// is the canonical backedge for this loop, which complicates post-inc
// users.
if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
- !isa<IndirectBrInst>(BB->getTerminator())) {
+ !isa<IndirectBrInst>(BB->getTerminator()) &&
+ !isa<CatchSwitchInst>(BB->getTerminator())) {
BasicBlock *Parent = PN->getParent();
Loop *PNLoop = LI.getLoopFor(Parent);
if (!PNLoop || Parent != PNLoop->getHeader()) {
@@ -4670,7 +4718,7 @@ void LSRInstance::RewriteForPHI(PHINode *PN,
if (!Pair.second)
PN->setIncomingValue(i, Pair.first->second);
else {
- Value *FullV = Expand(LF, F, BB->getTerminator()->getIterator(),
+ Value *FullV = Expand(LU, LF, F, BB->getTerminator()->getIterator(),
Rewriter, DeadInsts);
// If this is reuse-by-noop-cast, insert the noop cast.
@@ -4691,17 +4739,18 @@ void LSRInstance::RewriteForPHI(PHINode *PN,
/// Emit instructions for the leading candidate expression for this LSRUse (this
/// is called "expanding"), and update the UserInst to reference the newly
/// expanded value.
-void LSRInstance::Rewrite(const LSRFixup &LF,
+void LSRInstance::Rewrite(const LSRUse &LU,
+ const LSRFixup &LF,
const Formula &F,
SCEVExpander &Rewriter,
SmallVectorImpl<WeakVH> &DeadInsts) const {
// First, find an insertion point that dominates UserInst. For PHI nodes,
// find the nearest block which dominates all the relevant uses.
if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
- RewriteForPHI(PN, LF, F, Rewriter, DeadInsts);
+ RewriteForPHI(PN, LU, LF, F, Rewriter, DeadInsts);
} else {
Value *FullV =
- Expand(LF, F, LF.UserInst->getIterator(), Rewriter, DeadInsts);
+ Expand(LU, LF, F, LF.UserInst->getIterator(), Rewriter, DeadInsts);
// If this is reuse-by-noop-cast, insert the noop cast.
Type *OpTy = LF.OperandValToReplace->getType();
@@ -4717,7 +4766,7 @@ void LSRInstance::Rewrite(const LSRFixup &LF,
// its new value may happen to be equal to LF.OperandValToReplace, in
// which case doing replaceUsesOfWith leads to replacing both operands
// with the same value. TODO: Reorganize this.
- if (Uses[LF.LUIdx].Kind == LSRUse::ICmpZero)
+ if (LU.Kind == LSRUse::ICmpZero)
LF.UserInst->setOperand(0, FullV);
else
LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
@@ -4750,11 +4799,11 @@ void LSRInstance::ImplementSolution(
}
// Expand the new value definitions and update the users.
- for (const LSRFixup &Fixup : Fixups) {
- Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts);
-
- Changed = true;
- }
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
+ for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
+ Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], Rewriter, DeadInsts);
+ Changed = true;
+ }
for (const IVChain &Chain : IVChainVec) {
GenerateIVChain(Chain, Rewriter, DeadInsts);
@@ -4898,11 +4947,12 @@ void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
void LSRInstance::print_fixups(raw_ostream &OS) const {
OS << "LSR is examining the following fixup sites:\n";
- for (const LSRFixup &LF : Fixups) {
- dbgs() << " ";
- LF.print(OS);
- OS << '\n';
- }
+ for (const LSRUse &LU : Uses)
+ for (const LSRFixup &LF : LU.Fixups) {
+ dbgs() << " ";
+ LF.print(OS);
+ OS << '\n';
+ }
}
void LSRInstance::print_uses(raw_ostream &OS) const {
@@ -4935,6 +4985,7 @@ namespace {
class LoopStrengthReduce : public LoopPass {
public:
static char ID; // Pass ID, replacement for typeid
+
LoopStrengthReduce();
private:
@@ -4942,24 +4993,7 @@ private:
void getAnalysisUsage(AnalysisUsage &AU) const override;
};
-}
-
-char LoopStrengthReduce::ID = 0;
-INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
- "Loop Strength Reduction", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(IVUsersWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
-INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
- "Loop Strength Reduction", false, false)
-
-
-Pass *llvm::createLoopStrengthReducePass() {
- return new LoopStrengthReduce();
-}
+} // end anonymous namespace
LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
@@ -4985,16 +5019,9 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<TargetTransformInfoWrapperPass>();
}
-bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
- if (skipLoop(L))
- return false;
-
- auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
- auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
- *L->getHeader()->getParent());
+static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
+ DominatorTree &DT, LoopInfo &LI,
+ const TargetTransformInfo &TTI) {
bool Changed = false;
// Run the main LSR transformation.
@@ -5005,15 +5032,11 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
if (EnablePhiElim && L->isLoopSimplifyForm()) {
SmallVector<WeakVH, 16> DeadInsts;
const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
- SCEVExpander Rewriter(getAnalysis<ScalarEvolutionWrapperPass>().getSE(), DL,
- "lsr");
+ SCEVExpander Rewriter(SE, DL, "lsr");
#ifndef NDEBUG
Rewriter.setDebugType(DEBUG_TYPE);
#endif
- unsigned numFolded = Rewriter.replaceCongruentIVs(
- L, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(), DeadInsts,
- &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
- *L->getHeader()->getParent()));
+ unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
if (numFolded) {
Changed = true;
DeleteTriviallyDeadInstructions(DeadInsts);
@@ -5022,3 +5045,40 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
}
return Changed;
}
+
+bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
+ if (skipLoop(L))
+ return false;
+
+ auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
+ auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *L->getHeader()->getParent());
+ return ReduceLoopStrength(L, IU, SE, DT, LI, TTI);
+}
+
+PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
+ if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
+ AR.DT, AR.LI, AR.TTI))
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
+
+char LoopStrengthReduce::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
+ "Loop Strength Reduction", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(IVUsersWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
+ "Loop Strength Reduction", false, false)
+
+Pass *llvm::createLoopStrengthReducePass() { return new LoopStrengthReduce(); }
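With the new-PM entry point added above, the pass can also be scheduled outside the legacy pass manager. A minimal sketch, assuming the function-to-loop adaptor from LoopPassManager.h (which this patch already includes) is available in this revision; the helper name is made up:

#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Scalar/LoopStrengthReduce.h"

// Hypothetical pipeline helper, not part of this patch.
static void addLSRToPipeline(llvm::FunctionPassManager &FPM) {
  // Wrap the loop pass so it can run inside a function pass pipeline; the
  // adaptor supplies the LoopStandardAnalysisResults and LPMUpdater that
  // LoopStrengthReducePass::run expects.
  FPM.addPass(llvm::createFunctionToLoopPassAdaptor(
      llvm::LoopStrengthReducePass()));
}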
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 91af4a1..c7f9122 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -12,6 +12,7 @@
// counts of loops easily.
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar/LoopUnrollPass.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
@@ -19,11 +20,10 @@
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/LoopUnrollAnalyzer.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -32,6 +32,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
#include <climits>
@@ -45,16 +46,14 @@ static cl::opt<unsigned>
UnrollThreshold("unroll-threshold", cl::Hidden,
cl::desc("The baseline cost threshold for loop unrolling"));
-static cl::opt<unsigned> UnrollPercentDynamicCostSavedThreshold(
- "unroll-percent-dynamic-cost-saved-threshold", cl::init(50), cl::Hidden,
- cl::desc("The percentage of estimated dynamic cost which must be saved by "
- "unrolling to allow unrolling up to the max threshold."));
-
-static cl::opt<unsigned> UnrollDynamicCostSavingsDiscount(
- "unroll-dynamic-cost-savings-discount", cl::init(100), cl::Hidden,
- cl::desc("This is the amount discounted from the total unroll cost when "
- "the unrolled form has a high dynamic cost savings (triggered by "
- "the '-unroll-perecent-dynamic-cost-saved-threshold' flag)."));
+static cl::opt<unsigned> UnrollMaxPercentThresholdBoost(
+ "unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden,
+ cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied "
+ "to the threshold when aggressively unrolling a loop due to the "
+ "dynamic cost savings. If completely unrolling a loop will reduce "
+ "the total runtime from X to Y, we boost the loop unroll "
+ "threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, "
+ "X/Y). This limit avoids excessive code bloat."));
static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(
"unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden,
@@ -90,43 +89,59 @@ static cl::opt<bool>
UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden,
cl::desc("Unroll loops with run-time trip counts"));
+static cl::opt<unsigned> UnrollMaxUpperBound(
+ "unroll-max-upperbound", cl::init(8), cl::Hidden,
+ cl::desc(
+ "The max of trip count upper bound that is considered in unrolling"));
+
static cl::opt<unsigned> PragmaUnrollThreshold(
"pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,
cl::desc("Unrolled size limit for loops with an unroll(full) or "
"unroll_count pragma."));
+static cl::opt<unsigned> FlatLoopTripCountThreshold(
+ "flat-loop-tripcount-threshold", cl::init(5), cl::Hidden,
+ cl::desc("If the runtime tripcount for the loop is lower than the "
+ "threshold, the loop is considered as flat and will be less "
+ "aggressively unrolled."));
+
+static cl::opt<bool>
+ UnrollAllowPeeling("unroll-allow-peeling", cl::Hidden,
+ cl::desc("Allows loops to be peeled when the dynamic "
+ "trip count is known to be low."));
+
/// A magic value for use with the Threshold parameter to indicate
/// that the loop unroll should be performed regardless of how much
/// code expansion would result.
static const unsigned NoThreshold = UINT_MAX;
-/// Default unroll count for loops with run-time trip count if
-/// -unroll-count is not set
-static const unsigned DefaultUnrollRuntimeCount = 8;
-
/// Gather the various unrolling parameters based on the defaults, compiler
/// flags, TTI overrides and user specified parameters.
static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
Loop *L, const TargetTransformInfo &TTI, Optional<unsigned> UserThreshold,
Optional<unsigned> UserCount, Optional<bool> UserAllowPartial,
- Optional<bool> UserRuntime) {
+ Optional<bool> UserRuntime, Optional<bool> UserUpperBound) {
TargetTransformInfo::UnrollingPreferences UP;
// Set up the defaults
UP.Threshold = 150;
- UP.PercentDynamicCostSavedThreshold = 50;
- UP.DynamicCostSavingsDiscount = 100;
+ UP.MaxPercentThresholdBoost = 400;
UP.OptSizeThreshold = 0;
UP.PartialThreshold = UP.Threshold;
UP.PartialOptSizeThreshold = 0;
UP.Count = 0;
+ UP.PeelCount = 0;
+ UP.DefaultUnrollRuntimeCount = 8;
UP.MaxCount = UINT_MAX;
UP.FullUnrollMaxCount = UINT_MAX;
+ UP.BEInsns = 2;
UP.Partial = false;
UP.Runtime = false;
UP.AllowRemainder = true;
UP.AllowExpensiveTripCount = false;
UP.Force = false;
+ UP.UpperBound = false;
+ UP.AllowPeeling = false;
// Override with any target specific settings
TTI.getUnrollingPreferences(L, UP);
@@ -142,11 +157,8 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
UP.Threshold = UnrollThreshold;
UP.PartialThreshold = UnrollThreshold;
}
- if (UnrollPercentDynamicCostSavedThreshold.getNumOccurrences() > 0)
- UP.PercentDynamicCostSavedThreshold =
- UnrollPercentDynamicCostSavedThreshold;
- if (UnrollDynamicCostSavingsDiscount.getNumOccurrences() > 0)
- UP.DynamicCostSavingsDiscount = UnrollDynamicCostSavingsDiscount;
+ if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0)
+ UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost;
if (UnrollMaxCount.getNumOccurrences() > 0)
UP.MaxCount = UnrollMaxCount;
if (UnrollFullMaxCount.getNumOccurrences() > 0)
@@ -157,6 +169,10 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
UP.AllowRemainder = UnrollAllowRemainder;
if (UnrollRuntime.getNumOccurrences() > 0)
UP.Runtime = UnrollRuntime;
+ if (UnrollMaxUpperBound == 0)
+ UP.UpperBound = false;
+ if (UnrollAllowPeeling.getNumOccurrences() > 0)
+ UP.AllowPeeling = UnrollAllowPeeling;
// Apply user values provided by argument
if (UserThreshold.hasValue()) {
@@ -169,6 +185,8 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
UP.Partial = *UserAllowPartial;
if (UserRuntime.hasValue())
UP.Runtime = *UserRuntime;
+ if (UserUpperBound.hasValue())
+ UP.UpperBound = *UserUpperBound;
return UP;
}
@@ -210,11 +228,11 @@ struct UnrolledInstStateKeyInfo {
namespace {
struct EstimatedUnrollCost {
/// \brief The estimated cost after unrolling.
- int UnrolledCost;
+ unsigned UnrolledCost;
/// \brief The estimated dynamic cost of executing the instructions in the
/// rolled form.
- int RolledDynamicCost;
+ unsigned RolledDynamicCost;
};
}
@@ -234,7 +252,7 @@ struct EstimatedUnrollCost {
static Optional<EstimatedUnrollCost>
analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
ScalarEvolution &SE, const TargetTransformInfo &TTI,
- int MaxUnrolledLoopSize) {
+ unsigned MaxUnrolledLoopSize) {
// We want to be able to scale offsets by the trip count and add more offsets
// to them without checking for overflows, and we already don't want to
// analyze *massive* trip counts, so we force the max to be reasonably small.
@@ -258,14 +276,14 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
// The estimated cost of the unrolled form of the loop. We try to estimate
// this by simplifying as much as we can while computing the estimate.
- int UnrolledCost = 0;
+ unsigned UnrolledCost = 0;
// We also track the estimated dynamic (that is, actually executed) cost in
// the rolled form. This helps identify cases when the savings from unrolling
// aren't just exposing dead control flows, but actual reduced dynamic
// instructions due to the simplifications which we expect to occur after
// unrolling.
- int RolledDynamicCost = 0;
+ unsigned RolledDynamicCost = 0;
// We track the simplification of each instruction in each iteration. We use
// this to recursively merge costs into the unrolled cost on-demand so that
@@ -412,6 +430,9 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
// it. We don't change the actual IR, just count optimization
// opportunities.
for (Instruction &I : *BB) {
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+
// Track this instruction's expected baseline cost when executing the
// rolled loop form.
RolledDynamicCost += TTI.getUserCost(&I);
@@ -429,16 +450,16 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
if (IsFree)
continue;
- // If the instruction might have a side-effect recursively account for
- // the cost of it and all the instructions leading up to it.
- if (I.mayHaveSideEffects())
- AddCostRecursively(I, Iteration);
-
// Can't properly model a cost of a call.
// FIXME: With a proper cost model we should be able to do it.
if(isa<CallInst>(&I))
return None;
+ // If the instruction might have a side-effect recursively account for
+ // the cost of it and all the instructions leading up to it.
+ if (I.mayHaveSideEffects())
+ AddCostRecursively(I, Iteration);
+
// If unrolled body turns out to be too big, bail out.
if (UnrolledCost > MaxUnrolledLoopSize) {
DEBUG(dbgs() << " Exceeded threshold.. exiting.\n"
@@ -529,7 +550,7 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
bool &NotDuplicatable, bool &Convergent,
const TargetTransformInfo &TTI,
- AssumptionCache *AC) {
+ AssumptionCache *AC, unsigned BEInsns) {
SmallPtrSet<const Value *, 32> EphValues;
CodeMetrics::collectEphemeralValues(L, AC, EphValues);
@@ -548,7 +569,7 @@ static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
// that each loop has at least three instructions (likely a conditional
// branch, a comparison feeding that branch, and some kind of loop increment
// feeding that comparison instruction).
- LoopSize = std::max(LoopSize, 3u);
+ LoopSize = std::max(LoopSize, BEInsns + 1);
return LoopSize;
}
@@ -635,70 +656,38 @@ static void SetLoopAlreadyUnrolled(Loop *L) {
L->setLoopID(NewLoopID);
}
-static bool canUnrollCompletely(Loop *L, unsigned Threshold,
- unsigned PercentDynamicCostSavedThreshold,
- unsigned DynamicCostSavingsDiscount,
- uint64_t UnrolledCost,
- uint64_t RolledDynamicCost) {
- if (Threshold == NoThreshold) {
- DEBUG(dbgs() << " Can fully unroll, because no threshold is set.\n");
- return true;
- }
-
- if (UnrolledCost <= Threshold) {
- DEBUG(dbgs() << " Can fully unroll, because unrolled cost: "
- << UnrolledCost << "<" << Threshold << "\n");
- return true;
- }
-
- assert(UnrolledCost && "UnrolledCost can't be 0 at this point.");
- assert(RolledDynamicCost >= UnrolledCost &&
- "Cannot have a higher unrolled cost than a rolled cost!");
-
- // Compute the percentage of the dynamic cost in the rolled form that is
- // saved when unrolled. If unrolling dramatically reduces the estimated
- // dynamic cost of the loop, we use a higher threshold to allow more
- // unrolling.
- unsigned PercentDynamicCostSaved =
- (uint64_t)(RolledDynamicCost - UnrolledCost) * 100ull / RolledDynamicCost;
-
- if (PercentDynamicCostSaved >= PercentDynamicCostSavedThreshold &&
- (int64_t)UnrolledCost - (int64_t)DynamicCostSavingsDiscount <=
- (int64_t)Threshold) {
- DEBUG(dbgs() << " Can fully unroll, because unrolling will reduce the "
- "expected dynamic cost by "
- << PercentDynamicCostSaved << "% (threshold: "
- << PercentDynamicCostSavedThreshold << "%)\n"
- << " and the unrolled cost (" << UnrolledCost
- << ") is less than the max threshold ("
- << DynamicCostSavingsDiscount << ").\n");
- return true;
- }
+// Computes the boosting factor for complete unrolling.
+// If fully unrolling the loop would save a lot of RolledDynamicCost, it would
+// be beneficial to fully unroll the loop even if UnrolledCost is large. We
+// use (RolledDynamicCost / UnrolledCost) to model the unroll benefits to adjust
+// the unroll threshold.
+static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost,
+ unsigned MaxPercentThresholdBoost) {
+ if (Cost.RolledDynamicCost >= UINT_MAX / 100)
+ return 100;
+ else if (Cost.UnrolledCost != 0)
+ // The boosting factor is RolledDynamicCost / UnrolledCost
+ return std::min(100 * Cost.RolledDynamicCost / Cost.UnrolledCost,
+ MaxPercentThresholdBoost);
+ else
+ return MaxPercentThresholdBoost;
+}
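A quick numeric check of the boosting factor above (all figures hypothetical):

// analyzeLoopUnrollCost reports RolledDynamicCost = 300, UnrolledCost = 60,
// and MaxPercentThresholdBoost = 400 (the default).
// Boost = min(100 * 300 / 60, 400) = min(500, 400) = 400, so the caller
// accepts full unrolling while UnrolledCost < Threshold * 400 / 100, i.e.
// up to four times the baseline threshold.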
- DEBUG(dbgs() << " Too large to fully unroll:\n");
- DEBUG(dbgs() << " Threshold: " << Threshold << "\n");
- DEBUG(dbgs() << " Max threshold: " << DynamicCostSavingsDiscount << "\n");
- DEBUG(dbgs() << " Percent cost saved threshold: "
- << PercentDynamicCostSavedThreshold << "%\n");
- DEBUG(dbgs() << " Unrolled cost: " << UnrolledCost << "\n");
- DEBUG(dbgs() << " Rolled dynamic cost: " << RolledDynamicCost << "\n");
- DEBUG(dbgs() << " Percent cost saved: " << PercentDynamicCostSaved
- << "\n");
- return false;
+// Returns loop size estimation for unrolled loop.
+static uint64_t getUnrolledLoopSize(
+ unsigned LoopSize,
+ TargetTransformInfo::UnrollingPreferences &UP) {
+ assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
+ return (uint64_t)(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
}
// Returns true if unroll count was set explicitly.
// Calculates unroll count and writes it to UP.Count.
-static bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
- DominatorTree &DT, LoopInfo *LI,
- ScalarEvolution *SE, unsigned TripCount,
- unsigned TripMultiple, unsigned LoopSize,
- TargetTransformInfo::UnrollingPreferences &UP) {
- // BEInsns represents number of instructions optimized when "back edge"
- // becomes "fall through" in unrolled loop.
- // For now we count a conditional branch on a backedge and a comparison
- // feeding it.
- unsigned BEInsns = 2;
+static bool computeUnrollCount(
+ Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
+ ScalarEvolution *SE, OptimizationRemarkEmitter *ORE, unsigned &TripCount,
+ unsigned MaxTripCount, unsigned &TripMultiple, unsigned LoopSize,
+ TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) {
// Check for explicit Count.
// 1st priority is unroll count set by "unroll-count" option.
bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
@@ -706,8 +695,7 @@ static bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
UP.Count = UnrollCount;
UP.AllowExpensiveTripCount = true;
UP.Force = true;
- if (UP.AllowRemainder &&
- (LoopSize - BEInsns) * UP.Count + BEInsns < UP.Threshold)
+ if (UP.AllowRemainder && getUnrolledLoopSize(LoopSize, UP) < UP.Threshold)
return true;
}
@@ -719,13 +707,13 @@ static bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
UP.AllowExpensiveTripCount = true;
UP.Force = true;
if (UP.AllowRemainder &&
- (LoopSize - BEInsns) * UP.Count + BEInsns < PragmaUnrollThreshold)
+ getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
return true;
}
bool PragmaFullUnroll = HasUnrollFullPragma(L);
if (PragmaFullUnroll && TripCount != 0) {
UP.Count = TripCount;
- if ((LoopSize - BEInsns) * UP.Count + BEInsns < PragmaUnrollThreshold)
+ if (getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
return false;
}
@@ -733,11 +721,6 @@ static bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll ||
PragmaEnableUnroll || UserUnrollCount;
- uint64_t UnrolledSize;
- DebugLoc LoopLoc = L->getStartLoc();
- Function *F = L->getHeader()->getParent();
- LLVMContext &Ctx = F->getContext();
-
if (ExplicitUnroll && TripCount != 0) {
// If the loop has an unrolling pragma, we want to be more aggressive with
// unrolling limits. Set thresholds to at least the PragmaThreshold value
@@ -748,38 +731,48 @@ static bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
}
// 3rd priority is full unroll count.
- // Full unroll make sense only when TripCount could be staticaly calculated.
+ // Full unroll makes sense only when TripCount or its upper bound could be
+ // statically calculated.
// Also we need to check if we exceed FullUnrollMaxCount.
- if (TripCount && TripCount <= UP.FullUnrollMaxCount) {
+ // If using the upper bound to unroll, TripMultiple should be set to 1 because
+// we do not know when the loop may exit.
+ // MaxTripCount and ExactTripCount cannot both be non zero since we only
+ // compute the former when the latter is zero.
+ unsigned ExactTripCount = TripCount;
+ assert((ExactTripCount == 0 || MaxTripCount == 0) &&
+ "ExtractTripCound and MaxTripCount cannot both be non zero.");
+ unsigned FullUnrollTripCount = ExactTripCount ? ExactTripCount : MaxTripCount;
+ UP.Count = FullUnrollTripCount;
+ if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
// When computing the unrolled size, note that BEInsns are not replicated
// like the rest of the loop body.
- UnrolledSize = (uint64_t)(LoopSize - BEInsns) * TripCount + BEInsns;
- if (canUnrollCompletely(L, UP.Threshold, 100, UP.DynamicCostSavingsDiscount,
- UnrolledSize, UnrolledSize)) {
- UP.Count = TripCount;
+ if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) {
+ UseUpperBound = (MaxTripCount == FullUnrollTripCount);
+ TripCount = FullUnrollTripCount;
+ TripMultiple = UP.UpperBound ? 1 : TripMultiple;
return ExplicitUnroll;
} else {
// The loop isn't that small, but we still can fully unroll it if that
// helps to remove a significant number of instructions.
// To check that, run additional analysis on the loop.
if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
- L, TripCount, DT, *SE, TTI,
- UP.Threshold + UP.DynamicCostSavingsDiscount))
- if (canUnrollCompletely(L, UP.Threshold,
- UP.PercentDynamicCostSavedThreshold,
- UP.DynamicCostSavingsDiscount,
- Cost->UnrolledCost, Cost->RolledDynamicCost)) {
- UP.Count = TripCount;
+ L, FullUnrollTripCount, DT, *SE, TTI,
+ UP.Threshold * UP.MaxPercentThresholdBoost / 100)) {
+ unsigned Boost =
+ getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
+ if (Cost->UnrolledCost < UP.Threshold * Boost / 100) {
+ UseUpperBound = (MaxTripCount == FullUnrollTripCount);
+ TripCount = FullUnrollTripCount;
+ TripMultiple = UP.UpperBound ? 1 : TripMultiple;
return ExplicitUnroll;
}
+ }
}
}
// 4th priority is partial unrolling.
// Try partial unroll only when TripCount could be statically calculated.
if (TripCount) {
- if (UP.Count == 0)
- UP.Count = TripCount;
UP.Partial |= ExplicitUnroll;
if (!UP.Partial) {
DEBUG(dbgs() << " will not try to unroll partially because "
@@ -787,12 +780,14 @@ static bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
UP.Count = 0;
return false;
}
+ if (UP.Count == 0)
+ UP.Count = TripCount;
if (UP.PartialThreshold != NoThreshold) {
// Reduce unroll count to be modulo of TripCount for partial unrolling.
- UnrolledSize = (uint64_t)(LoopSize - BEInsns) * UP.Count + BEInsns;
- if (UnrolledSize > UP.PartialThreshold)
- UP.Count = (std::max(UP.PartialThreshold, 3u) - BEInsns) /
- (LoopSize - BEInsns);
+ if (getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
+ UP.Count =
+ (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
+ (LoopSize - UP.BEInsns);
if (UP.Count > UP.MaxCount)
UP.Count = UP.MaxCount;
while (UP.Count != 0 && TripCount % UP.Count != 0)
@@ -802,19 +797,18 @@ static bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
// largest power-of-two factor that satisfies the threshold limit.
// As we'll create fixup loop, do the type of unrolling only if
// remainder loop is allowed.
- UP.Count = DefaultUnrollRuntimeCount;
- UnrolledSize = (LoopSize - BEInsns) * UP.Count + BEInsns;
- while (UP.Count != 0 && UnrolledSize > UP.PartialThreshold) {
+ UP.Count = UP.DefaultUnrollRuntimeCount;
+ while (UP.Count != 0 &&
+ getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
UP.Count >>= 1;
- UnrolledSize = (LoopSize - BEInsns) * UP.Count + BEInsns;
- }
}
if (UP.Count < 2) {
if (PragmaEnableUnroll)
- emitOptimizationRemarkMissed(
- Ctx, DEBUG_TYPE, *F, LoopLoc,
- "Unable to unroll loop as directed by unroll(enable) pragma "
- "because unrolled size is too large.");
+ ORE->emit(
+ OptimizationRemarkMissed(DEBUG_TYPE, "UnrollAsDirectedTooLarge",
+ L->getStartLoc(), L->getHeader())
+ << "Unable to unroll loop as directed by unroll(enable) pragma "
+ "because unrolled size is too large.");
UP.Count = 0;
}
} else {
@@ -822,26 +816,48 @@ static bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
}
if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
UP.Count != TripCount)
- emitOptimizationRemarkMissed(
- Ctx, DEBUG_TYPE, *F, LoopLoc,
- "Unable to fully unroll loop as directed by unroll pragma because "
- "unrolled size is too large.");
+ ORE->emit(
+ OptimizationRemarkMissed(DEBUG_TYPE, "FullUnrollAsDirectedTooLarge",
+ L->getStartLoc(), L->getHeader())
+ << "Unable to fully unroll loop as directed by unroll pragma because "
+ "unrolled size is too large.");
return ExplicitUnroll;
}
assert(TripCount == 0 &&
"All cases when TripCount is constant should be covered here.");
if (PragmaFullUnroll)
- emitOptimizationRemarkMissed(
- Ctx, DEBUG_TYPE, *F, LoopLoc,
- "Unable to fully unroll loop as directed by unroll(full) pragma "
- "because loop has a runtime trip count.");
+ ORE->emit(
+ OptimizationRemarkMissed(DEBUG_TYPE,
+ "CantFullUnrollAsDirectedRuntimeTripCount",
+ L->getStartLoc(), L->getHeader())
+ << "Unable to fully unroll loop as directed by unroll(full) pragma "
+ "because loop has a runtime trip count.");
+
+ // 5th priority is loop peeling
+ computePeelCount(L, LoopSize, UP);
+ if (UP.PeelCount) {
+ UP.Runtime = false;
+ UP.Count = 1;
+ return ExplicitUnroll;
+ }
- // 5th priority is runtime unrolling.
+ // 6th priority is runtime unrolling.
// Don't unroll a runtime trip count loop when it is disabled.
if (HasRuntimeUnrollDisablePragma(L)) {
UP.Count = 0;
return false;
}
+
+ // Check if the runtime trip count is too small when profile is available.
+ if (L->getHeader()->getParent()->getEntryCount()) {
+ if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
+ if (*ProfileTripCount < FlatLoopTripCountThreshold)
+ return false;
+ else
+ UP.AllowExpensiveTripCount = true;
+ }
+ }
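An illustration of the profile check above (threshold value assumed; FlatLoopTripCountThreshold is defined outside this hunk, say 5): if the profile estimates the loop runs about 3 times per entry, runtime unrolling is skipped; with an estimated 100 iterations the check instead sets AllowExpensiveTripCount and unrolling proceeds.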
+
// Reduce count based on the type of unrolling and the threshold values.
UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount;
if (!UP.Runtime) {
@@ -851,15 +867,13 @@ static bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
return false;
}
if (UP.Count == 0)
- UP.Count = DefaultUnrollRuntimeCount;
- UnrolledSize = (LoopSize - BEInsns) * UP.Count + BEInsns;
+ UP.Count = UP.DefaultUnrollRuntimeCount;
// Reduce unroll count to be the largest power-of-two factor of
// the original count which satisfies the threshold limit.
- while (UP.Count != 0 && UnrolledSize > UP.PartialThreshold) {
+ while (UP.Count != 0 &&
+ getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
UP.Count >>= 1;
- UnrolledSize = (LoopSize - BEInsns) * UP.Count + BEInsns;
- }
#ifndef NDEBUG
unsigned OrigCount = UP.Count;
@@ -874,16 +888,19 @@ static bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
"multiple, "
<< TripMultiple << ". Reducing unroll count from "
<< OrigCount << " to " << UP.Count << ".\n");
+ using namespace ore;
if (PragmaCount > 0 && !UP.AllowRemainder)
- emitOptimizationRemarkMissed(
- Ctx, DEBUG_TYPE, *F, LoopLoc,
- Twine("Unable to unroll loop the number of times directed by "
- "unroll_count pragma because remainder loop is restricted "
- "(that could architecture specific or because the loop "
- "contains a convergent instruction) and so must have an unroll "
- "count that divides the loop trip multiple of ") +
- Twine(TripMultiple) + ". Unrolling instead " + Twine(UP.Count) +
- " time(s).");
+ ORE->emit(
+ OptimizationRemarkMissed(DEBUG_TYPE,
+ "DifferentUnrollCountFromDirected",
+ L->getStartLoc(), L->getHeader())
+ << "Unable to unroll loop the number of times directed by "
+ "unroll_count pragma because remainder loop is restricted "
+ "(that could architecture specific or because the loop "
+ "contains a convergent instruction) and so must have an unroll "
+ "count that divides the loop trip multiple of "
+ << NV("TripMultiple", TripMultiple) << ". Unrolling instead "
+ << NV("UnrollCount", UP.Count) << " time(s).");
}
if (UP.Count > UP.MaxCount)
@@ -896,22 +913,34 @@ static bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
ScalarEvolution *SE, const TargetTransformInfo &TTI,
- AssumptionCache &AC, bool PreserveLCSSA,
+ AssumptionCache &AC, OptimizationRemarkEmitter &ORE,
+ bool PreserveLCSSA,
Optional<unsigned> ProvidedCount,
Optional<unsigned> ProvidedThreshold,
Optional<bool> ProvidedAllowPartial,
- Optional<bool> ProvidedRuntime) {
+ Optional<bool> ProvidedRuntime,
+ Optional<bool> ProvidedUpperBound) {
DEBUG(dbgs() << "Loop Unroll: F[" << L->getHeader()->getParent()->getName()
<< "] Loop %" << L->getHeader()->getName() << "\n");
- if (HasUnrollDisablePragma(L)) {
+ if (HasUnrollDisablePragma(L))
+ return false;
+ if (!L->isLoopSimplifyForm()) {
+ DEBUG(
+ dbgs() << " Not unrolling loop which is not in loop-simplify form.\n");
return false;
}
unsigned NumInlineCandidates;
bool NotDuplicatable;
bool Convergent;
+ TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
+ L, TTI, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial,
+ ProvidedRuntime, ProvidedUpperBound);
+ // Exit early if unrolling is disabled.
+ if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0))
+ return false;
unsigned LoopSize = ApproximateLoopSize(
- L, NumInlineCandidates, NotDuplicatable, Convergent, TTI, &AC);
+ L, NumInlineCandidates, NotDuplicatable, Convergent, TTI, &AC, UP.BEInsns);
DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");
if (NotDuplicatable) {
DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable"
@@ -922,14 +951,10 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
return false;
}
- if (!L->isLoopSimplifyForm()) {
- DEBUG(
- dbgs() << " Not unrolling loop which is not in loop-simplify form.\n");
- return false;
- }
// Find trip count and trip multiple if count is not available
unsigned TripCount = 0;
+ unsigned MaxTripCount = 0;
unsigned TripMultiple = 1;
// If there are multiple exiting blocks but one of them is the latch, use the
// latch for the trip count estimation. Otherwise insist on a single exiting
@@ -942,10 +967,6 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock);
}
- TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
- L, TTI, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial,
- ProvidedRuntime);
-
// If the loop contains a convergent operation, the prelude we'd add
// to do the first few instructions before we hit the unrolled loop
// is unsafe -- it adds a control-flow dependency to the convergent
@@ -961,8 +982,31 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
if (Convergent)
UP.AllowRemainder = false;
- bool IsCountSetExplicitly = computeUnrollCount(L, TTI, DT, LI, SE, TripCount,
- TripMultiple, LoopSize, UP);
+ // Try to find the trip count upper bound if we cannot find the exact trip
+ // count.
+ bool MaxOrZero = false;
+ if (!TripCount) {
+ MaxTripCount = SE->getSmallConstantMaxTripCount(L);
+ MaxOrZero = SE->isBackedgeTakenCountMaxOrZero(L);
+ // We can unroll by the upper bound amount if it's generally allowed or if
+ // we know that the loop is executed either the upper bound or zero times.
+ // (MaxOrZero unrolling keeps only the first loop test, so the number of
+ // loop tests remains the same compared to the non-unrolled version, whereas
+ // the generic upper bound unrolling keeps all but the last loop test so the
+ // number of loop tests goes up which may end up being worse on targets with
+ // constrained branch predictor resources, so it is controlled by an option.)
+ // In addition we only unroll small upper bounds.
+ if (!(UP.UpperBound || MaxOrZero) || MaxTripCount > UnrollMaxUpperBound) {
+ MaxTripCount = 0;
+ }
+ }
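A sketch of the distinction drawn in the comment above (hypothetical loop, not from the patch): a loop that scans an array of at most 8 elements and may break early has no exact trip count, but SE can report MaxTripCount = 8. Generic upper-bound unrolling emits all eight bodies and keeps an exit test after every body but the last; MaxOrZero unrolling applies when the loop runs either exactly 8 times or not at all, so only the first test is kept. Both are considered here only if 8 does not exceed UnrollMaxUpperBound.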
+
+ // computeUnrollCount() decides whether it is beneficial to use upper bound to
+ // fully unroll the loop.
+ bool UseUpperBound = false;
+ bool IsCountSetExplicitly =
+ computeUnrollCount(L, TTI, DT, LI, SE, &ORE, TripCount, MaxTripCount,
+ TripMultiple, LoopSize, UP, UseUpperBound);
if (!UP.Count)
return false;
// Unroll factor (Count) must be less or equal to TripCount.
@@ -971,14 +1015,18 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
// Unroll the loop.
if (!UnrollLoop(L, UP.Count, TripCount, UP.Force, UP.Runtime,
- UP.AllowExpensiveTripCount, TripMultiple, LI, SE, &DT, &AC,
+ UP.AllowExpensiveTripCount, UseUpperBound, MaxOrZero,
+ TripMultiple, UP.PeelCount, LI, SE, &DT, &AC, &ORE,
PreserveLCSSA))
return false;
// If loop has an unroll count pragma or unrolled by explicitly set count
// mark loop as unrolled to prevent unrolling beyond that requested.
- if (IsCountSetExplicitly)
+ // If the loop was peeled, we already "used up" the profile information
+ // we had, so we don't want to unroll or peel again.
+ if (IsCountSetExplicitly || UP.PeelCount)
SetLoopAlreadyUnrolled(L);
+
return true;
}
@@ -988,10 +1036,11 @@ public:
static char ID; // Pass ID, replacement for typeid
LoopUnroll(Optional<unsigned> Threshold = None,
Optional<unsigned> Count = None,
- Optional<bool> AllowPartial = None, Optional<bool> Runtime = None)
+ Optional<bool> AllowPartial = None, Optional<bool> Runtime = None,
+ Optional<bool> UpperBound = None)
: LoopPass(ID), ProvidedCount(std::move(Count)),
ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
- ProvidedRuntime(Runtime) {
+ ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound) {
initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
}
@@ -999,6 +1048,7 @@ public:
Optional<unsigned> ProvidedThreshold;
Optional<bool> ProvidedAllowPartial;
Optional<bool> ProvidedRuntime;
+ Optional<bool> ProvidedUpperBound;
bool runOnLoop(Loop *L, LPPassManager &) override {
if (skipLoop(L))
@@ -1012,11 +1062,16 @@ public:
const TargetTransformInfo &TTI =
getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but ORE cannot be preserved (see comment before the pass definition).
+ OptimizationRemarkEmitter ORE(&F);
bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
- return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, PreserveLCSSA, ProvidedCount,
- ProvidedThreshold, ProvidedAllowPartial,
- ProvidedRuntime);
+ return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA,
+ ProvidedCount, ProvidedThreshold,
+ ProvidedAllowPartial, ProvidedRuntime,
+ ProvidedUpperBound);
}
/// This transformation requires natural loop information & requires that
@@ -1040,7 +1095,7 @@ INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial,
- int Runtime) {
+ int Runtime, int UpperBound) {
// TODO: It would make more sense for this function to take the optionals
// directly, but that's dangerous since it would silently break out of tree
// callers.
@@ -1048,9 +1103,33 @@ Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial,
Count == -1 ? None : Optional<unsigned>(Count),
AllowPartial == -1 ? None
: Optional<bool>(AllowPartial),
- Runtime == -1 ? None : Optional<bool>(Runtime));
+ Runtime == -1 ? None : Optional<bool>(Runtime),
+ UpperBound == -1 ? None : Optional<bool>(UpperBound));
}
Pass *llvm::createSimpleLoopUnrollPass() {
- return llvm::createLoopUnrollPass(-1, -1, 0, 0);
+ return llvm::createLoopUnrollPass(-1, -1, 0, 0, 0);
+}
+
+PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
+ const auto &FAM =
+ AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
+ Function *F = L.getHeader()->getParent();
+
+ auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F);
+ // FIXME: This should probably be optional rather than required.
+ if (!ORE)
+ report_fatal_error("LoopUnrollPass: OptimizationRemarkEmitterAnalysis not "
+ "cached at a higher level");
+
+ bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, &AR.SE, AR.TTI, AR.AC, *ORE,
+ /*PreserveLCSSA*/ true, ProvidedCount,
+ ProvidedThreshold, ProvidedAllowPartial,
+ ProvidedRuntime, ProvidedUpperBound);
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+ return getLoopPassPreservedAnalyses();
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
index 71980e8..76fe918 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -210,7 +210,7 @@ namespace {
bool runOnLoop(Loop *L, LPPassManager &LPM) override;
bool processCurrentLoop();
-
+ bool isUnreachableDueToPreviousUnswitching(BasicBlock *);
/// This transformation requires natural loop information & requires that
/// loop preheaders be inserted into the CFG.
///
@@ -483,6 +483,35 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
return Changed;
}
+// Return true if the BasicBlock BB is unreachable from the loop header.
+// Return false otherwise.
+bool LoopUnswitch::isUnreachableDueToPreviousUnswitching(BasicBlock *BB) {
+ auto *Node = DT->getNode(BB)->getIDom();
+ BasicBlock *DomBB = Node->getBlock();
+ while (currentLoop->contains(DomBB)) {
+ BranchInst *BInst = dyn_cast<BranchInst>(DomBB->getTerminator());
+
+ Node = DT->getNode(DomBB)->getIDom();
+ DomBB = Node->getBlock();
+
+ if (!BInst || !BInst->isConditional())
+ continue;
+
+ Value *Cond = BInst->getCondition();
+ if (!isa<ConstantInt>(Cond))
+ continue;
+
+ BasicBlock *UnreachableSucc =
+ Cond == ConstantInt::getTrue(Cond->getContext())
+ ? BInst->getSuccessor(1)
+ : BInst->getSuccessor(0);
+
+ if (DT->dominates(UnreachableSucc, BB))
+ return true;
+ }
+ return false;
+}
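Illustrative scenario (hypothetical CFG, not from the patch): after a previous unswitch rewrites a loop-invariant condition to the constant true, the branch's false successor becomes dead code that still sits inside the loop. Any block dominated by that dead successor is reported as unreachable by this helper, so processCurrentLoop() skips it rather than trying to unswitch a branch that can never execute.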
+
/// Do actual work and unswitch loop if possible and profitable.
bool LoopUnswitch::processCurrentLoop() {
bool Changed = false;
@@ -593,6 +622,12 @@ bool LoopUnswitch::processCurrentLoop() {
continue;
if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ // Some branches may be rendered unreachable because of previous
+ // unswitching.
+ // Unswitch only those branches that are reachable.
+ if (isUnreachableDueToPreviousUnswitching(*I))
+ continue;
+
// If this isn't branching on an invariant condition, we can't unswitch
// it.
if (BI->isConditional()) {
@@ -742,42 +777,6 @@ static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM,
return &New;
}
-static void copyMetadata(Instruction *DstInst, const Instruction *SrcInst,
- bool Swapped) {
- if (!SrcInst || !SrcInst->hasMetadata())
- return;
-
- SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
- SrcInst->getAllMetadata(MDs);
- for (auto &MD : MDs) {
- switch (MD.first) {
- default:
- break;
- case LLVMContext::MD_prof:
- if (Swapped && MD.second->getNumOperands() == 3 &&
- isa<MDString>(MD.second->getOperand(0))) {
- MDString *MDName = cast<MDString>(MD.second->getOperand(0));
- if (MDName->getString() == "branch_weights") {
- auto *ValT = cast_or_null<ConstantAsMetadata>(
- MD.second->getOperand(1))->getValue();
- auto *ValF = cast_or_null<ConstantAsMetadata>(
- MD.second->getOperand(2))->getValue();
- assert(ValT && ValF && "Invalid Operands of branch_weights");
- auto NewMD =
- MDBuilder(DstInst->getParent()->getContext())
- .createBranchWeights(cast<ConstantInt>(ValF)->getZExtValue(),
- cast<ConstantInt>(ValT)->getZExtValue());
- MD.second = NewMD;
- }
- }
- // fallthrough.
- case LLVMContext::MD_make_implicit:
- case LLVMContext::MD_dbg:
- DstInst->setMetadata(MD.first, MD.second);
- }
- }
-}
-
/// Emit a conditional branch on two values if LIC == Val, branch to TrueDst,
/// otherwise branch to FalseDest. Insert the code immediately before InsertPt.
void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
@@ -799,8 +798,10 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
}
// Insert the new branch.
- BranchInst *BI = BranchInst::Create(TrueDest, FalseDest, BranchVal, InsertPt);
- copyMetadata(BI, TI, Swapped);
+ BranchInst *BI =
+ IRBuilder<>(InsertPt).CreateCondBr(BranchVal, TrueDest, FalseDest, TI);
+ if (Swapped)
+ BI->swapProfMetadata();
// If either edge is critical, split it. This helps preserve LoopSimplify
// form for enclosing loops.
@@ -1078,10 +1079,6 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
F->getBasicBlockList(),
NewBlocks[0]->getIterator(), F->end());
- // FIXME: We could register any cloned assumptions instead of clearing the
- // whole function's cache.
- AC->clear();
-
// Now we create the new Loop object for the versioned loop.
Loop *NewLoop = CloneLoop(L, L->getParentLoop(), VMap, LI, LPM);
@@ -1131,10 +1128,15 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
}
// Rewrite the code to refer to itself.
- for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i)
- for (Instruction &I : *NewBlocks[i])
+ for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) {
+ for (Instruction &I : *NewBlocks[i]) {
RemapInstruction(&I, VMap,
RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
+ }
+ }
// Rewrite the original preheader to select between versions of the loop.
BranchInst *OldBR = cast<BranchInst>(loopPreheader->getTerminator());
@@ -1380,8 +1382,8 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
Pred->getInstList().splice(BI->getIterator(), Succ->getInstList(),
Succ->begin(), Succ->end());
LPM->deleteSimpleAnalysisValue(BI, L);
- BI->eraseFromParent();
RemoveFromWorklist(BI, Worklist);
+ BI->eraseFromParent();
// Remove Succ from the loop tree.
LI->removeBlock(Succ);
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
index 0ccf0af..c23d891 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
@@ -92,8 +92,7 @@
#include "llvm/Transforms/Utils/ValueMapper.h"
#define DEBUG_TYPE "loop-versioning-licm"
-static const char* LICMVersioningMetaData =
- "llvm.loop.licm_versioning.disable";
+static const char *LICMVersioningMetaData = "llvm.loop.licm_versioning.disable";
using namespace llvm;
@@ -158,34 +157,48 @@ struct LoopVersioningLICM : public LoopPass {
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequiredID(LoopSimplifyID);
AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
}
LoopVersioningLICM()
- : LoopPass(ID), AA(nullptr), SE(nullptr), LI(nullptr), DT(nullptr),
- TLI(nullptr), LAA(nullptr), LAI(nullptr), Changed(false),
- Preheader(nullptr), CurLoop(nullptr), CurAST(nullptr),
- LoopDepthThreshold(LVLoopDepthThreshold),
+ : LoopPass(ID), AA(nullptr), SE(nullptr), LAA(nullptr), LAI(nullptr),
+ CurLoop(nullptr), LoopDepthThreshold(LVLoopDepthThreshold),
InvariantThreshold(LVInvarThreshold), LoadAndStoreCounter(0),
InvariantCounter(0), IsReadOnlyLoop(true) {
initializeLoopVersioningLICMPass(*PassRegistry::getPassRegistry());
}
+ StringRef getPassName() const override { return "Loop Versioning for LICM"; }
- AliasAnalysis *AA; // Current AliasAnalysis information
- ScalarEvolution *SE; // Current ScalarEvolution
- LoopInfo *LI; // Current LoopInfo
- DominatorTree *DT; // Dominator Tree for the current Loop.
- TargetLibraryInfo *TLI; // TargetLibraryInfo for constant folding.
- LoopAccessLegacyAnalysis *LAA; // Current LoopAccessAnalysis
- const LoopAccessInfo *LAI; // Current Loop's LoopAccessInfo
+ void reset() {
+ AA = nullptr;
+ SE = nullptr;
+ LAA = nullptr;
+ CurLoop = nullptr;
+ LoadAndStoreCounter = 0;
+ InvariantCounter = 0;
+ IsReadOnlyLoop = true;
+ CurAST.reset();
+ }
+
+ class AutoResetter {
+ public:
+ AutoResetter(LoopVersioningLICM &LVLICM) : LVLICM(LVLICM) {}
+ ~AutoResetter() { LVLICM.reset(); }
+
+ private:
+ LoopVersioningLICM &LVLICM;
+ };
- bool Changed; // Set to true when we change anything.
- BasicBlock *Preheader; // The preheader block of the current loop.
- Loop *CurLoop; // The current loop we are working on.
- AliasSetTracker *CurAST; // AliasSet information for the current loop.
- ValueToValueMap Strides;
+private:
+ AliasAnalysis *AA; // Current AliasAnalysis information
+ ScalarEvolution *SE; // Current ScalarEvolution
+ LoopAccessLegacyAnalysis *LAA; // Current LoopAccessAnalysis
+ const LoopAccessInfo *LAI; // Current Loop's LoopAccessInfo
+
+ Loop *CurLoop; // The current loop we are working on.
+ std::unique_ptr<AliasSetTracker>
+ CurAST; // AliasSet information for the current loop.
unsigned LoopDepthThreshold; // Maximum loop nest threshold
float InvariantThreshold; // Minimum invariant threshold
@@ -200,15 +213,15 @@ struct LoopVersioningLICM : public LoopPass {
bool isLoopAlreadyVisited();
void setNoAliasToLoop(Loop *);
bool instructionSafeForVersioning(Instruction *);
- const char *getPassName() const override { return "Loop Versioning"; }
};
}
/// \brief Check loop structure and confirms it's good for LoopVersioningLICM.
bool LoopVersioningLICM::legalLoopStructure() {
- // Loop must have a preheader, if not return false.
- if (!CurLoop->getLoopPreheader()) {
- DEBUG(dbgs() << " loop preheader is missing\n");
+ // Loop must be in loop simplify form.
+ if (!CurLoop->isLoopSimplifyForm()) {
+ DEBUG(
+ dbgs() << " loop is not in loop-simplify form.\n");
return false;
}
// Loop should be innermost loop, if not return false.
@@ -244,11 +257,6 @@ bool LoopVersioningLICM::legalLoopStructure() {
DEBUG(dbgs() << " loop depth is more then threshold\n");
return false;
}
- // Loop should have a dedicated exit block, if not return false.
- if (!CurLoop->hasDedicatedExits()) {
- DEBUG(dbgs() << " loop does not has dedicated exit blocks\n");
- return false;
- }
// We need to be able to compute the loop trip count in order
// to generate the bound checks.
const SCEV *ExitCount = SE->getBackedgeTakenCount(CurLoop);
@@ -505,29 +513,30 @@ void LoopVersioningLICM::setNoAliasToLoop(Loop *VerLoop) {
}
bool LoopVersioningLICM::runOnLoop(Loop *L, LPPassManager &LPM) {
+ // This will automatically release all resources held by the current
+ // LoopVersioningLICM object.
+ AutoResetter Resetter(*this);
+
if (skipLoop(L))
return false;
- Changed = false;
// Get Analysis information.
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
LAI = nullptr;
// Set Current Loop
CurLoop = L;
- // Get the preheader block.
- Preheader = L->getLoopPreheader();
- // Initial allocation
- CurAST = new AliasSetTracker(*AA);
+ CurAST.reset(new AliasSetTracker(*AA));
// Loop over the body of this loop, construct AST.
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
for (auto *Block : L->getBlocks()) {
if (LI->getLoopFor(Block) == L) // Ignore blocks in subloop.
CurAST->add(*Block); // Incorporate the specified basic block
}
+
+ bool Changed = false;
+
// Check feasibility of LoopVersioningLICM.
// If versioning found to be feasible and beneficial then proceed
// else simply return, by cleaning up memory.
@@ -535,6 +544,7 @@ bool LoopVersioningLICM::runOnLoop(Loop *L, LPPassManager &LPM) {
// Do loop versioning.
// Create memcheck for memory accessed inside loop.
// Clone original loop, and set blocks properly.
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LoopVersioning LVer(*LAI, CurLoop, LI, DT, SE, true);
LVer.versionLoop();
// Set Loop Versioning metaData for original loop.
@@ -548,8 +558,6 @@ bool LoopVersioningLICM::runOnLoop(Loop *L, LPPassManager &LPM) {
setNoAliasToLoop(LVer.getVersionedLoop());
Changed = true;
}
- // Delete allocated memory.
- delete CurAST;
return Changed;
}
@@ -564,7 +572,6 @@ INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(LoopVersioningLICM, "loop-versioning-licm",
"Loop Versioning For LICM", false, false)
diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
index 79f0db1..52975ef 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -83,9 +83,8 @@ static bool handleSwitchExpect(SwitchInst &SI) {
return true;
}
-static bool handleBranchExpect(BranchInst &BI) {
- if (BI.isUnconditional())
- return false;
+// Handle both BranchInst and SelectInst.
+template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) {
// Handle non-optimized IR code like:
// %expval = call i64 @llvm.expect.i64(i64 %conv1, i64 1)
@@ -98,9 +97,9 @@ static bool handleBranchExpect(BranchInst &BI) {
CallInst *CI;
- ICmpInst *CmpI = dyn_cast<ICmpInst>(BI.getCondition());
+ ICmpInst *CmpI = dyn_cast<ICmpInst>(BSI.getCondition());
if (!CmpI) {
- CI = dyn_cast<CallInst>(BI.getCondition());
+ CI = dyn_cast<CallInst>(BSI.getCondition());
} else {
if (CmpI->getPredicate() != CmpInst::ICMP_NE)
return false;
@@ -129,15 +128,22 @@ static bool handleBranchExpect(BranchInst &BI) {
else
Node = MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight);
- BI.setMetadata(LLVMContext::MD_prof, Node);
+ BSI.setMetadata(LLVMContext::MD_prof, Node);
if (CmpI)
CmpI->setOperand(0, ArgValue);
else
- BI.setCondition(ArgValue);
+ BSI.setCondition(ArgValue);
return true;
}
+static bool handleBranchExpect(BranchInst &BI) {
+ if (BI.isUnconditional())
+ return false;
+
+ return handleBrSelExpect<BranchInst>(BI);
+}
+
static bool lowerExpectIntrinsic(Function &F) {
bool Changed = false;
@@ -151,11 +157,19 @@ static bool lowerExpectIntrinsic(Function &F) {
ExpectIntrinsicsHandled++;
}
- // Remove llvm.expect intrinsics.
- for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
- CallInst *CI = dyn_cast<CallInst>(BI++);
- if (!CI)
+ // Remove llvm.expect intrinsics. Iterate backwards in order
+ // to process select instructions before the intrinsic gets
+ // removed.
+ for (auto BI = BB.rbegin(), BE = BB.rend(); BI != BE;) {
+ Instruction *Inst = &*BI++;
+ CallInst *CI = dyn_cast<CallInst>(Inst);
+ if (!CI) {
+ if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
+ if (handleBrSelExpect(*SI))
+ ExpectIntrinsicsHandled++;
+ }
continue;
+ }
Function *Fn = CI->getCalledFunction();
if (Fn && Fn->getIntrinsicID() == Intrinsic::expect) {
diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp
index 5749100..4f41371 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp
@@ -13,7 +13,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LowerGuardIntrinsic.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
@@ -24,6 +24,7 @@
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;
@@ -34,10 +35,11 @@ static cl::opt<uint32_t> PredicatePassBranchWeight(
"reciprocal of this value (default = 1 << 20)"));
namespace {
-struct LowerGuardIntrinsic : public FunctionPass {
+struct LowerGuardIntrinsicLegacyPass : public FunctionPass {
static char ID;
- LowerGuardIntrinsic() : FunctionPass(ID) {
- initializeLowerGuardIntrinsicPass(*PassRegistry::getPassRegistry());
+ LowerGuardIntrinsicLegacyPass() : FunctionPass(ID) {
+ initializeLowerGuardIntrinsicLegacyPassPass(
+ *PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override;
@@ -83,7 +85,7 @@ static void MakeGuardControlFlowExplicit(Function *DeoptIntrinsic,
DeoptBlockTerm->eraseFromParent();
}
-bool LowerGuardIntrinsic::runOnFunction(Function &F) {
+static bool lowerGuardIntrinsic(Function &F) {
// Check if we can cheaply rule out the possibility of not having any work to
// do.
auto *GuardDecl = F.getParent()->getFunction(
@@ -113,11 +115,23 @@ bool LowerGuardIntrinsic::runOnFunction(Function &F) {
return true;
}
-char LowerGuardIntrinsic::ID = 0;
-INITIALIZE_PASS(LowerGuardIntrinsic, "lower-guard-intrinsic",
+bool LowerGuardIntrinsicLegacyPass::runOnFunction(Function &F) {
+ return lowerGuardIntrinsic(F);
+}
+
+char LowerGuardIntrinsicLegacyPass::ID = 0;
+INITIALIZE_PASS(LowerGuardIntrinsicLegacyPass, "lower-guard-intrinsic",
"Lower the guard intrinsic to normal control flow", false,
false)
Pass *llvm::createLowerGuardIntrinsicPass() {
- return new LowerGuardIntrinsic();
+ return new LowerGuardIntrinsicLegacyPass();
+}
+
+PreservedAnalyses LowerGuardIntrinsicPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ if (lowerGuardIntrinsic(F))
+ return PreservedAnalyses::none();
+
+ return PreservedAnalyses::all();
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index d64c658..1b59014 100644
--- a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -52,7 +52,7 @@ static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx,
if (OpC->isZero()) continue; // No offset.
// Handle struct indices, which add their field offset to the pointer.
- if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+ if (StructType *STy = GTI.getStructTypeOrNull()) {
Offset += DL.getStructLayout(STy)->getElementOffset(OpC->getZExtValue());
continue;
}
@@ -489,7 +489,8 @@ static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI,
// It will lift the store and its argument + that anything that
// may alias with these.
// The method returns true if it was successful.
-static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P) {
+static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P,
+ const LoadInst *LI) {
// If the store alias this position, early bail out.
MemoryLocation StoreLoc = MemoryLocation::get(SI);
if (AA.getModRefInfo(P, StoreLoc) != MRI_NoModRef)
@@ -506,12 +507,13 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P) {
SmallVector<Instruction*, 8> ToLift;
// Memory locations of lifted instructions.
- SmallVector<MemoryLocation, 8> MemLocs;
- MemLocs.push_back(StoreLoc);
+ SmallVector<MemoryLocation, 8> MemLocs{StoreLoc};
// Lifted callsites.
SmallVector<ImmutableCallSite, 8> CallSites;
+ const MemoryLocation LoadLoc = MemoryLocation::get(LI);
+
for (auto I = --SI->getIterator(), E = P->getIterator(); I != E; --I) {
auto *C = &*I;
@@ -521,23 +523,25 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P) {
if (Args.erase(C))
NeedLift = true;
else if (MayAlias) {
- NeedLift = std::any_of(MemLocs.begin(), MemLocs.end(),
- [C, &AA](const MemoryLocation &ML) {
- return AA.getModRefInfo(C, ML);
- });
+ NeedLift = any_of(MemLocs, [C, &AA](const MemoryLocation &ML) {
+ return AA.getModRefInfo(C, ML);
+ });
if (!NeedLift)
- NeedLift = std::any_of(CallSites.begin(), CallSites.end(),
- [C, &AA](const ImmutableCallSite &CS) {
- return AA.getModRefInfo(C, CS);
- });
+ NeedLift = any_of(CallSites, [C, &AA](const ImmutableCallSite &CS) {
+ return AA.getModRefInfo(C, CS);
+ });
}
if (!NeedLift)
continue;
if (MayAlias) {
- if (auto CS = ImmutableCallSite(C)) {
+ // Since LI is implicitly moved downwards past the lifted instructions,
+ // none of them may modify its source.
+ if (AA.getModRefInfo(C, LoadLoc) & MRI_Mod)
+ return false;
+ else if (auto CS = ImmutableCallSite(C)) {
// If we can't lift this before P, it's game over.
if (AA.getModRefInfo(P, CS) != MRI_NoModRef)
return false;
@@ -612,7 +616,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
// position if nothing alias the store memory after this and the store
// destination is not in the range.
if (P && P != SI) {
- if (!moveUp(AA, SI, P))
+ if (!moveUp(AA, SI, P, LI))
P = nullptr;
}
@@ -1082,10 +1086,10 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
DestSize = Builder.CreateZExt(DestSize, SrcSize->getType());
}
- Value *MemsetLen =
- Builder.CreateSelect(Builder.CreateICmpULE(DestSize, SrcSize),
- ConstantInt::getNullValue(DestSize->getType()),
- Builder.CreateSub(DestSize, SrcSize));
+ Value *Ule = Builder.CreateICmpULE(DestSize, SrcSize);
+ Value *SizeDiff = Builder.CreateSub(DestSize, SrcSize);
+ Value *MemsetLen = Builder.CreateSelect(
+ Ule, ConstantInt::getNullValue(DestSize->getType()), SizeDiff);
Builder.CreateMemSet(Builder.CreateGEP(Dest, SrcSize), MemSet->getOperand(1),
MemsetLen, Align);
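Worked example for the rewritten memset length (illustrative sizes; DestSize and SrcSize are the memset and memcpy lengths per the surrounding function, which is only partly visible in this hunk): if the original memset covered DestSize = 32 bytes and the memcpy then overwrites SrcSize = 20 of them, MemsetLen becomes 32 - 20 = 12 and the new memset starts at Dest + 20; if DestSize <= SrcSize, the select yields 0 and the trailing memset is a no-op.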
@@ -1110,8 +1114,11 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
/// The \p MemCpy must have a Constant length.
bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
MemSetInst *MemSet) {
- // This only makes sense on memcpy(..., memset(...), ...).
- if (MemSet->getRawDest() != MemCpy->getRawSource())
+ AliasAnalysis &AA = LookupAliasAnalysis();
+
+ // Make sure this is memcpy(..., memset(...), ...), that is, we are memsetting
+ // and memcpying from the same address. Otherwise it is hard to reason about.
+ if (!AA.isMustAlias(MemSet->getRawDest(), MemCpy->getRawSource()))
return false;
ConstantInt *CopySize = cast<ConstantInt>(MemCpy->getLength());
diff --git a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
index 30261b7..6a64c6b 100644
--- a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -260,7 +260,7 @@ void MergedLoadStoreMotion::hoistInstruction(BasicBlock *BB,
assert(HoistCand->getParent() != BB);
// Intersect optional metadata.
- HoistCand->intersectOptionalDataWith(ElseInst);
+ HoistCand->andIRFlags(ElseInst);
HoistCand->dropUnknownNonDebugMetadata();
// Prepend point for instruction insert
@@ -434,7 +434,7 @@ bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0,
// Hoist the instruction.
BasicBlock::iterator InsertPt = BB->getFirstInsertionPt();
// Intersect optional metadata.
- S0->intersectOptionalDataWith(S1);
+ S0->andIRFlags(S1);
S0->dropUnknownNonDebugMetadata();
// Create the new store to be inserted at the join point.
@@ -563,7 +563,6 @@ public:
}
private:
- // This transformation requires dominator postdominator info
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<AAResultsWrapperPass>();
@@ -590,7 +589,7 @@ INITIALIZE_PASS_END(MergedLoadStoreMotionLegacyPass, "mldst-motion",
"MergedLoadStoreMotion", false, false)
PreservedAnalyses
-MergedLoadStoreMotionPass::run(Function &F, AnalysisManager<Function> &AM) {
+MergedLoadStoreMotionPass::run(Function &F, FunctionAnalysisManager &AM) {
MergedLoadStoreMotion Impl;
auto *MD = AM.getCachedResult<MemoryDependenceAnalysis>(F);
auto &AA = AM.getResult<AAManager>(F);
diff --git a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
index ed754fa..0a3bf7b 100644
--- a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -76,12 +76,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Scalar/NaryReassociate.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Dominators.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
@@ -94,16 +90,15 @@ using namespace PatternMatch;
#define DEBUG_TYPE "nary-reassociate"
namespace {
-class NaryReassociate : public FunctionPass {
+class NaryReassociateLegacyPass : public FunctionPass {
public:
static char ID;
- NaryReassociate(): FunctionPass(ID) {
- initializeNaryReassociatePass(*PassRegistry::getPassRegistry());
+ NaryReassociateLegacyPass() : FunctionPass(ID) {
+ initializeNaryReassociateLegacyPassPass(*PassRegistry::getPassRegistry());
}
bool doInitialization(Module &M) override {
- DL = &M.getDataLayout();
return false;
}
bool runOnFunction(Function &F) override;
@@ -121,101 +116,73 @@ public:
}
private:
- // Runs only one iteration of the dominator-based algorithm. See the header
- // comments for why we need multiple iterations.
- bool doOneIteration(Function &F);
-
- // Reassociates I for better CSE.
- Instruction *tryReassociate(Instruction *I);
-
- // Reassociate GEP for better CSE.
- Instruction *tryReassociateGEP(GetElementPtrInst *GEP);
- // Try splitting GEP at the I-th index and see whether either part can be
- // CSE'ed. This is a helper function for tryReassociateGEP.
- //
- // \p IndexedType The element type indexed by GEP's I-th index. This is
- // equivalent to
- // GEP->getIndexedType(GEP->getPointerOperand(), 0-th index,
- // ..., i-th index).
- GetElementPtrInst *tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
- unsigned I, Type *IndexedType);
- // Given GEP's I-th index = LHS + RHS, see whether &Base[..][LHS][..] or
- // &Base[..][RHS][..] can be CSE'ed and rewrite GEP accordingly.
- GetElementPtrInst *tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
- unsigned I, Value *LHS,
- Value *RHS, Type *IndexedType);
-
- // Reassociate binary operators for better CSE.
- Instruction *tryReassociateBinaryOp(BinaryOperator *I);
-
- // A helper function for tryReassociateBinaryOp. LHS and RHS are explicitly
- // passed.
- Instruction *tryReassociateBinaryOp(Value *LHS, Value *RHS,
- BinaryOperator *I);
- // Rewrites I to (LHS op RHS) if LHS is computed already.
- Instruction *tryReassociatedBinaryOp(const SCEV *LHS, Value *RHS,
- BinaryOperator *I);
-
- // Tries to match Op1 and Op2 by using V.
- bool matchTernaryOp(BinaryOperator *I, Value *V, Value *&Op1, Value *&Op2);
-
- // Gets SCEV for (LHS op RHS).
- const SCEV *getBinarySCEV(BinaryOperator *I, const SCEV *LHS,
- const SCEV *RHS);
-
- // Returns the closest dominator of \c Dominatee that computes
- // \c CandidateExpr. Returns null if not found.
- Instruction *findClosestMatchingDominator(const SCEV *CandidateExpr,
- Instruction *Dominatee);
- // GetElementPtrInst implicitly sign-extends an index if the index is shorter
- // than the pointer size. This function returns whether Index is shorter than
- // GEP's pointer size, i.e., whether Index needs to be sign-extended in order
- // to be an index of GEP.
- bool requiresSignExtension(Value *Index, GetElementPtrInst *GEP);
-
- AssumptionCache *AC;
- const DataLayout *DL;
- DominatorTree *DT;
- ScalarEvolution *SE;
- TargetLibraryInfo *TLI;
- TargetTransformInfo *TTI;
- // A lookup table quickly telling which instructions compute the given SCEV.
- // Note that there can be multiple instructions at different locations
- // computing to the same SCEV, so we map a SCEV to an instruction list. For
- // example,
- //
- // if (p1)
- // foo(a + b);
- // if (p2)
- // bar(a + b);
- DenseMap<const SCEV *, SmallVector<WeakVH, 2>> SeenExprs;
+ NaryReassociatePass Impl;
};
} // anonymous namespace
-char NaryReassociate::ID = 0;
-INITIALIZE_PASS_BEGIN(NaryReassociate, "nary-reassociate", "Nary reassociation",
- false, false)
+char NaryReassociateLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(NaryReassociateLegacyPass, "nary-reassociate",
+ "Nary reassociation", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(NaryReassociate, "nary-reassociate", "Nary reassociation",
- false, false)
+INITIALIZE_PASS_END(NaryReassociateLegacyPass, "nary-reassociate",
+ "Nary reassociation", false, false)
FunctionPass *llvm::createNaryReassociatePass() {
- return new NaryReassociate();
+ return new NaryReassociateLegacyPass();
}
-bool NaryReassociate::runOnFunction(Function &F) {
+bool NaryReassociateLegacyPass::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
- AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+ return Impl.runImpl(F, AC, DT, SE, TLI, TTI);
+}
+
+PreservedAnalyses NaryReassociatePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto *AC = &AM.getResult<AssumptionAnalysis>(F);
+ auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
+ auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
+
+ bool Changed = runImpl(F, AC, DT, SE, TLI, TTI);
+
+ // FIXME: We need to invalidate this to avoid PR28400. Is there a better
+ // solution?
+ AM.invalidate<ScalarEvolutionAnalysis>(F);
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ // FIXME: This should also 'preserve the CFG'.
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+ PA.preserve<TargetLibraryAnalysis>();
+ return PA;
+}
+
+bool NaryReassociatePass::runImpl(Function &F, AssumptionCache *AC_,
+ DominatorTree *DT_, ScalarEvolution *SE_,
+ TargetLibraryInfo *TLI_,
+ TargetTransformInfo *TTI_) {
+ AC = AC_;
+ DT = DT_;
+ SE = SE_;
+ TLI = TLI_;
+ TTI = TTI_;
+ DL = &F.getParent()->getDataLayout();
bool Changed = false, ChangedInThisIteration;
do {
@@ -237,13 +204,13 @@ static bool isPotentiallyNaryReassociable(Instruction *I) {
}
}
-bool NaryReassociate::doOneIteration(Function &F) {
+bool NaryReassociatePass::doOneIteration(Function &F) {
bool Changed = false;
SeenExprs.clear();
- // Process the basic blocks in pre-order of the dominator tree. This order
- // ensures that all bases of a candidate are in Candidates when we process it.
- for (auto Node = GraphTraits<DominatorTree *>::nodes_begin(DT);
- Node != GraphTraits<DominatorTree *>::nodes_end(DT); ++Node) {
+ // Process the basic blocks in a depth first traversal of the dominator
+ // tree. This order ensures that all bases of a candidate are in Candidates
+ // when we process it.
+ for (const auto Node : depth_first(DT)) {
BasicBlock *BB = Node->getBlock();
for (auto I = BB->begin(); I != BB->end(); ++I) {
if (SE->isSCEVable(I->getType()) && isPotentiallyNaryReassociable(&*I)) {
@@ -287,7 +254,7 @@ bool NaryReassociate::doOneIteration(Function &F) {
return Changed;
}
-Instruction *NaryReassociate::tryReassociate(Instruction *I) {
+Instruction *NaryReassociatePass::tryReassociate(Instruction *I) {
switch (I->getOpcode()) {
case Instruction::Add:
case Instruction::Mul:
@@ -308,15 +275,16 @@ static bool isGEPFoldable(GetElementPtrInst *GEP,
Indices) == TargetTransformInfo::TCC_Free;
}
-Instruction *NaryReassociate::tryReassociateGEP(GetElementPtrInst *GEP) {
+Instruction *NaryReassociatePass::tryReassociateGEP(GetElementPtrInst *GEP) {
// Not worth reassociating GEP if it is foldable.
if (isGEPFoldable(GEP, TTI))
return nullptr;
gep_type_iterator GTI = gep_type_begin(*GEP);
- for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
- if (isa<SequentialType>(*GTI++)) {
- if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I - 1, *GTI)) {
+ for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+ if (GTI.isSequential()) {
+ if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I - 1,
+ GTI.getIndexedType())) {
return NewGEP;
}
}
@@ -324,16 +292,16 @@ Instruction *NaryReassociate::tryReassociateGEP(GetElementPtrInst *GEP) {
return nullptr;
}
-bool NaryReassociate::requiresSignExtension(Value *Index,
- GetElementPtrInst *GEP) {
+bool NaryReassociatePass::requiresSignExtension(Value *Index,
+ GetElementPtrInst *GEP) {
unsigned PointerSizeInBits =
DL->getPointerSizeInBits(GEP->getType()->getPointerAddressSpace());
return cast<IntegerType>(Index->getType())->getBitWidth() < PointerSizeInBits;
}
GetElementPtrInst *
-NaryReassociate::tryReassociateGEPAtIndex(GetElementPtrInst *GEP, unsigned I,
- Type *IndexedType) {
+NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
+ unsigned I, Type *IndexedType) {
Value *IndexToSplit = GEP->getOperand(I + 1);
if (SExtInst *SExt = dyn_cast<SExtInst>(IndexToSplit)) {
IndexToSplit = SExt->getOperand(0);
@@ -366,9 +334,10 @@ NaryReassociate::tryReassociateGEPAtIndex(GetElementPtrInst *GEP, unsigned I,
return nullptr;
}
-GetElementPtrInst *NaryReassociate::tryReassociateGEPAtIndex(
- GetElementPtrInst *GEP, unsigned I, Value *LHS, Value *RHS,
- Type *IndexedType) {
+GetElementPtrInst *
+NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
+ unsigned I, Value *LHS,
+ Value *RHS, Type *IndexedType) {
// Look for GEP's closest dominator that has the same SCEV as GEP except that
// the I-th index is replaced with LHS.
SmallVector<const SCEV *, 4> IndexExprs;
@@ -386,9 +355,8 @@ GetElementPtrInst *NaryReassociate::tryReassociateGEPAtIndex(
IndexExprs[I] =
SE->getZeroExtendExpr(IndexExprs[I], GEP->getOperand(I)->getType());
}
- const SCEV *CandidateExpr = SE->getGEPExpr(
- GEP->getSourceElementType(), SE->getSCEV(GEP->getPointerOperand()),
- IndexExprs, GEP->isInBounds());
+ const SCEV *CandidateExpr = SE->getGEPExpr(cast<GEPOperator>(GEP),
+ IndexExprs);
Value *Candidate = findClosestMatchingDominator(CandidateExpr, GEP);
if (Candidate == nullptr)
@@ -437,7 +405,7 @@ GetElementPtrInst *NaryReassociate::tryReassociateGEPAtIndex(
return NewGEP;
}
-Instruction *NaryReassociate::tryReassociateBinaryOp(BinaryOperator *I) {
+Instruction *NaryReassociatePass::tryReassociateBinaryOp(BinaryOperator *I) {
Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
if (auto *NewI = tryReassociateBinaryOp(LHS, RHS, I))
return NewI;
@@ -446,8 +414,8 @@ Instruction *NaryReassociate::tryReassociateBinaryOp(BinaryOperator *I) {
return nullptr;
}
-Instruction *NaryReassociate::tryReassociateBinaryOp(Value *LHS, Value *RHS,
- BinaryOperator *I) {
+Instruction *NaryReassociatePass::tryReassociateBinaryOp(Value *LHS, Value *RHS,
+ BinaryOperator *I) {
Value *A = nullptr, *B = nullptr;
// To be conservative, we reassociate I only when it is the only user of (A op
// B).
@@ -470,9 +438,9 @@ Instruction *NaryReassociate::tryReassociateBinaryOp(Value *LHS, Value *RHS,
return nullptr;
}
-Instruction *NaryReassociate::tryReassociatedBinaryOp(const SCEV *LHSExpr,
- Value *RHS,
- BinaryOperator *I) {
+Instruction *NaryReassociatePass::tryReassociatedBinaryOp(const SCEV *LHSExpr,
+ Value *RHS,
+ BinaryOperator *I) {
// Look for the closest dominator LHS of I that computes LHSExpr, and replace
// I with LHS op RHS.
auto *LHS = findClosestMatchingDominator(LHSExpr, I);
@@ -494,8 +462,8 @@ Instruction *NaryReassociate::tryReassociatedBinaryOp(const SCEV *LHSExpr,
return NewI;
}
-bool NaryReassociate::matchTernaryOp(BinaryOperator *I, Value *V, Value *&Op1,
- Value *&Op2) {
+bool NaryReassociatePass::matchTernaryOp(BinaryOperator *I, Value *V,
+ Value *&Op1, Value *&Op2) {
switch (I->getOpcode()) {
case Instruction::Add:
return match(V, m_Add(m_Value(Op1), m_Value(Op2)));
@@ -507,8 +475,9 @@ bool NaryReassociate::matchTernaryOp(BinaryOperator *I, Value *V, Value *&Op1,
return false;
}
-const SCEV *NaryReassociate::getBinarySCEV(BinaryOperator *I, const SCEV *LHS,
- const SCEV *RHS) {
+const SCEV *NaryReassociatePass::getBinarySCEV(BinaryOperator *I,
+ const SCEV *LHS,
+ const SCEV *RHS) {
switch (I->getOpcode()) {
case Instruction::Add:
return SE->getAddExpr(LHS, RHS);
@@ -521,8 +490,8 @@ const SCEV *NaryReassociate::getBinarySCEV(BinaryOperator *I, const SCEV *LHS,
}
Instruction *
-NaryReassociate::findClosestMatchingDominator(const SCEV *CandidateExpr,
- Instruction *Dominatee) {
+NaryReassociatePass::findClosestMatchingDominator(const SCEV *CandidateExpr,
+ Instruction *Dominatee) {
auto Pos = SeenExprs.find(CandidateExpr);
if (Pos == SeenExprs.end())
return nullptr;
diff --git a/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp b/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp
new file mode 100644
index 0000000..57e6e3d
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -0,0 +1,2257 @@
+//===---- NewGVN.cpp - Global Value Numbering Pass --------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements LLVM's new Global Value Numbering pass.
+/// GVN partitions values computed by a function into congruence classes.
+/// Values ending up in the same congruence class are guaranteed to be the same
+/// for every execution of the program. In that respect, congruency is a
+/// compile-time approximation of equivalence of values at runtime.
+/// The algorithm implemented here uses a sparse formulation and it's based
+/// on the ideas described in the paper:
+/// "A Sparse Algorithm for Predicated Global Value Numbering" from
+/// Karthik Gargi.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/NewGVN.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/CFGPrinter.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/PHITransAddr.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/PredIteratorCache.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVNExpression.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/MemorySSA.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include <unordered_map>
+#include <utility>
+#include <vector>
+using namespace llvm;
+using namespace PatternMatch;
+using namespace llvm::GVNExpression;
+
+#define DEBUG_TYPE "newgvn"
+
+STATISTIC(NumGVNInstrDeleted, "Number of instructions deleted");
+STATISTIC(NumGVNBlocksDeleted, "Number of blocks deleted");
+STATISTIC(NumGVNOpsSimplified, "Number of Expressions simplified");
+STATISTIC(NumGVNPhisAllSame, "Number of PHIs whose arguments are all the same");
+STATISTIC(NumGVNMaxIterations,
+ "Maximum Number of iterations it took to converge GVN");
+STATISTIC(NumGVNLeaderChanges, "Number of leader changes");
+STATISTIC(NumGVNSortedLeaderChanges, "Number of sorted leader changes");
+STATISTIC(NumGVNAvoidedSortedLeaderChanges,
+ "Number of avoided sorted leader changes");
+STATISTIC(NumGVNNotMostDominatingLeader,
+ "Number of times a member dominated it's new classes' leader");
+
+//===----------------------------------------------------------------------===//
+// GVN Pass
+//===----------------------------------------------------------------------===//
+
+// Anchor methods.
+namespace llvm {
+namespace GVNExpression {
+Expression::~Expression() = default;
+BasicExpression::~BasicExpression() = default;
+CallExpression::~CallExpression() = default;
+LoadExpression::~LoadExpression() = default;
+StoreExpression::~StoreExpression() = default;
+AggregateValueExpression::~AggregateValueExpression() = default;
+PHIExpression::~PHIExpression() = default;
+}
+}
+
+// Congruence classes represent the set of expressions/instructions
+// that are all the same *during some scope in the function*.
+// That is, because of the way we perform equality propagation, and
+// because of memory value numbering, it is not correct to assume
+// you can willy-nilly replace any member with any other at any
+// point in the function.
+//
+// For any Value in the Member set, it is valid to replace any dominated member
+// with that Value.
+//
+// Every congruence class has a leader, and the leader is used to
+// symbolize instructions in a canonical way (IE every operand of an
+// instruction that is a member of the same congruence class will
+// always be replaced with leader during symbolization).
+// To simplify symbolization, we keep the leader as a constant if the class can
+// be proved to be a constant value.
+// Otherwise, the leader is a randomly chosen member of the value set; it does
+// not matter which one is chosen.
+// Each congruence class also has a defining expression,
+// though the expression may be null. If it exists, it can be used for forward
+// propagation and reassociation of values.
+//
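+// As an illustrative (hypothetical) example, given IR such as
+//   %a = add i32 %x, %y
+//   %b = add i32 %x, %y
+// %a and %b end up in the same congruence class, and a use of %b that is
+// dominated by %a may later be rewritten to use the class leader.
+//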
+struct CongruenceClass {
+ using MemberSet = SmallPtrSet<Value *, 4>;
+ unsigned ID;
+ // Representative leader.
+ Value *RepLeader = nullptr;
+ // Defining Expression.
+ const Expression *DefiningExpr = nullptr;
+ // Actual members of this class.
+ MemberSet Members;
+
+ // True if this class has no members left. This is mainly used for assertion
+ // purposes, and for skipping empty classes.
+ bool Dead = false;
+
+ // Number of stores in this congruence class.
+ // This is used so we can detect store equivalence changes properly.
+ int StoreCount = 0;
+
+ // The most dominating leader after our current leader, cached here because
+ // the member set is not sorted and is expensive to keep sorted all the time.
+ std::pair<Value *, unsigned int> NextLeader = {nullptr, ~0U};
+
+ explicit CongruenceClass(unsigned ID) : ID(ID) {}
+ CongruenceClass(unsigned ID, Value *Leader, const Expression *E)
+ : ID(ID), RepLeader(Leader), DefiningExpr(E) {}
+};
+
+namespace llvm {
+template <> struct DenseMapInfo<const Expression *> {
+ static const Expression *getEmptyKey() {
+ auto Val = static_cast<uintptr_t>(-1);
+ Val <<= PointerLikeTypeTraits<const Expression *>::NumLowBitsAvailable;
+ return reinterpret_cast<const Expression *>(Val);
+ }
+ static const Expression *getTombstoneKey() {
+ auto Val = static_cast<uintptr_t>(~1U);
+ Val <<= PointerLikeTypeTraits<const Expression *>::NumLowBitsAvailable;
+ return reinterpret_cast<const Expression *>(Val);
+ }
+ static unsigned getHashValue(const Expression *V) {
+ return static_cast<unsigned>(V->getHashValue());
+ }
+ static bool isEqual(const Expression *LHS, const Expression *RHS) {
+ if (LHS == RHS)
+ return true;
+ if (LHS == getTombstoneKey() || RHS == getTombstoneKey() ||
+ LHS == getEmptyKey() || RHS == getEmptyKey())
+ return false;
+ return *LHS == *RHS;
+ }
+};
+} // end namespace llvm
+
+class NewGVN : public FunctionPass {
+ DominatorTree *DT;
+ const DataLayout *DL;
+ const TargetLibraryInfo *TLI;
+ AssumptionCache *AC;
+ AliasAnalysis *AA;
+ MemorySSA *MSSA;
+ MemorySSAWalker *MSSAWalker;
+ BumpPtrAllocator ExpressionAllocator;
+ ArrayRecycler<Value *> ArgRecycler;
+
+ // Congruence class info.
+ CongruenceClass *InitialClass;
+ std::vector<CongruenceClass *> CongruenceClasses;
+ unsigned NextCongruenceNum;
+
+ // Value Mappings.
+ DenseMap<Value *, CongruenceClass *> ValueToClass;
+ DenseMap<Value *, const Expression *> ValueToExpression;
+
+ // A table storing which memorydefs/phis represent a memory state provably
+ // equivalent to another memory state.
+ // We could use the congruence class machinery, but the MemoryAccess's are
+ // abstract memory states, so they can only ever be equivalent to each other,
+ // and not to constants, etc.
+ DenseMap<const MemoryAccess *, MemoryAccess *> MemoryAccessEquiv;
+
+ // Expression to class mapping.
+ using ExpressionClassMap = DenseMap<const Expression *, CongruenceClass *>;
+ ExpressionClassMap ExpressionToClass;
+
+ // Which values have changed as a result of leader changes.
+ SmallPtrSet<Value *, 8> LeaderChanges;
+
+ // Reachability info.
+ using BlockEdge = BasicBlockEdge;
+ DenseSet<BlockEdge> ReachableEdges;
+ SmallPtrSet<const BasicBlock *, 8> ReachableBlocks;
+
+ // This is a bitvector because, on larger functions, we may have
+ // thousands of touched instructions at once (entire blocks,
+ // instructions with hundreds of uses, etc). Even with optimization
+ // for when we mark whole blocks as touched, when this was a
+ // SmallPtrSet or DenseSet, for some functions, we spent >20% of all
+ // the time in GVN just managing this list. The bitvector, on the
+ // other hand, efficiently supports test/set/clear of both
+ // individual bits and ranges, as well as "find next element". This
+ // enables us to use it as a worklist with essentially 0 cost.
+ BitVector TouchedInstructions;
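+ // A rough sketch of the intended worklist usage (illustrative only, not the
+ // exact iteration loop of this pass):
+ //   for (int Idx = TouchedInstructions.find_first(); Idx != -1;
+ //        Idx = TouchedInstructions.find_next(Idx)) {
+ //     TouchedInstructions.reset(Idx);
+ //     // ... value number DFSToInstr[Idx] ...
+ //   }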
+
+ DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> BlockInstRange;
+ DenseMap<const DomTreeNode *, std::pair<unsigned, unsigned>>
+ DominatedInstRange;
+
+#ifndef NDEBUG
+ // Debugging for how many times each block and instruction got processed.
+ DenseMap<const Value *, unsigned> ProcessedCount;
+#endif
+
+ // DFS info.
+ DenseMap<const BasicBlock *, std::pair<int, int>> DFSDomMap;
+ DenseMap<const Value *, unsigned> InstrDFS;
+ SmallVector<Value *, 32> DFSToInstr;
+
+ // Deletion info.
+ SmallPtrSet<Instruction *, 8> InstructionsToErase;
+
+public:
+ static char ID; // Pass identification, replacement for typeid.
+ NewGVN() : FunctionPass(ID) {
+ initializeNewGVNPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+ bool runGVN(Function &F, DominatorTree *DT, AssumptionCache *AC,
+ TargetLibraryInfo *TLI, AliasAnalysis *AA, MemorySSA *MSSA);
+
+private:
+ // This transformation requires dominator tree info.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+
+ // Expression handling.
+ const Expression *createExpression(Instruction *, const BasicBlock *);
+ const Expression *createBinaryExpression(unsigned, Type *, Value *, Value *,
+ const BasicBlock *);
+ PHIExpression *createPHIExpression(Instruction *);
+ const VariableExpression *createVariableExpression(Value *);
+ const ConstantExpression *createConstantExpression(Constant *);
+ const Expression *createVariableOrConstant(Value *V, const BasicBlock *B);
+ const UnknownExpression *createUnknownExpression(Instruction *);
+ const StoreExpression *createStoreExpression(StoreInst *, MemoryAccess *,
+ const BasicBlock *);
+ LoadExpression *createLoadExpression(Type *, Value *, LoadInst *,
+ MemoryAccess *, const BasicBlock *);
+
+ const CallExpression *createCallExpression(CallInst *, MemoryAccess *,
+ const BasicBlock *);
+ const AggregateValueExpression *
+ createAggregateValueExpression(Instruction *, const BasicBlock *);
+ bool setBasicExpressionInfo(Instruction *, BasicExpression *,
+ const BasicBlock *);
+
+ // Congruence class handling.
+ CongruenceClass *createCongruenceClass(Value *Leader, const Expression *E) {
+ auto *result = new CongruenceClass(NextCongruenceNum++, Leader, E);
+ CongruenceClasses.emplace_back(result);
+ return result;
+ }
+
+ CongruenceClass *createSingletonCongruenceClass(Value *Member) {
+ CongruenceClass *CClass = createCongruenceClass(Member, nullptr);
+ CClass->Members.insert(Member);
+ ValueToClass[Member] = CClass;
+ return CClass;
+ }
+ void initializeCongruenceClasses(Function &F);
+
+ // Value number an Instruction or MemoryPhi.
+ void valueNumberMemoryPhi(MemoryPhi *);
+ void valueNumberInstruction(Instruction *);
+
+ // Symbolic evaluation.
+ const Expression *checkSimplificationResults(Expression *, Instruction *,
+ Value *);
+ const Expression *performSymbolicEvaluation(Value *, const BasicBlock *);
+ const Expression *performSymbolicLoadEvaluation(Instruction *,
+ const BasicBlock *);
+ const Expression *performSymbolicStoreEvaluation(Instruction *,
+ const BasicBlock *);
+ const Expression *performSymbolicCallEvaluation(Instruction *,
+ const BasicBlock *);
+ const Expression *performSymbolicPHIEvaluation(Instruction *,
+ const BasicBlock *);
+ bool setMemoryAccessEquivTo(MemoryAccess *From, MemoryAccess *To);
+ const Expression *performSymbolicAggrValueEvaluation(Instruction *,
+ const BasicBlock *);
+
+ // Congruence finding.
+ // Templated to allow them to work both on BB's and BB-edges.
+ template <class T>
+ Value *lookupOperandLeader(Value *, const User *, const T &) const;
+ void performCongruenceFinding(Instruction *, const Expression *);
+ void moveValueToNewCongruenceClass(Instruction *, CongruenceClass *,
+ CongruenceClass *);
+ // Reachability handling.
+ void updateReachableEdge(BasicBlock *, BasicBlock *);
+ void processOutgoingEdges(TerminatorInst *, BasicBlock *);
+ bool isOnlyReachableViaThisEdge(const BasicBlockEdge &) const;
+ Value *findConditionEquivalence(Value *, BasicBlock *) const;
+ MemoryAccess *lookupMemoryAccessEquiv(MemoryAccess *) const;
+
+ // Elimination.
+ struct ValueDFS;
+ void convertDenseToDFSOrdered(CongruenceClass::MemberSet &,
+ SmallVectorImpl<ValueDFS> &);
+
+ bool eliminateInstructions(Function &);
+ void replaceInstruction(Instruction *, Value *);
+ void markInstructionForDeletion(Instruction *);
+ void deleteInstructionsInBlock(BasicBlock *);
+
+ // New instruction creation.
+ void handleNewInstruction(Instruction *){};
+
+ // Various instruction touch utilities
+ void markUsersTouched(Value *);
+ void markMemoryUsersTouched(MemoryAccess *);
+ void markLeaderChangeTouched(CongruenceClass *CC);
+
+ // Utilities.
+ void cleanupTables();
+ std::pair<unsigned, unsigned> assignDFSNumbers(BasicBlock *, unsigned);
+ void updateProcessedCount(Value *V);
+ void verifyMemoryCongruency() const;
+ bool singleReachablePHIPath(const MemoryAccess *, const MemoryAccess *) const;
+};
+
+char NewGVN::ID = 0;
+
+// createGVNPass - The public interface to this file.
+FunctionPass *llvm::createNewGVNPass() { return new NewGVN(); }
+
+template <typename T>
+static bool equalsLoadStoreHelper(const T &LHS, const Expression &RHS) {
+ if ((!isa<LoadExpression>(RHS) && !isa<StoreExpression>(RHS)) ||
+ !LHS.BasicExpression::equals(RHS)) {
+ return false;
+ } else if (const auto *L = dyn_cast<LoadExpression>(&RHS)) {
+ if (LHS.getDefiningAccess() != L->getDefiningAccess())
+ return false;
+ } else if (const auto *S = dyn_cast<StoreExpression>(&RHS)) {
+ if (LHS.getDefiningAccess() != S->getDefiningAccess())
+ return false;
+ }
+ return true;
+}
+
+bool LoadExpression::equals(const Expression &Other) const {
+ return equalsLoadStoreHelper(*this, Other);
+}
+
+bool StoreExpression::equals(const Expression &Other) const {
+ return equalsLoadStoreHelper(*this, Other);
+}
+
+#ifndef NDEBUG
+static std::string getBlockName(const BasicBlock *B) {
+ return DOTGraphTraits<const Function *>::getSimpleNodeLabel(B, nullptr);
+}
+#endif
+
+INITIALIZE_PASS_BEGIN(NewGVN, "newgvn", "Global Value Numbering", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_END(NewGVN, "newgvn", "Global Value Numbering", false, false)
+
+PHIExpression *NewGVN::createPHIExpression(Instruction *I) {
+ BasicBlock *PHIBlock = I->getParent();
+ auto *PN = cast<PHINode>(I);
+ auto *E =
+ new (ExpressionAllocator) PHIExpression(PN->getNumOperands(), PHIBlock);
+
+ E->allocateOperands(ArgRecycler, ExpressionAllocator);
+ E->setType(I->getType());
+ E->setOpcode(I->getOpcode());
+
+ auto ReachablePhiArg = [&](const Use &U) {
+ return ReachableBlocks.count(PN->getIncomingBlock(U));
+ };
+
+ // Filter out unreachable operands
+ auto Filtered = make_filter_range(PN->operands(), ReachablePhiArg);
+
+ std::transform(Filtered.begin(), Filtered.end(), op_inserter(E),
+ [&](const Use &U) -> Value * {
+ // Don't try to transform self-defined phis.
+ if (U == PN)
+ return PN;
+ const BasicBlockEdge BBE(PN->getIncomingBlock(U), PHIBlock);
+ return lookupOperandLeader(U, I, BBE);
+ });
+ return E;
+}
+
+// Set basic expression info (Arguments, type, opcode) for Expression
+// E from Instruction I in block B.
+bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E,
+ const BasicBlock *B) {
+ bool AllConstant = true;
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
+ E->setType(GEP->getSourceElementType());
+ else
+ E->setType(I->getType());
+ E->setOpcode(I->getOpcode());
+ E->allocateOperands(ArgRecycler, ExpressionAllocator);
+
+ // Transform the operand array into an operand leader array, and keep track of
+ // whether all members are constant.
+ std::transform(I->op_begin(), I->op_end(), op_inserter(E), [&](Value *O) {
+ auto Operand = lookupOperandLeader(O, I, B);
+ AllConstant &= isa<Constant>(Operand);
+ return Operand;
+ });
+
+ return AllConstant;
+}
+
+const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T,
+ Value *Arg1, Value *Arg2,
+ const BasicBlock *B) {
+ auto *E = new (ExpressionAllocator) BasicExpression(2);
+
+ E->setType(T);
+ E->setOpcode(Opcode);
+ E->allocateOperands(ArgRecycler, ExpressionAllocator);
+ if (Instruction::isCommutative(Opcode)) {
+ // Ensure that commutative instructions that only differ by a permutation
+ // of their operands get the same value number by sorting the operand value
+ // numbers. Since all commutative instructions have two operands it is more
+ // efficient to sort by hand rather than using, say, std::sort.
+ if (Arg1 > Arg2)
+ std::swap(Arg1, Arg2);
+ }
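+ // For example (illustrative): "add %a, %b" and "add %b, %a" receive the same
+ // canonical operand order here, so they yield equal expressions and hash to
+ // the same value.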
+ E->op_push_back(lookupOperandLeader(Arg1, nullptr, B));
+ E->op_push_back(lookupOperandLeader(Arg2, nullptr, B));
+
+ Value *V = SimplifyBinOp(Opcode, E->getOperand(0), E->getOperand(1), *DL, TLI,
+ DT, AC);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, nullptr, V))
+ return SimplifiedE;
+ return E;
+}
+
+// Take a Value returned by simplification of Expression E/Instruction
+// I, and see if it resulted in a simpler expression. If so, return
+// that expression.
+// TODO: Once finished, this should not take an Instruction, we only
+// use it for printing.
+const Expression *NewGVN::checkSimplificationResults(Expression *E,
+ Instruction *I, Value *V) {
+ if (!V)
+ return nullptr;
+ if (auto *C = dyn_cast<Constant>(V)) {
+ if (I)
+ DEBUG(dbgs() << "Simplified " << *I << " to "
+ << " constant " << *C << "\n");
+ NumGVNOpsSimplified++;
+ assert(isa<BasicExpression>(E) &&
+ "We should always have had a basic expression here");
+
+ cast<BasicExpression>(E)->deallocateOperands(ArgRecycler);
+ ExpressionAllocator.Deallocate(E);
+ return createConstantExpression(C);
+ } else if (isa<Argument>(V) || isa<GlobalVariable>(V)) {
+ if (I)
+ DEBUG(dbgs() << "Simplified " << *I << " to "
+ << " variable " << *V << "\n");
+ cast<BasicExpression>(E)->deallocateOperands(ArgRecycler);
+ ExpressionAllocator.Deallocate(E);
+ return createVariableExpression(V);
+ }
+
+ CongruenceClass *CC = ValueToClass.lookup(V);
+ if (CC && CC->DefiningExpr) {
+ if (I)
+ DEBUG(dbgs() << "Simplified " << *I << " to "
+ << " expression " << *V << "\n");
+ NumGVNOpsSimplified++;
+ assert(isa<BasicExpression>(E) &&
+ "We should always have had a basic expression here");
+ cast<BasicExpression>(E)->deallocateOperands(ArgRecycler);
+ ExpressionAllocator.Deallocate(E);
+ return CC->DefiningExpr;
+ }
+ return nullptr;
+}
+
+const Expression *NewGVN::createExpression(Instruction *I,
+ const BasicBlock *B) {
+
+ auto *E = new (ExpressionAllocator) BasicExpression(I->getNumOperands());
+
+ bool AllConstant = setBasicExpressionInfo(I, E, B);
+
+ if (I->isCommutative()) {
+ // Ensure that commutative instructions that only differ by a permutation
+ // of their operands get the same value number by sorting the operand value
+ // numbers. Since all commutative instructions have two operands it is more
+ // efficient to sort by hand rather than using, say, std::sort.
+ assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!");
+ if (E->getOperand(0) > E->getOperand(1))
+ E->swapOperands(0, 1);
+ }
+
+ // Perform simplification.
+ // TODO: Right now we only check to see if we get a constant result.
+ // We may get a less than constant, but still better, result for
+ // some operations.
+ // IE
+ // add 0, x -> x
+ // and x, x -> x
+ // We should handle this by simply rewriting the expression.
+ if (auto *CI = dyn_cast<CmpInst>(I)) {
+ // Sort the operand value numbers so x<y and y>x get the same value
+ // number.
+ CmpInst::Predicate Predicate = CI->getPredicate();
+ if (E->getOperand(0) > E->getOperand(1)) {
+ E->swapOperands(0, 1);
+ Predicate = CmpInst::getSwappedPredicate(Predicate);
+ }
+ E->setOpcode((CI->getOpcode() << 8) | Predicate);
+ // TODO: 25% of our time is spent in SimplifyCmpInst with pointer operands
+ // TODO: Since we noop bitcasts, we may need to check types before
+ // simplifying, so that we don't end up simplifying based on a wrong
+ // type assumption. We should clean this up so we can use constants of the
+ // wrong type
+
+ assert(I->getOperand(0)->getType() == I->getOperand(1)->getType() &&
+ "Wrong types on cmp instruction");
+ if ((E->getOperand(0)->getType() == I->getOperand(0)->getType() &&
+ E->getOperand(1)->getType() == I->getOperand(1)->getType())) {
+ Value *V = SimplifyCmpInst(Predicate, E->getOperand(0), E->getOperand(1),
+ *DL, TLI, DT, AC);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ }
+ } else if (isa<SelectInst>(I)) {
+ if (isa<Constant>(E->getOperand(0)) ||
+ (E->getOperand(1)->getType() == I->getOperand(1)->getType() &&
+ E->getOperand(2)->getType() == I->getOperand(2)->getType())) {
+ Value *V = SimplifySelectInst(E->getOperand(0), E->getOperand(1),
+ E->getOperand(2), *DL, TLI, DT, AC);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ }
+ } else if (I->isBinaryOp()) {
+ Value *V = SimplifyBinOp(E->getOpcode(), E->getOperand(0), E->getOperand(1),
+ *DL, TLI, DT, AC);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ } else if (auto *BI = dyn_cast<BitCastInst>(I)) {
+ Value *V = SimplifyInstruction(BI, *DL, TLI, DT, AC);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ } else if (isa<GetElementPtrInst>(I)) {
+ Value *V = SimplifyGEPInst(E->getType(),
+ ArrayRef<Value *>(E->op_begin(), E->op_end()),
+ *DL, TLI, DT, AC);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ } else if (AllConstant) {
+ // We don't bother trying to simplify unless all of the operands
+ // were constant.
+ // TODO: There are a lot of Simplify*'s we could call here, if we
+ // wanted to. The original motivating case for this code was a
+ // zext i1 false to i8, which we don't have an interface to
+ // simplify (IE there is no SimplifyZExt).
+
+ SmallVector<Constant *, 8> C;
+ for (Value *Arg : E->operands())
+ C.emplace_back(cast<Constant>(Arg));
+
+ if (Value *V = ConstantFoldInstOperands(I, C, *DL, TLI))
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ }
+ return E;
+}
+
+const AggregateValueExpression *
+NewGVN::createAggregateValueExpression(Instruction *I, const BasicBlock *B) {
+ if (auto *II = dyn_cast<InsertValueInst>(I)) {
+ auto *E = new (ExpressionAllocator)
+ AggregateValueExpression(I->getNumOperands(), II->getNumIndices());
+ setBasicExpressionInfo(I, E, B);
+ E->allocateIntOperands(ExpressionAllocator);
+ std::copy(II->idx_begin(), II->idx_end(), int_op_inserter(E));
+ return E;
+ } else if (auto *EI = dyn_cast<ExtractValueInst>(I)) {
+ auto *E = new (ExpressionAllocator)
+ AggregateValueExpression(I->getNumOperands(), EI->getNumIndices());
+ setBasicExpressionInfo(EI, E, B);
+ E->allocateIntOperands(ExpressionAllocator);
+ std::copy(EI->idx_begin(), EI->idx_end(), int_op_inserter(E));
+ return E;
+ }
+ llvm_unreachable("Unhandled type of aggregate value operation");
+}
+
+const VariableExpression *NewGVN::createVariableExpression(Value *V) {
+ auto *E = new (ExpressionAllocator) VariableExpression(V);
+ E->setOpcode(V->getValueID());
+ return E;
+}
+
+const Expression *NewGVN::createVariableOrConstant(Value *V,
+ const BasicBlock *B) {
+ auto Leader = lookupOperandLeader(V, nullptr, B);
+ if (auto *C = dyn_cast<Constant>(Leader))
+ return createConstantExpression(C);
+ return createVariableExpression(Leader);
+}
+
+const ConstantExpression *NewGVN::createConstantExpression(Constant *C) {
+ auto *E = new (ExpressionAllocator) ConstantExpression(C);
+ E->setOpcode(C->getValueID());
+ return E;
+}
+
+const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) {
+ auto *E = new (ExpressionAllocator) UnknownExpression(I);
+ E->setOpcode(I->getOpcode());
+ return E;
+}
+
+const CallExpression *NewGVN::createCallExpression(CallInst *CI,
+ MemoryAccess *HV,
+ const BasicBlock *B) {
+ // FIXME: Add operand bundles for calls.
+ auto *E =
+ new (ExpressionAllocator) CallExpression(CI->getNumOperands(), CI, HV);
+ setBasicExpressionInfo(CI, E, B);
+ return E;
+}
+
+// See if we have a congruence class and leader for this operand, and if so,
+// return it. Otherwise, return the operand itself.
+template <class T>
+Value *NewGVN::lookupOperandLeader(Value *V, const User *U, const T &B) const {
+ CongruenceClass *CC = ValueToClass.lookup(V);
+ if (CC && (CC != InitialClass))
+ return CC->RepLeader;
+ return V;
+}
+
+MemoryAccess *NewGVN::lookupMemoryAccessEquiv(MemoryAccess *MA) const {
+ MemoryAccess *Result = MemoryAccessEquiv.lookup(MA);
+ return Result ? Result : MA;
+}
+
+LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp,
+ LoadInst *LI, MemoryAccess *DA,
+ const BasicBlock *B) {
+ auto *E = new (ExpressionAllocator) LoadExpression(1, LI, DA);
+ E->allocateOperands(ArgRecycler, ExpressionAllocator);
+ E->setType(LoadType);
+
+ // Give store and loads same opcode so they value number together.
+ E->setOpcode(0);
+ E->op_push_back(lookupOperandLeader(PointerOp, LI, B));
+ if (LI)
+ E->setAlignment(LI->getAlignment());
+
+ // TODO: Value number heap versions. We may be able to discover
+ // things alias analysis can't on its own (IE that a store and a
+ // load have the same value, and thus, it isn't clobbering the load).
+ return E;
+}
+
+const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI,
+ MemoryAccess *DA,
+ const BasicBlock *B) {
+ auto *E =
+ new (ExpressionAllocator) StoreExpression(SI->getNumOperands(), SI, DA);
+ E->allocateOperands(ArgRecycler, ExpressionAllocator);
+ E->setType(SI->getValueOperand()->getType());
+
+ // Give store and loads same opcode so they value number together.
+ E->setOpcode(0);
+ E->op_push_back(lookupOperandLeader(SI->getPointerOperand(), SI, B));
+
+ // TODO: Value number heap versions. We may be able to discover
+ // things alias analysis can't on its own (IE that a store and a
+ // load have the same value, and thus, it isn't clobbering the load).
+ return E;
+}
+
+// Utility function to check whether the congruence class has a member other
+// than the given instruction.
+bool hasMemberOtherThanUs(const CongruenceClass *CC, Instruction *I) {
+ // Either it has more than one store, in which case it must contain something
+ // other than us (because it's indexed by value), or if it only has one store
+ // right now, that member should not be us.
+ return CC->StoreCount > 1 || CC->Members.count(I) == 0;
+}
+
+const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I,
+ const BasicBlock *B) {
+ // Unlike loads, we never try to eliminate stores, so we do not check if they
+ // are simple and avoid value numbering them.
+ auto *SI = cast<StoreInst>(I);
+ MemoryAccess *StoreAccess = MSSA->getMemoryAccess(SI);
+ // See if we are defined by a previous store expression that already has a
+ // value, and it's the same value as our current store. FIXME: Right now, we
+ // only do this for simple stores, we should expand to cover memcpys, etc.
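+ // Illustrative (hypothetical) case: if an earlier "store i32 %v, i32* %p"
+ // already defines a congruence class whose leader is %v, a second simple
+ // store of %v through %p that reaches the same memory state joins that class
+ // below instead of receiving a fresh store expression keyed by its own
+ // MemoryDef.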
+ if (SI->isSimple()) {
+ // Get the expression, if any, for the RHS of the MemoryDef.
+ MemoryAccess *StoreRHS = lookupMemoryAccessEquiv(
+ cast<MemoryDef>(StoreAccess)->getDefiningAccess());
+ const Expression *OldStore = createStoreExpression(SI, StoreRHS, B);
+ CongruenceClass *CC = ExpressionToClass.lookup(OldStore);
+ // Basically, check if the congruence class the store is in is defined by a
+ // store that isn't us, and has the same value. MemorySSA takes care of
+ // ensuring the store has the same memory state as us already.
+ if (CC && CC->DefiningExpr && isa<StoreExpression>(CC->DefiningExpr) &&
+ CC->RepLeader == lookupOperandLeader(SI->getValueOperand(), SI, B) &&
+ hasMemberOtherThanUs(CC, I))
+ return createStoreExpression(SI, StoreRHS, B);
+ }
+
+ return createStoreExpression(SI, StoreAccess, B);
+}
+
+const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I,
+ const BasicBlock *B) {
+ auto *LI = cast<LoadInst>(I);
+
+ // We can eliminate in favor of non-simple loads, but we won't be able to
+ // eliminate the loads themselves.
+ if (!LI->isSimple())
+ return nullptr;
+
+ Value *LoadAddressLeader = lookupOperandLeader(LI->getPointerOperand(), I, B);
+ // Load of undef is undef.
+ if (isa<UndefValue>(LoadAddressLeader))
+ return createConstantExpression(UndefValue::get(LI->getType()));
+
+ MemoryAccess *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(I);
+
+ if (!MSSA->isLiveOnEntryDef(DefiningAccess)) {
+ if (auto *MD = dyn_cast<MemoryDef>(DefiningAccess)) {
+ Instruction *DefiningInst = MD->getMemoryInst();
+ // If the defining instruction is not reachable, replace with undef.
+ if (!ReachableBlocks.count(DefiningInst->getParent()))
+ return createConstantExpression(UndefValue::get(LI->getType()));
+ }
+ }
+
+ const Expression *E =
+ createLoadExpression(LI->getType(), LI->getPointerOperand(), LI,
+ lookupMemoryAccessEquiv(DefiningAccess), B);
+ return E;
+}
+
+// Evaluate read-only and pure calls, and create an expression result.
+const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I,
+ const BasicBlock *B) {
+ auto *CI = cast<CallInst>(I);
+ if (AA->doesNotAccessMemory(CI))
+ return createCallExpression(CI, nullptr, B);
+ if (AA->onlyReadsMemory(CI)) {
+ MemoryAccess *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(CI);
+ return createCallExpression(CI, lookupMemoryAccessEquiv(DefiningAccess), B);
+ }
+ return nullptr;
+}
+
+// Update the memory access equivalence table to say that From is equal to To,
+// and return true if this is different from what already existed in the table.
+bool NewGVN::setMemoryAccessEquivTo(MemoryAccess *From, MemoryAccess *To) {
+ DEBUG(dbgs() << "Setting " << *From << " equivalent to ");
+ if (!To)
+ DEBUG(dbgs() << "itself");
+ else
+ DEBUG(dbgs() << *To);
+ DEBUG(dbgs() << "\n");
+ auto LookupResult = MemoryAccessEquiv.find(From);
+ bool Changed = false;
+ // If it's already in the table, see if the value changed.
+ if (LookupResult != MemoryAccessEquiv.end()) {
+ if (To && LookupResult->second != To) {
+ // It wasn't equivalent before, and now it is.
+ LookupResult->second = To;
+ Changed = true;
+ } else if (!To) {
+ // It used to be equivalent to something, and now it's not.
+ MemoryAccessEquiv.erase(LookupResult);
+ Changed = true;
+ }
+ } else {
+ assert(!To &&
+ "Memory equivalence should never change from nothing to something");
+ }
+
+ return Changed;
+}
+// Evaluate PHI nodes symbolically, and create an expression result.
+const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I,
+ const BasicBlock *B) {
+ auto *E = cast<PHIExpression>(createPHIExpression(I));
+ // We match the semantics of SimplifyPhiNode from InstructionSimplify here.
+
+ // See if all arguments are the same.
+ // We track if any were undef because they need special handling.
+ bool HasUndef = false;
+ auto Filtered = make_filter_range(E->operands(), [&](const Value *Arg) {
+ if (Arg == I)
+ return false;
+ if (isa<UndefValue>(Arg)) {
+ HasUndef = true;
+ return false;
+ }
+ return true;
+ });
+ // If we are left with no operands, it's undef
+ if (Filtered.begin() == Filtered.end()) {
+ DEBUG(dbgs() << "Simplified PHI node " << *I << " to undef"
+ << "\n");
+ E->deallocateOperands(ArgRecycler);
+ ExpressionAllocator.Deallocate(E);
+ return createConstantExpression(UndefValue::get(I->getType()));
+ }
+ Value *AllSameValue = *(Filtered.begin());
+ ++Filtered.begin();
+ // Can't use std::equal here, sadly, because filter.begin moves.
+ if (llvm::all_of(Filtered, [AllSameValue](const Value *V) {
+ return V == AllSameValue;
+ })) {
+ // In LLVM's non-standard representation of phi nodes, it's possible to have
+ // phi nodes with cycles (IE dependent on other phis that are .... dependent
+ // on the original phi node), especially in weird CFG's where some arguments
+ // are unreachable, or uninitialized along certain paths. This can cause
+ // infinite loops during evaluation. We work around this by not trying to
+ // really evaluate them independently, but instead using a variable
+ // expression to say if one is equivalent to the other.
+ // We also special case undef, so that if we have an undef, we can't use the
+ // common value unless it dominates the phi block.
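+ // A hypothetical illustration of such a cycle:
+ //   %p = phi i32 [ %q, %bb1 ], [ undef, %bb2 ]
+ //   %q = phi i32 [ %p, %bb3 ], [ undef, %bb4 ]
+ // Trying to fully evaluate %p and %q in terms of each other would never
+ // terminate, so one is simply described as a variable equivalent to the other.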
+ if (HasUndef) {
+ // Only have to check for instructions
+ if (auto *AllSameInst = dyn_cast<Instruction>(AllSameValue))
+ if (!DT->dominates(AllSameInst, I))
+ return E;
+ }
+
+ NumGVNPhisAllSame++;
+ DEBUG(dbgs() << "Simplified PHI node " << *I << " to " << *AllSameValue
+ << "\n");
+ E->deallocateOperands(ArgRecycler);
+ ExpressionAllocator.Deallocate(E);
+ if (auto *C = dyn_cast<Constant>(AllSameValue))
+ return createConstantExpression(C);
+ return createVariableExpression(AllSameValue);
+ }
+ return E;
+}
+
+const Expression *
+NewGVN::performSymbolicAggrValueEvaluation(Instruction *I,
+ const BasicBlock *B) {
+ if (auto *EI = dyn_cast<ExtractValueInst>(I)) {
+ auto *II = dyn_cast<IntrinsicInst>(EI->getAggregateOperand());
+ if (II && EI->getNumIndices() == 1 && *EI->idx_begin() == 0) {
+ unsigned Opcode = 0;
+ // EI might be an extract from one of our recognised intrinsics. If it
+ // is, we'll synthesize a semantically equivalent expression instead of
+ // an extract value expression.
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow:
+ Opcode = Instruction::Add;
+ break;
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::usub_with_overflow:
+ Opcode = Instruction::Sub;
+ break;
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::umul_with_overflow:
+ Opcode = Instruction::Mul;
+ break;
+ default:
+ break;
+ }
+
+ if (Opcode != 0) {
+ // Intrinsic recognized. Grab its args to finish building the
+ // expression.
+ assert(II->getNumArgOperands() == 2 &&
+ "Expect two args for recognised intrinsics.");
+ return createBinaryExpression(Opcode, EI->getType(),
+ II->getArgOperand(0),
+ II->getArgOperand(1), B);
+ }
+ }
+ }
+
+ return createAggregateValueExpression(I, B);
+}
+
+// Substitute and symbolize the value before value numbering.
+const Expression *NewGVN::performSymbolicEvaluation(Value *V,
+ const BasicBlock *B) {
+ const Expression *E = nullptr;
+ if (auto *C = dyn_cast<Constant>(V))
+ E = createConstantExpression(C);
+ else if (isa<Argument>(V) || isa<GlobalVariable>(V)) {
+ E = createVariableExpression(V);
+ } else {
+ // TODO: memory intrinsics.
+ // TODO: Some day, we should do the forward propagation and reassociation
+ // parts of the algorithm.
+ auto *I = cast<Instruction>(V);
+ switch (I->getOpcode()) {
+ case Instruction::ExtractValue:
+ case Instruction::InsertValue:
+ E = performSymbolicAggrValueEvaluation(I, B);
+ break;
+ case Instruction::PHI:
+ E = performSymbolicPHIEvaluation(I, B);
+ break;
+ case Instruction::Call:
+ E = performSymbolicCallEvaluation(I, B);
+ break;
+ case Instruction::Store:
+ E = performSymbolicStoreEvaluation(I, B);
+ break;
+ case Instruction::Load:
+ E = performSymbolicLoadEvaluation(I, B);
+ break;
+ case Instruction::BitCast: {
+ E = createExpression(I, B);
+ } break;
+
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::Select:
+ case Instruction::ExtractElement:
+ case Instruction::InsertElement:
+ case Instruction::ShuffleVector:
+ case Instruction::GetElementPtr:
+ E = createExpression(I, B);
+ break;
+ default:
+ return nullptr;
+ }
+ }
+ return E;
+}
+
+// There is an edge from 'Src' to 'Dst'. Return true if every path from
+// the entry block to 'Dst' passes via this edge. In particular 'Dst'
+// must not be reachable via another edge from 'Src'.
+bool NewGVN::isOnlyReachableViaThisEdge(const BasicBlockEdge &E) const {
+
+ // While in theory it is interesting to consider the case in which Dst has
+ // more than one predecessor, because Dst might be part of a loop which is
+ // only reachable from Src, in practice it is pointless since at the time
+ // GVN runs all such loops have preheaders, which means that Dst will have
+ // been changed to have only one predecessor, namely Src.
+ const BasicBlock *Pred = E.getEnd()->getSinglePredecessor();
+ const BasicBlock *Src = E.getStart();
+ assert((!Pred || Pred == Src) && "No edge between these basic blocks!");
+ (void)Src;
+ return Pred != nullptr;
+}
+
+void NewGVN::markUsersTouched(Value *V) {
+ // Now mark the users as touched.
+ for (auto *User : V->users()) {
+ assert(isa<Instruction>(User) && "Use of value not within an instruction?");
+ TouchedInstructions.set(InstrDFS[User]);
+ }
+}
+
+void NewGVN::markMemoryUsersTouched(MemoryAccess *MA) {
+ for (auto U : MA->users()) {
+ if (auto *MUD = dyn_cast<MemoryUseOrDef>(U))
+ TouchedInstructions.set(InstrDFS[MUD->getMemoryInst()]);
+ else
+ TouchedInstructions.set(InstrDFS[U]);
+ }
+}
+
+// Touch the instructions that need to be updated after a congruence class has a
+// leader change, and mark changed values.
+void NewGVN::markLeaderChangeTouched(CongruenceClass *CC) {
+ for (auto M : CC->Members) {
+ if (auto *I = dyn_cast<Instruction>(M))
+ TouchedInstructions.set(InstrDFS[I]);
+ LeaderChanges.insert(M);
+ }
+}
+
+// Move a value, currently in OldClass, to be part of NewClass
+// Update OldClass for the move (including changing leaders, etc)
+void NewGVN::moveValueToNewCongruenceClass(Instruction *I,
+ CongruenceClass *OldClass,
+ CongruenceClass *NewClass) {
+ DEBUG(dbgs() << "New congruence class for " << I << " is " << NewClass->ID
+ << "\n");
+
+ if (I == OldClass->NextLeader.first)
+ OldClass->NextLeader = {nullptr, ~0U};
+
+ // It's possible, though unlikely, for us to discover equivalences such
+ // that the current leader does not dominate the old one.
+ // This statistic tracks how often this happens.
+ // We assert on phi nodes when this happens, currently, for debugging, because
+ // we want to make sure we name phi node cycles properly.
+ if (isa<Instruction>(NewClass->RepLeader) && NewClass->RepLeader &&
+ I != NewClass->RepLeader &&
+ DT->properlyDominates(
+ I->getParent(),
+ cast<Instruction>(NewClass->RepLeader)->getParent())) {
+ ++NumGVNNotMostDominatingLeader;
+ assert(!isa<PHINode>(I) &&
+ "New class for instruction should not be dominated by instruction");
+ }
+
+ if (NewClass->RepLeader != I) {
+ auto DFSNum = InstrDFS.lookup(I);
+ if (DFSNum < NewClass->NextLeader.second)
+ NewClass->NextLeader = {I, DFSNum};
+ }
+
+ OldClass->Members.erase(I);
+ NewClass->Members.insert(I);
+ if (isa<StoreInst>(I)) {
+ --OldClass->StoreCount;
+ assert(OldClass->StoreCount >= 0);
+ ++NewClass->StoreCount;
+ assert(NewClass->StoreCount > 0);
+ }
+
+ ValueToClass[I] = NewClass;
+ // See if we destroyed the class or need to swap leaders.
+ if (OldClass->Members.empty() && OldClass != InitialClass) {
+ if (OldClass->DefiningExpr) {
+ OldClass->Dead = true;
+ DEBUG(dbgs() << "Erasing expression " << OldClass->DefiningExpr
+ << " from table\n");
+ ExpressionToClass.erase(OldClass->DefiningExpr);
+ }
+ } else if (OldClass->RepLeader == I) {
+ // When the leader changes, the value numbering of
+ // everything may change due to symbolization changes, so we need to
+ // reprocess.
+ DEBUG(dbgs() << "Leader change!\n");
+ ++NumGVNLeaderChanges;
+ // We don't need to sort members if there is only 1, and we don't care about
+ // sorting the initial class because everything either gets out of it or is
+ // unreachable.
+ if (OldClass->Members.size() == 1 || OldClass == InitialClass) {
+ OldClass->RepLeader = *(OldClass->Members.begin());
+ } else if (OldClass->NextLeader.first) {
+ ++NumGVNAvoidedSortedLeaderChanges;
+ OldClass->RepLeader = OldClass->NextLeader.first;
+ OldClass->NextLeader = {nullptr, ~0U};
+ } else {
+ ++NumGVNSortedLeaderChanges;
+ // TODO: If this ends up too slow, we can maintain a dual structure for
+ // member testing/insertion, or keep things mostly sorted, and sort only
+ // here, or ....
+ std::pair<Value *, unsigned> MinDFS = {nullptr, ~0U};
+ for (const auto X : OldClass->Members) {
+ auto DFSNum = InstrDFS.lookup(X);
+ if (DFSNum < MinDFS.second)
+ MinDFS = {X, DFSNum};
+ }
+ OldClass->RepLeader = MinDFS.first;
+ }
+ markLeaderChangeTouched(OldClass);
+ }
+}
+
+// Perform congruence finding on a given value numbering expression.
+void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
+ ValueToExpression[I] = E;
+ // This is guaranteed to return something, since it will at least find
+ // INITIAL.
+
+ CongruenceClass *IClass = ValueToClass[I];
+ assert(IClass && "Should have found a IClass");
+ // Dead classes should have been eliminated from the mapping.
+ assert(!IClass->Dead && "Found a dead class");
+
+ CongruenceClass *EClass;
+ if (const auto *VE = dyn_cast<VariableExpression>(E)) {
+ EClass = ValueToClass[VE->getVariableValue()];
+ } else {
+ auto lookupResult = ExpressionToClass.insert({E, nullptr});
+
+ // If it's not in the value table, create a new congruence class.
+ if (lookupResult.second) {
+ CongruenceClass *NewClass = createCongruenceClass(nullptr, E);
+ auto place = lookupResult.first;
+ place->second = NewClass;
+
+ // Constants and variables should always be made the leader.
+ if (const auto *CE = dyn_cast<ConstantExpression>(E)) {
+ NewClass->RepLeader = CE->getConstantValue();
+ } else if (const auto *SE = dyn_cast<StoreExpression>(E)) {
+ StoreInst *SI = SE->getStoreInst();
+ NewClass->RepLeader =
+ lookupOperandLeader(SI->getValueOperand(), SI, SI->getParent());
+ } else {
+ NewClass->RepLeader = I;
+ }
+ assert(!isa<VariableExpression>(E) &&
+ "VariableExpression should have been handled already");
+
+ EClass = NewClass;
+ DEBUG(dbgs() << "Created new congruence class for " << *I
+ << " using expression " << *E << " at " << NewClass->ID
+ << " and leader " << *(NewClass->RepLeader) << "\n");
+ DEBUG(dbgs() << "Hash value was " << E->getHashValue() << "\n");
+ } else {
+ EClass = lookupResult.first->second;
+ if (isa<ConstantExpression>(E))
+ assert(isa<Constant>(EClass->RepLeader) &&
+ "Any class with a constant expression should have a "
+ "constant leader");
+
+ assert(EClass && "Somehow don't have an eclass");
+
+ assert(!EClass->Dead && "We accidentally looked up a dead class");
+ }
+ }
+ bool ClassChanged = IClass != EClass;
+ bool LeaderChanged = LeaderChanges.erase(I);
+ if (ClassChanged || LeaderChanged) {
+ DEBUG(dbgs() << "Found class " << EClass->ID << " for expression " << E
+ << "\n");
+
+ if (ClassChanged)
+ moveValueToNewCongruenceClass(I, IClass, EClass);
+ markUsersTouched(I);
+ if (MemoryAccess *MA = MSSA->getMemoryAccess(I)) {
+ // If this is a MemoryDef, we need to update the equivalence table. If
+ // we determined the expression is congruent to a different memory
+ // state, use that different memory state. If we determined it didn't,
+ // we update that as well. Right now, we only support store
+ // expressions.
+ if (!isa<MemoryUse>(MA) && isa<StoreExpression>(E) &&
+ EClass->Members.size() != 1) {
+ auto *DefAccess = cast<StoreExpression>(E)->getDefiningAccess();
+ setMemoryAccessEquivTo(MA, DefAccess != MA ? DefAccess : nullptr);
+ } else {
+ setMemoryAccessEquivTo(MA, nullptr);
+ }
+ markMemoryUsersTouched(MA);
+ }
+ } else if (auto *SI = dyn_cast<StoreInst>(I)) {
+ // There is, sadly, one complicating thing for stores. Stores do not
+ // produce values, only consume them. However, in order to make loads and
+ // stores value number the same, we ignore the value operand of the store.
+ // But the value operand will still be the leader of our class, and thus, it
+ // may change. Because the store is a use, the store will get reprocessed,
+ // but nothing will change about it, and so nothing above will catch it
+ // (since the class will not change). In order to make sure everything ends
+ // up okay, we need to recheck the leader of the class. Since stores of
+ // different values value number differently due to different memorydefs, we
+ // are guaranteed the leader is always the same between stores in the same
+ // class.
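+ // Hypothetical illustration: if two stores of %v are in the same class with
+ // leader %v, and %v is later proven equal to a constant, the stores' class
+ // leader must be refreshed here even though their class membership did not
+ // change.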
+ DEBUG(dbgs() << "Checking store leader\n");
+ auto ProperLeader =
+ lookupOperandLeader(SI->getValueOperand(), SI, SI->getParent());
+ if (EClass->RepLeader != ProperLeader) {
+ DEBUG(dbgs() << "Store leader changed, fixing\n");
+ EClass->RepLeader = ProperLeader;
+ markLeaderChangeTouched(EClass);
+ markMemoryUsersTouched(MSSA->getMemoryAccess(SI));
+ }
+ }
+}
+
+// Process the fact that Edge (from, to) is reachable, including marking
+// any newly reachable blocks and instructions for processing.
+void NewGVN::updateReachableEdge(BasicBlock *From, BasicBlock *To) {
+ // Check if the Edge was reachable before.
+ if (ReachableEdges.insert({From, To}).second) {
+ // If this block wasn't reachable before, all instructions are touched.
+ if (ReachableBlocks.insert(To).second) {
+ DEBUG(dbgs() << "Block " << getBlockName(To) << " marked reachable\n");
+ const auto &InstRange = BlockInstRange.lookup(To);
+ TouchedInstructions.set(InstRange.first, InstRange.second);
+ } else {
+ DEBUG(dbgs() << "Block " << getBlockName(To)
+ << " was reachable, but new edge {" << getBlockName(From)
+ << "," << getBlockName(To) << "} to it found\n");
+
+ // We've made an edge reachable to an existing block, which may
+ // impact predicates. Otherwise, only mark the phi nodes as touched, as
+ // they are the only things that depend on new edges. Anything using their
+ // values will get propagated to if necessary.
+ if (MemoryAccess *MemPhi = MSSA->getMemoryAccess(To))
+ TouchedInstructions.set(InstrDFS[MemPhi]);
+
+ auto BI = To->begin();
+ while (isa<PHINode>(BI)) {
+ TouchedInstructions.set(InstrDFS[&*BI]);
+ ++BI;
+ }
+ }
+ }
+}
+
+// Given a predicate condition (from a switch, cmp, or whatever) and a block,
+// see if we know some constant value for it already.
+Value *NewGVN::findConditionEquivalence(Value *Cond, BasicBlock *B) const {
+ auto Result = lookupOperandLeader(Cond, nullptr, B);
+ if (isa<Constant>(Result))
+ return Result;
+ return nullptr;
+}
+
+// Process the outgoing edges of a block for reachability.
+void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
+ // Evaluate reachability of terminator instruction.
+ BranchInst *BR;
+ if ((BR = dyn_cast<BranchInst>(TI)) && BR->isConditional()) {
+ Value *Cond = BR->getCondition();
+ Value *CondEvaluated = findConditionEquivalence(Cond, B);
+ if (!CondEvaluated) {
+ if (auto *I = dyn_cast<Instruction>(Cond)) {
+ const Expression *E = createExpression(I, B);
+ if (const auto *CE = dyn_cast<ConstantExpression>(E)) {
+ CondEvaluated = CE->getConstantValue();
+ }
+ } else if (isa<ConstantInt>(Cond)) {
+ CondEvaluated = Cond;
+ }
+ }
+ ConstantInt *CI;
+ BasicBlock *TrueSucc = BR->getSuccessor(0);
+ BasicBlock *FalseSucc = BR->getSuccessor(1);
+ if (CondEvaluated && (CI = dyn_cast<ConstantInt>(CondEvaluated))) {
+ if (CI->isOne()) {
+ DEBUG(dbgs() << "Condition for Terminator " << *TI
+ << " evaluated to true\n");
+ updateReachableEdge(B, TrueSucc);
+ } else if (CI->isZero()) {
+ DEBUG(dbgs() << "Condition for Terminator " << *TI
+ << " evaluated to false\n");
+ updateReachableEdge(B, FalseSucc);
+ }
+ } else {
+ updateReachableEdge(B, TrueSucc);
+ updateReachableEdge(B, FalseSucc);
+ }
+ } else if (auto *SI = dyn_cast<SwitchInst>(TI)) {
+ // For switches, propagate the case values into the case
+ // destinations.
+
+ // Remember how many outgoing edges there are to every successor.
+ SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges;
+
+ Value *SwitchCond = SI->getCondition();
+ Value *CondEvaluated = findConditionEquivalence(SwitchCond, B);
+ // See if we were able to turn this switch statement into a constant.
+ if (CondEvaluated && isa<ConstantInt>(CondEvaluated)) {
+ auto *CondVal = cast<ConstantInt>(CondEvaluated);
+ // We should be able to get case value for this.
+ auto CaseVal = SI->findCaseValue(CondVal);
+ if (CaseVal.getCaseSuccessor() == SI->getDefaultDest()) {
+ // We proved the value is outside of the range of the case.
+ // We can't do anything other than mark the default dest as reachable,
+ // and go home.
+ updateReachableEdge(B, SI->getDefaultDest());
+ return;
+ }
+ // Now get where it goes and mark it reachable.
+ BasicBlock *TargetBlock = CaseVal.getCaseSuccessor();
+ updateReachableEdge(B, TargetBlock);
+ } else {
+ for (unsigned i = 0, e = SI->getNumSuccessors(); i != e; ++i) {
+ BasicBlock *TargetBlock = SI->getSuccessor(i);
+ ++SwitchEdges[TargetBlock];
+ updateReachableEdge(B, TargetBlock);
+ }
+ }
+ } else {
+ // Otherwise this is either unconditional, or a type we have no
+ // idea about. Just mark successors as reachable.
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) {
+ BasicBlock *TargetBlock = TI->getSuccessor(i);
+ updateReachableEdge(B, TargetBlock);
+ }
+
+ // This also may be a memory defining terminator, in which case, set it
+ // equivalent to nothing.
+ if (MemoryAccess *MA = MSSA->getMemoryAccess(TI))
+ setMemoryAccessEquivTo(MA, nullptr);
+ }
+}
+
+// The algorithm initially places the values of the routine in the INITIAL
+// congruence class. The leader of INITIAL is the undetermined value `TOP`.
+// When the algorithm has finished, values still in INITIAL are unreachable.
+void NewGVN::initializeCongruenceClasses(Function &F) {
+ // FIXME: Now I can't remember why this is 2.
+ NextCongruenceNum = 2;
+ // Initialize all other instructions to be in INITIAL class.
+ CongruenceClass::MemberSet InitialValues;
+ InitialClass = createCongruenceClass(nullptr, nullptr);
+ for (auto &B : F) {
+ if (auto *MP = MSSA->getMemoryAccess(&B))
+ MemoryAccessEquiv.insert({MP, MSSA->getLiveOnEntryDef()});
+
+ for (auto &I : B) {
+ InitialValues.insert(&I);
+ ValueToClass[&I] = InitialClass;
+ // All memory accesses are equivalent to live on entry to start. They must
+ // be initialized to something so that initial changes are noticed. For
+ // the maximal answer, we initialize them all to be the same as
+ // liveOnEntry. Note that to save time, we only initialize the
+ // MemoryDefs for stores and all MemoryPhis to be equal. Right now, no
+ // other expression can generate a memory equivalence. If we start
+ // handling memcpy/etc, we can expand this.
+ if (isa<StoreInst>(&I)) {
+ MemoryAccessEquiv.insert(
+ {MSSA->getMemoryAccess(&I), MSSA->getLiveOnEntryDef()});
+ ++InitialClass->StoreCount;
+ assert(InitialClass->StoreCount > 0);
+ }
+ }
+ }
+ InitialClass->Members.swap(InitialValues);
+
+ // Initialize arguments to be in their own unique congruence classes
+ for (auto &FA : F.args())
+ createSingletonCongruenceClass(&FA);
+}
+
+void NewGVN::cleanupTables() {
+ for (unsigned i = 0, e = CongruenceClasses.size(); i != e; ++i) {
+ DEBUG(dbgs() << "Congruence class " << CongruenceClasses[i]->ID << " has "
+ << CongruenceClasses[i]->Members.size() << " members\n");
+ // Make sure we delete the congruence class (probably worth switching to
+ // a unique_ptr at some point).
+ delete CongruenceClasses[i];
+ CongruenceClasses[i] = nullptr;
+ }
+
+ ValueToClass.clear();
+ ArgRecycler.clear(ExpressionAllocator);
+ ExpressionAllocator.Reset();
+ CongruenceClasses.clear();
+ ExpressionToClass.clear();
+ ValueToExpression.clear();
+ ReachableBlocks.clear();
+ ReachableEdges.clear();
+#ifndef NDEBUG
+ ProcessedCount.clear();
+#endif
+ DFSDomMap.clear();
+ InstrDFS.clear();
+ InstructionsToErase.clear();
+
+ DFSToInstr.clear();
+ BlockInstRange.clear();
+ TouchedInstructions.clear();
+ DominatedInstRange.clear();
+ MemoryAccessEquiv.clear();
+}
+
+std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B,
+ unsigned Start) {
+ unsigned End = Start;
+ if (MemoryAccess *MemPhi = MSSA->getMemoryAccess(B)) {
+ InstrDFS[MemPhi] = End++;
+ DFSToInstr.emplace_back(MemPhi);
+ }
+
+ for (auto &I : *B) {
+ InstrDFS[&I] = End++;
+ DFSToInstr.emplace_back(&I);
+ }
+
+ // All of the range functions take half-open ranges (open on the end side).
+ // So we do not subtract one from End, because at this point it is one
+ // greater than the last instruction.
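+ // For example, a block whose first instruction gets DFS number 5 and that
+ // contains three instructions (and no MemoryPhi) yields the range [5, 8).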
+ return std::make_pair(Start, End);
+}
+
+void NewGVN::updateProcessedCount(Value *V) {
+#ifndef NDEBUG
+ if (ProcessedCount.count(V) == 0) {
+ ProcessedCount.insert({V, 1});
+ } else {
+ ProcessedCount[V] += 1;
+ assert(ProcessedCount[V] < 100 &&
+ "Seem to have processed the same Value a lot");
+ }
+#endif
+}
+// Evaluate MemoryPhi nodes symbolically, just like PHI nodes
+void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
+ // If all the arguments are the same, the MemoryPhi has the same value as the
+ // argument.
+ // Filter out unreachable blocks from our operands.
+ auto Filtered = make_filter_range(MP->operands(), [&](const Use &U) {
+ return ReachableBlocks.count(MP->getIncomingBlock(U));
+ });
+
+ assert(Filtered.begin() != Filtered.end() &&
+ "We should not be processing a MemoryPhi in a completely "
+ "unreachable block");
+
+ // Transform the remaining operands into operand leaders.
+ // FIXME: mapped_iterator should have a range version.
+ auto LookupFunc = [&](const Use &U) {
+ return lookupMemoryAccessEquiv(cast<MemoryAccess>(U));
+ };
+ auto MappedBegin = map_iterator(Filtered.begin(), LookupFunc);
+ auto MappedEnd = map_iterator(Filtered.end(), LookupFunc);
+
+ // And now check if all the elements are equal.
+ // Sadly, we can't simply use std::equal here, so instead check that every
+ // remaining element matches the first one.
+ MemoryAccess *AllSameValue = *MappedBegin;
+ ++MappedBegin;
+ bool AllEqual = std::all_of(
+ MappedBegin, MappedEnd,
+ [&AllSameValue](const MemoryAccess *V) { return V == AllSameValue; });
+
+ if (AllEqual)
+ DEBUG(dbgs() << "Memory Phi value numbered to " << *AllSameValue << "\n");
+ else
+ DEBUG(dbgs() << "Memory Phi value numbered to itself\n");
+
+ if (setMemoryAccessEquivTo(MP, AllEqual ? AllSameValue : nullptr))
+ markMemoryUsersTouched(MP);
+}
+
+// Value number a single instruction, symbolically evaluating, performing
+// congruence finding, and updating mappings.
+void NewGVN::valueNumberInstruction(Instruction *I) {
+ DEBUG(dbgs() << "Processing instruction " << *I << "\n");
+ if (isInstructionTriviallyDead(I, TLI)) {
+ DEBUG(dbgs() << "Skipping unused instruction\n");
+ markInstructionForDeletion(I);
+ return;
+ }
+ if (!I->isTerminator()) {
+ const auto *Symbolized = performSymbolicEvaluation(I, I->getParent());
+ // If we couldn't come up with a symbolic expression, use the unknown
+ // expression
+ if (Symbolized == nullptr)
+ Symbolized = createUnknownExpression(I);
+ performCongruenceFinding(I, Symbolized);
+ } else {
+ // Handle terminators that return values. All of them produce values we
+ // don't currently understand.
+ if (!I->getType()->isVoidTy()) {
+ auto *Symbolized = createUnknownExpression(I);
+ performCongruenceFinding(I, Symbolized);
+ }
+ processOutgoingEdges(dyn_cast<TerminatorInst>(I), I->getParent());
+ }
+}
+
+// Check if there is a path, using single or equal argument phi nodes, from
+// First to Second.
+bool NewGVN::singleReachablePHIPath(const MemoryAccess *First,
+ const MemoryAccess *Second) const {
+ if (First == Second)
+ return true;
+
+ if (auto *FirstDef = dyn_cast<MemoryUseOrDef>(First)) {
+ auto *DefAccess = FirstDef->getDefiningAccess();
+ return singleReachablePHIPath(DefAccess, Second);
+ } else {
+ auto *MP = cast<MemoryPhi>(First);
+ auto ReachableOperandPred = [&](const Use &U) {
+ return ReachableBlocks.count(MP->getIncomingBlock(U));
+ };
+ auto FilteredPhiArgs =
+ make_filter_range(MP->operands(), ReachableOperandPred);
+ SmallVector<const Value *, 32> OperandList;
+ std::copy(FilteredPhiArgs.begin(), FilteredPhiArgs.end(),
+ std::back_inserter(OperandList));
+ bool Okay = OperandList.size() == 1;
+ if (!Okay)
+ Okay = std::equal(OperandList.begin(), OperandList.end(),
+ OperandList.begin());
+ if (Okay)
+ return singleReachablePHIPath(cast<MemoryAccess>(OperandList[0]), Second);
+ return false;
+ }
+}
+
+// Verify that the memory equivalence table makes sense relative to the
+// congruence classes. Note that this checking is not perfect, and is currently
+// subject to very rare false negatives. It is only useful for testing/debugging.
+void NewGVN::verifyMemoryCongruency() const {
+ // Anything equivalent in the memory access table should be in the same
+ // congruence class.
+
+ // Filter out the unreachable and trivially dead entries, because they may
+ // never have been updated if the instructions were not processed.
+ auto ReachableAccessPred =
+ [&](const std::pair<const MemoryAccess *, MemoryAccess *> Pair) {
+ bool Result = ReachableBlocks.count(Pair.first->getBlock());
+ if (!Result)
+ return false;
+ if (auto *MemDef = dyn_cast<MemoryDef>(Pair.first))
+ return !isInstructionTriviallyDead(MemDef->getMemoryInst());
+ return true;
+ };
+
+ auto Filtered = make_filter_range(MemoryAccessEquiv, ReachableAccessPred);
+ for (auto KV : Filtered) {
+ assert(KV.first != KV.second &&
+ "We added a useless equivalence to the memory equivalence table");
+ // Unreachable instructions may not have changed because we never process
+ // them.
+ if (!ReachableBlocks.count(KV.first->getBlock()))
+ continue;
+ if (auto *FirstMUD = dyn_cast<MemoryUseOrDef>(KV.first)) {
+ auto *SecondMUD = dyn_cast<MemoryUseOrDef>(KV.second);
+ if (FirstMUD && SecondMUD)
+ assert((singleReachablePHIPath(FirstMUD, SecondMUD) ||
+ ValueToClass.lookup(FirstMUD->getMemoryInst()) ==
+ ValueToClass.lookup(SecondMUD->getMemoryInst())) &&
+ "The instructions for these memory operations should have "
+ "been in the same congruence class or reachable through "
+ "a single argument phi");
+ } else if (auto *FirstMP = dyn_cast<MemoryPhi>(KV.first)) {
+
+ // We can only sanely verify that MemoryDefs in the operand list all have
+ // the same class.
+ auto ReachableOperandPred = [&](const Use &U) {
+ return ReachableBlocks.count(FirstMP->getIncomingBlock(U)) &&
+ isa<MemoryDef>(U);
+
+ };
+ // All arguments should be in the same class, ignoring unreachable arguments.
+ auto FilteredPhiArgs =
+ make_filter_range(FirstMP->operands(), ReachableOperandPred);
+ SmallVector<const CongruenceClass *, 16> PhiOpClasses;
+ std::transform(FilteredPhiArgs.begin(), FilteredPhiArgs.end(),
+ std::back_inserter(PhiOpClasses), [&](const Use &U) {
+ const MemoryDef *MD = cast<MemoryDef>(U);
+ return ValueToClass.lookup(MD->getMemoryInst());
+ });
+ assert((PhiOpClasses.empty() ||
+ std::equal(PhiOpClasses.begin() + 1, PhiOpClasses.end(),
+ PhiOpClasses.begin())) &&
+ "All MemoryPhi arguments should be in the same class");
+ }
+ }
+}
+
+// This is the main transformation entry point.
+bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
+ TargetLibraryInfo *_TLI, AliasAnalysis *_AA,
+ MemorySSA *_MSSA) {
+ bool Changed = false;
+ DT = _DT;
+ AC = _AC;
+ TLI = _TLI;
+ AA = _AA;
+ MSSA = _MSSA;
+ DL = &F.getParent()->getDataLayout();
+ MSSAWalker = MSSA->getWalker();
+
+ // Count number of instructions for sizing of hash tables, and come
+ // up with a global dfs numbering for instructions.
+ unsigned ICount = 1;
+ // Add an empty instruction to account for the fact that we start at 1
+ DFSToInstr.emplace_back(nullptr);
+ // Note: We want RPO traversal of the blocks, which is not quite the same as
+ // dominator tree order, particularly with regard to whether backedges get
+ // visited first or second, given a block with multiple successors.
+ // If we visit in the wrong order, we will end up performing N times as many
+ // iterations.
+ // The dominator tree does guarantee that, for a given dom tree node, its
+ // parent must occur before it in the RPO ordering. Thus, we only need to sort
+ // the siblings.
+ DenseMap<const DomTreeNode *, unsigned> RPOOrdering;
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+ unsigned Counter = 0;
+ for (auto &B : RPOT) {
+ auto *Node = DT->getNode(B);
+ assert(Node && "RPO and Dominator tree should have same reachability");
+ RPOOrdering[Node] = ++Counter;
+ }
+ // Sort dominator tree children arrays into RPO.
+ for (auto &B : RPOT) {
+ auto *Node = DT->getNode(B);
+ if (Node->getChildren().size() > 1)
+ std::sort(Node->begin(), Node->end(),
+ [&RPOOrdering](const DomTreeNode *A, const DomTreeNode *B) {
+ return RPOOrdering[A] < RPOOrdering[B];
+ });
+ }
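+
+ // Illustrative sketch, not part of the pass: for the diamond CFG
+ //   A -> {B, C}, B -> D, C -> D
+ // the dominator tree children of A are {B, C, D}. With RPO numbers such as
+ // A=1, B=2, C=3, D=4, the sort above orders A's children as B, C, D, so the
+ // depth-first walk of the dominator tree below visits D only after B and C,
+ // matching RPO.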
+
+ // Now a standard depth first ordering of the domtree is equivalent to RPO.
+ auto DFI = df_begin(DT->getRootNode());
+ for (auto DFE = df_end(DT->getRootNode()); DFI != DFE; ++DFI) {
+ BasicBlock *B = DFI->getBlock();
+ const auto &BlockRange = assignDFSNumbers(B, ICount);
+ BlockInstRange.insert({B, BlockRange});
+ ICount += BlockRange.second - BlockRange.first;
+ }
+
+ // Handle forward unreachable blocks and figure out which blocks
+ // have single preds.
+ for (auto &B : F) {
+ // Assign numbers to unreachable blocks.
+ if (!DFI.nodeVisited(DT->getNode(&B))) {
+ const auto &BlockRange = assignDFSNumbers(&B, ICount);
+ BlockInstRange.insert({&B, BlockRange});
+ ICount += BlockRange.second - BlockRange.first;
+ }
+ }
+
+ TouchedInstructions.resize(ICount);
+ DominatedInstRange.reserve(F.size());
+ // Ensure we don't end up resizing the ExpressionToClass map, as
+ // that can be quite expensive. At most, we have one expression per
+ // instruction.
+ ExpressionToClass.reserve(ICount);
+
+ // Initialize the touched instructions to include the entry block.
+ const auto &InstRange = BlockInstRange.lookup(&F.getEntryBlock());
+ TouchedInstructions.set(InstRange.first, InstRange.second);
+ ReachableBlocks.insert(&F.getEntryBlock());
+
+ initializeCongruenceClasses(F);
+
+ unsigned int Iterations = 0;
+ // We start out in the entry block.
+ BasicBlock *LastBlock = &F.getEntryBlock();
+ while (TouchedInstructions.any()) {
+ ++Iterations;
+ // Walk through all the instructions in all the blocks in RPO.
+ for (int InstrNum = TouchedInstructions.find_first(); InstrNum != -1;
+ InstrNum = TouchedInstructions.find_next(InstrNum)) {
+ assert(InstrNum != 0 && "Bit 0 should never be set, something touched an "
+ "instruction not in the lookup table");
+ Value *V = DFSToInstr[InstrNum];
+ BasicBlock *CurrBlock = nullptr;
+
+ if (auto *I = dyn_cast<Instruction>(V))
+ CurrBlock = I->getParent();
+ else if (auto *MP = dyn_cast<MemoryPhi>(V))
+ CurrBlock = MP->getBlock();
+ else
+ llvm_unreachable("DFSToInstr gave us an unknown type of instruction");
+
+ // If we hit a new block, do reachability processing.
+ if (CurrBlock != LastBlock) {
+ LastBlock = CurrBlock;
+ bool BlockReachable = ReachableBlocks.count(CurrBlock);
+ const auto &CurrInstRange = BlockInstRange.lookup(CurrBlock);
+
+ // If it's not reachable, erase any touched instructions and move on.
+ if (!BlockReachable) {
+ TouchedInstructions.reset(CurrInstRange.first, CurrInstRange.second);
+ DEBUG(dbgs() << "Skipping instructions in block "
+ << getBlockName(CurrBlock)
+ << " because it is unreachable\n");
+ continue;
+ }
+ updateProcessedCount(CurrBlock);
+ }
+
+ if (auto *MP = dyn_cast<MemoryPhi>(V)) {
+ DEBUG(dbgs() << "Processing MemoryPhi " << *MP << "\n");
+ valueNumberMemoryPhi(MP);
+ } else if (auto *I = dyn_cast<Instruction>(V)) {
+ valueNumberInstruction(I);
+ } else {
+ llvm_unreachable("Should have been a MemoryPhi or Instruction");
+ }
+ updateProcessedCount(V);
+ // Reset after processing (because we may mark ourselves as touched when
+ // we propagate equalities).
+ TouchedInstructions.reset(InstrNum);
+ }
+ }
+ NumGVNMaxIterations = std::max(NumGVNMaxIterations.getValue(), Iterations);
+#ifndef NDEBUG
+ verifyMemoryCongruency();
+#endif
+ Changed |= eliminateInstructions(F);
+
+ // Delete all instructions marked for deletion.
+ for (Instruction *ToErase : InstructionsToErase) {
+ if (!ToErase->use_empty())
+ ToErase->replaceAllUsesWith(UndefValue::get(ToErase->getType()));
+
+ ToErase->eraseFromParent();
+ }
+
+ // Delete all unreachable blocks.
+ auto UnreachableBlockPred = [&](const BasicBlock &BB) {
+ return !ReachableBlocks.count(&BB);
+ };
+
+ for (auto &BB : make_filter_range(F, UnreachableBlockPred)) {
+ DEBUG(dbgs() << "We believe block " << getBlockName(&BB)
+ << " is unreachable\n");
+ deleteInstructionsInBlock(&BB);
+ Changed = true;
+ }
+
+ cleanupTables();
+ return Changed;
+}
+
+bool NewGVN::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+ return runGVN(F, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
+ &getAnalysis<AAResultsWrapperPass>().getAAResults(),
+ &getAnalysis<MemorySSAWrapperPass>().getMSSA());
+}
+
+PreservedAnalyses NewGVNPass::run(Function &F, AnalysisManager<Function> &AM) {
+ NewGVN Impl;
+
+ // Apparently the order in which we get these results matters for
+ // the old GVN (see Chandler's comment in GVN.cpp). I'll keep
+ // the same order here, just in case.
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
+ bool Changed = Impl.runGVN(F, &DT, &AC, &TLI, &AA, &MSSA);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+// Return true if V is a value that will always be available (IE can
+// be placed anywhere) in the function. We don't do globals here
+// because they are often worse to put in place.
+// TODO: Separate cost from availability
+static bool alwaysAvailable(Value *V) {
+ return isa<Constant>(V) || isa<Argument>(V);
+}
+
+// Get the basic block from an instruction/value.
+static BasicBlock *getBlockForValue(Value *V) {
+ if (auto *I = dyn_cast<Instruction>(V))
+ return I->getParent();
+ return nullptr;
+}
+
+struct NewGVN::ValueDFS {
+ int DFSIn = 0;
+ int DFSOut = 0;
+ int LocalNum = 0;
+ // Only one of these will be set.
+ Value *Val = nullptr;
+ Use *U = nullptr;
+
+ bool operator<(const ValueDFS &Other) const {
+ // It's not enough that any given field be less than - we have sets
+ // of fields that need to be evaluated together to give a proper ordering.
+ // For example, if you have:
+ // DFS (1, 3)
+ // Val 0
+ // DFS (1, 2)
+ // Val 50
+ // We want the second to be less than the first, but if we just go field
+ // by field, we will get to Val 0 < Val 50 and say the first is less than
+ // the second. We only want it to be less than if the DFS orders are equal.
+ //
+ // Each LLVM instruction only produces one value, and thus the lowest-level
+ // differentiator that really matters for the stack (and what we use as a
+ // replacement) is the local dfs number.
+ // Everything else in the structure is instruction level, and only affects
+ // the order in which we will replace operands of a given instruction.
+ //
+ // For a given instruction (IE things with equal dfsin, dfsout, localnum),
+ // the order of replacement of uses does not matter.
+ // IE given,
+ // a = 5
+ // b = a + a
+ // When you hit b, you will have two valuedfs with the same dfsin, out, and
+ // localnum.
+ // The .val will be the same as well.
+ // The .u's will be different.
+ // You will replace both, and it does not matter what order you replace them
+ // in (IE whether you replace operand 2, then operand 1, or operand 1, then
+ // operand 2).
+ // Similarly for the case of same dfsin, dfsout, localnum, but different
+ // .val's
+ // a = 5
+ // b = 6
+ // c = a + b
+ // in c, we will have a valuedfs for a, and one for b, with everything the same
+ // but .val and .u.
+ // It does not matter what order we replace these operands in.
+ // You will always end up with the same IR, and this is guaranteed.
+ return std::tie(DFSIn, DFSOut, LocalNum, Val, U) <
+ std::tie(Other.DFSIn, Other.DFSOut, Other.LocalNum, Other.Val,
+ Other.U);
+ }
+};
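+
+// Illustrative sketch, not part of the pass: std::tie gives exactly the
+// lexicographic order described above. For the two entries in the example,
+//   std::tie(1, 2, LocalNum, Val, U) < std::tie(1, 3, LocalNum, Val, U)
+// is true because the comparison is decided at DFSOut (2 < 3) before Val or
+// U are ever examined, so the (1,2) entry sorts first.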
+
+void NewGVN::convertDenseToDFSOrdered(
+ CongruenceClass::MemberSet &Dense,
+ SmallVectorImpl<ValueDFS> &DFSOrderedSet) {
+ for (auto D : Dense) {
+ // First add the value.
+ BasicBlock *BB = getBlockForValue(D);
+ // Constants are handled prior to ever calling this function, so
+ // we should only be left with instructions as members.
+ assert(BB && "Should have figured out a basic block for value");
+ ValueDFS VD;
+
+ std::pair<int, int> DFSPair = DFSDomMap[BB];
+ assert(DFSPair.first != -1 && DFSPair.second != -1 && "Invalid DFS Pair");
+ VD.DFSIn = DFSPair.first;
+ VD.DFSOut = DFSPair.second;
+ VD.Val = D;
+ // If it's an instruction, use the real local dfs number.
+ if (auto *I = dyn_cast<Instruction>(D))
+ VD.LocalNum = InstrDFS[I];
+ else
+ llvm_unreachable("Should have been an instruction");
+
+ DFSOrderedSet.emplace_back(VD);
+
+ // Now add the users.
+ for (auto &U : D->uses()) {
+ if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+ ValueDFS VD;
+ // Put the phi node uses in the incoming block.
+ BasicBlock *IBlock;
+ if (auto *P = dyn_cast<PHINode>(I)) {
+ IBlock = P->getIncomingBlock(U);
+ // Make phi node users appear last in the incoming block
+ // they are from.
+ VD.LocalNum = InstrDFS.size() + 1;
+ } else {
+ IBlock = I->getParent();
+ VD.LocalNum = InstrDFS[I];
+ }
+ std::pair<int, int> DFSPair = DFSDomMap[IBlock];
+ VD.DFSIn = DFSPair.first;
+ VD.DFSOut = DFSPair.second;
+ VD.U = &U;
+ DFSOrderedSet.emplace_back(VD);
+ }
+ }
+ }
+}
+
+static void patchReplacementInstruction(Instruction *I, Value *Repl) {
+ // Patch the replacement so that it is not more restrictive than the value
+ // being replaced.
+ auto *Op = dyn_cast<BinaryOperator>(I);
+ auto *ReplOp = dyn_cast<BinaryOperator>(Repl);
+
+ if (Op && ReplOp)
+ ReplOp->andIRFlags(Op);
+
+ if (auto *ReplInst = dyn_cast<Instruction>(Repl)) {
+ // FIXME: If both the original and replacement value are part of the
+ // same control-flow region (meaning that the execution of one
+ // guarantees the execution of the other), then we can combine the
+ // noalias scopes here and do better than the general conservative
+ // answer used in combineMetadata().
+
+ // In general, GVN unifies expressions over different control-flow
+ // regions, and so we need a conservative combination of the noalias
+ // scopes.
+ unsigned KnownIDs[] = {
+ LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias, LLVMContext::MD_range,
+ LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load,
+ LLVMContext::MD_invariant_group};
+ combineMetadata(ReplInst, I, KnownIDs);
+ }
+}
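+
+// For example (illustrative IR): if the instruction being replaced is
+//   %orig = add i32 %x, %y
+// and its replacement is
+//   %repl = add nuw nsw i32 %x, %y
+// then andIRFlags intersects the wrap flags, stripping nuw/nsw from %repl so
+// the replacement makes no assumptions the original did not, and
+// combineMetadata keeps only metadata that is valid for both instructions.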
+
+static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) {
+ patchReplacementInstruction(I, Repl);
+ I->replaceAllUsesWith(Repl);
+}
+
+void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) {
+ DEBUG(dbgs() << " BasicBlock Dead:" << *BB);
+ ++NumGVNBlocksDeleted;
+
+ // Check to see if there are any non-terminator instructions to delete.
+ if (isa<TerminatorInst>(BB->begin()))
+ return;
+
+ // Delete the instructions backwards, as it has a reduced likelihood of having
+ // to update as many def-use and use-def chains. Start after the terminator.
+ auto StartPoint = BB->rbegin();
+ ++StartPoint;
+ // Note that we explicitly recalculate BB->rend() on each iteration,
+ // as it may change when we remove the first instruction.
+ for (BasicBlock::reverse_iterator I(StartPoint); I != BB->rend();) {
+ Instruction &Inst = *I++;
+ if (!Inst.use_empty())
+ Inst.replaceAllUsesWith(UndefValue::get(Inst.getType()));
+ if (isa<LandingPadInst>(Inst))
+ continue;
+
+ Inst.eraseFromParent();
+ ++NumGVNInstrDeleted;
+ }
+}
+
+void NewGVN::markInstructionForDeletion(Instruction *I) {
+ DEBUG(dbgs() << "Marking " << *I << " for deletion\n");
+ InstructionsToErase.insert(I);
+}
+
+void NewGVN::replaceInstruction(Instruction *I, Value *V) {
+ DEBUG(dbgs() << "Replacing " << *I << " with " << *V << "\n");
+ patchAndReplaceAllUsesWith(I, V);
+ // We save the actual erasing to avoid invalidating memory
+ // dependencies until we are done with everything.
+ markInstructionForDeletion(I);
+}
+
+namespace {
+
+// This is a stack that contains both the value and dfs info of where
+// that value is valid.
+class ValueDFSStack {
+public:
+ Value *back() const { return ValueStack.back(); }
+ std::pair<int, int> dfs_back() const { return DFSStack.back(); }
+
+ void push_back(Value *V, int DFSIn, int DFSOut) {
+ ValueStack.emplace_back(V);
+ DFSStack.emplace_back(DFSIn, DFSOut);
+ }
+ bool empty() const { return DFSStack.empty(); }
+ bool isInScope(int DFSIn, int DFSOut) const {
+ if (empty())
+ return false;
+ return DFSIn >= DFSStack.back().first && DFSOut <= DFSStack.back().second;
+ }
+
+ void popUntilDFSScope(int DFSIn, int DFSOut) {
+
+ // These two should always be in sync at this point.
+ assert(ValueStack.size() == DFSStack.size() &&
+ "Mismatch between ValueStack and DFSStack");
+ while (
+ !DFSStack.empty() &&
+ !(DFSIn >= DFSStack.back().first && DFSOut <= DFSStack.back().second)) {
+ DFSStack.pop_back();
+ ValueStack.pop_back();
+ }
+ }
+
+private:
+ SmallVector<Value *, 8> ValueStack;
+ SmallVector<std::pair<int, int>, 8> DFSStack;
+};
+}
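+
+// Illustrative sketch, not part of the pass: the elimination walk below
+// drives this stack roughly as follows for DFS-sorted members:
+//   ValueDFSStack Stack;
+//   Stack.push_back(Leader, /*DFSIn=*/1, /*DFSOut=*/10);
+//   Stack.isInScope(4, 7);          // true: (4,7) is nested inside (1,10)
+//   Stack.popUntilDFSScope(12, 14); // (12,14) is outside, so Leader is popped
+//   Stack.empty();                  // now true; a new leader must be pushed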
+
+bool NewGVN::eliminateInstructions(Function &F) {
+ // This is a non-standard eliminator. The normal way to eliminate is
+ // to walk the dominator tree in order, keeping track of available
+ // values, and eliminating them. However, this is mildly
+ // pointless. It requires doing lookups on every instruction,
+ // regardless of whether we will ever eliminate it. Most instructions
+ // are in singleton congruence classes, and we know we will never
+ // eliminate those.
+
+ // Instead, this eliminator looks at the congruence classes directly, sorts
+ // them into a DFS ordering of the dominator tree, and then we just
+ // perform elimination straight on the sets by walking the congruence
+ // class member uses in order, and eliminate the ones dominated by the
+ // last member. This is worst case O(E log E) where E = number of
+ // instructions in a single congruence class. In theory, this is all
+ // instructions. In practice, it is much faster, as most instructions are
+ // either in singleton congruence classes or can't possibly be eliminated
+ // anyway (if there are no overlapping DFS ranges in class).
+ // When we find something not dominated, it becomes the new leader
+ // for elimination purposes.
+ // TODO: If we wanted to be faster, we could remove any members with no
+ // overlapping ranges while sorting, as we will never eliminate anything
+ // with those members, as they don't dominate anything else in our set.
+
+ bool AnythingReplaced = false;
+
+ // Since we are going to walk the domtree anyway, and we can't guarantee the
+ // DFS numbers are updated, we compute some ourselves.
+ DT->updateDFSNumbers();
+
+ for (auto &B : F) {
+ if (!ReachableBlocks.count(&B)) {
+ for (const auto S : successors(&B)) {
+ for (auto II = S->begin(); isa<PHINode>(II); ++II) {
+ auto &Phi = cast<PHINode>(*II);
+ DEBUG(dbgs() << "Replacing incoming value of " << *II << " for block "
+ << getBlockName(&B)
+ << " with undef due to it being unreachable\n");
+ for (auto &Operand : Phi.incoming_values())
+ if (Phi.getIncomingBlock(Operand) == &B)
+ Operand.set(UndefValue::get(Phi.getType()));
+ }
+ }
+ }
+ DomTreeNode *Node = DT->getNode(&B);
+ if (Node)
+ DFSDomMap[&B] = {Node->getDFSNumIn(), Node->getDFSNumOut()};
+ }
+
+ for (CongruenceClass *CC : CongruenceClasses) {
+ // FIXME: We should eventually be able to replace everything still
+ // in the initial class with undef, as they should be unreachable.
+ // Right now, initial still contains some things we skip value
+ // numbering of (UNREACHABLE's, for example).
+ if (CC == InitialClass || CC->Dead)
+ continue;
+ assert(CC->RepLeader && "We should have had a leader");
+
+ // If this is a leader that is always available, and it's a
+ // constant or has no equivalences, just replace everything with
+ // it. We then update the congruence class with whatever members
+ // are left.
+ if (alwaysAvailable(CC->RepLeader)) {
+ SmallPtrSet<Value *, 4> MembersLeft;
+ for (auto M : CC->Members) {
+
+ Value *Member = M;
+
+ // Void things have no uses we can replace.
+ if (Member == CC->RepLeader || Member->getType()->isVoidTy()) {
+ MembersLeft.insert(Member);
+ continue;
+ }
+
+ DEBUG(dbgs() << "Found replacement " << *(CC->RepLeader) << " for "
+ << *Member << "\n");
+ // Due to equality propagation, these may not always be
+ // instructions, they may be real values. We don't really
+ // care about trying to replace the non-instructions.
+ if (auto *I = dyn_cast<Instruction>(Member)) {
+ assert(CC->RepLeader != I &&
+ "About to accidentally remove our leader");
+ replaceInstruction(I, CC->RepLeader);
+ AnythingReplaced = true;
+
+ continue;
+ } else {
+ MembersLeft.insert(Member);
+ }
+ }
+ CC->Members.swap(MembersLeft);
+
+ } else {
+ DEBUG(dbgs() << "Eliminating in congruence class " << CC->ID << "\n");
+ // If this is a singleton, we can skip it.
+ if (CC->Members.size() != 1) {
+
+ // This is a stack because equality replacement/etc may place
+ // constants in the middle of the member list, and we want to use
+ // those constant values in preference to the current leader, over
+ // the scope of those constants.
+ ValueDFSStack EliminationStack;
+
+ // Convert the members to DFS ordered sets and then merge them.
+ SmallVector<ValueDFS, 8> DFSOrderedSet;
+ convertDenseToDFSOrdered(CC->Members, DFSOrderedSet);
+
+ // Sort the whole thing.
+ std::sort(DFSOrderedSet.begin(), DFSOrderedSet.end());
+
+ for (auto &VD : DFSOrderedSet) {
+ int MemberDFSIn = VD.DFSIn;
+ int MemberDFSOut = VD.DFSOut;
+ Value *Member = VD.Val;
+ Use *MemberUse = VD.U;
+
+ if (Member) {
+ // We ignore void things because we can't get a value from them.
+ // FIXME: We could actually use this to kill dead stores that are
+ // dominated by equivalent earlier stores.
+ if (Member->getType()->isVoidTy())
+ continue;
+ }
+
+ if (EliminationStack.empty()) {
+ DEBUG(dbgs() << "Elimination Stack is empty\n");
+ } else {
+ DEBUG(dbgs() << "Elimination Stack Top DFS numbers are ("
+ << EliminationStack.dfs_back().first << ","
+ << EliminationStack.dfs_back().second << ")\n");
+ }
+
+ DEBUG(dbgs() << "Current DFS numbers are (" << MemberDFSIn << ","
+ << MemberDFSOut << ")\n");
+ // First, we see if we are out of scope or empty. If so,
+ // and there are equivalences, we try to replace the top of
+ // stack with equivalences (if it's on the stack, it must
+ // not have been eliminated yet).
+ // Then we synchronize to our current scope, by
+ // popping until we are back within a DFS scope that
+ // dominates the current member.
+ // Then, what happens depends on a few factors
+ // If the stack is now empty, we need to push
+ // If we have a constant or a local equivalence we want to
+ // start using, we also push.
+ // Otherwise, we walk along, processing members who are
+ // dominated by this scope, and eliminate them.
+ bool ShouldPush =
+ Member && (EliminationStack.empty() || isa<Constant>(Member));
+ bool OutOfScope =
+ !EliminationStack.isInScope(MemberDFSIn, MemberDFSOut);
+
+ if (OutOfScope || ShouldPush) {
+ // Sync to our current scope.
+ EliminationStack.popUntilDFSScope(MemberDFSIn, MemberDFSOut);
+ ShouldPush |= Member && EliminationStack.empty();
+ if (ShouldPush) {
+ EliminationStack.push_back(Member, MemberDFSIn, MemberDFSOut);
+ }
+ }
+
+ // If we get to this point and the stack is empty, we must have a use
+ // with nothing we can use to eliminate it; just skip it.
+ if (EliminationStack.empty())
+ continue;
+
+ // Skip the Values; we only want to eliminate on their uses.
+ if (Member)
+ continue;
+ Value *Result = EliminationStack.back();
+
+ // Don't replace our existing users with ourselves.
+ if (MemberUse->get() == Result)
+ continue;
+
+ DEBUG(dbgs() << "Found replacement " << *Result << " for "
+ << *MemberUse->get() << " in " << *(MemberUse->getUser())
+ << "\n");
+
+ // If we replaced something in an instruction, handle the patching of
+ // metadata.
+ if (auto *ReplacedInst = dyn_cast<Instruction>(MemberUse->get()))
+ patchReplacementInstruction(ReplacedInst, Result);
+
+ assert(isa<Instruction>(MemberUse->getUser()));
+ MemberUse->set(Result);
+ AnythingReplaced = true;
+ }
+ }
+ }
+
+ // Cleanup the congruence class.
+ SmallPtrSet<Value *, 4> MembersLeft;
+ for (Value *Member : CC->Members) {
+ if (Member->getType()->isVoidTy()) {
+ MembersLeft.insert(Member);
+ continue;
+ }
+
+ if (auto *MemberInst = dyn_cast<Instruction>(Member)) {
+ if (isInstructionTriviallyDead(MemberInst)) {
+ // TODO: Don't mark loads of undefs.
+ markInstructionForDeletion(MemberInst);
+ continue;
+ }
+ }
+ MembersLeft.insert(Member);
+ }
+ CC->Members.swap(MembersLeft);
+ }
+
+ return AnythingReplaced;
+}
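+
+// Illustrative sketch, not part of the pass: given two congruent adds
+//   %a = add i32 %x, %y   ; block's DFS range (1, 10)
+//   %b = add i32 %x, %y   ; block's DFS range (4, 7)
+// both land in one congruence class. Walking the class in DFS order pushes
+// %a onto the elimination stack; the uses of %b fall inside %a's DFS scope,
+// so they are rewritten to use %a and %b becomes trivially dead.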
diff --git a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
index c4b3e34..1a7ddc9 100644
--- a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -123,7 +123,7 @@ static bool runPartiallyInlineLibCalls(Function &F, TargetLibraryInfo *TLI,
}
PreservedAnalyses
-PartiallyInlineLibCallsPass::run(Function &F, AnalysisManager<Function> &AM) {
+PartiallyInlineLibCallsPass::run(Function &F, FunctionAnalysisManager &AM) {
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &TTI = AM.getResult<TargetIRAnalysis>(F);
if (!runPartiallyInlineLibCalls(F, &TLI, &TTI))
diff --git a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
index e42e2c6..65c814d 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -145,7 +145,8 @@ static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode1,
return nullptr;
}
-void ReassociatePass::BuildRankMap(Function &F) {
+void ReassociatePass::BuildRankMap(Function &F,
+ ReversePostOrderTraversal<Function*> &RPOT) {
unsigned i = 2;
// Assign distinct ranks to function arguments.
@@ -154,7 +155,7 @@ void ReassociatePass::BuildRankMap(Function &F) {
DEBUG(dbgs() << "Calculated Rank[" << I->getName() << "] = " << i << "\n");
}
- ReversePostOrderTraversal<Function *> RPOT(&F);
+ // Traverse basic blocks in ReversePostOrder
for (BasicBlock *BB : RPOT) {
unsigned BBRank = RankMap[BB] = ++i << 16;
@@ -507,9 +508,10 @@ static bool LinearizeExprTree(BinaryOperator *I,
continue;
}
// No uses outside the expression, try morphing it.
- } else if (It != Leaves.end()) {
+ } else {
// Already in the leaf map.
- assert(Visited.count(Op) && "In leaf map but not visited!");
+ assert(It != Leaves.end() && Visited.count(Op) &&
+ "In leaf map but not visited!");
// Update the number of paths to the leaf.
IncorporateWeight(It->second, Weight, Opcode);
@@ -1519,8 +1521,8 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
if (ConstantInt *CI = dyn_cast<ConstantInt>(Factor)) {
if (CI->isNegative() && !CI->isMinValue(true)) {
Factor = ConstantInt::get(CI->getContext(), -CI->getValue());
- assert(!Duplicates.count(Factor) &&
- "Shouldn't have two constant factors, missed a canonicalize");
+ if (!Duplicates.insert(Factor).second)
+ continue;
unsigned Occ = ++FactorOccurrences[Factor];
if (Occ > MaxOcc) {
MaxOcc = Occ;
@@ -1532,8 +1534,8 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
APFloat F(CF->getValueAPF());
F.changeSign();
Factor = ConstantFP::get(CF->getContext(), F);
- assert(!Duplicates.count(Factor) &&
- "Shouldn't have two constant factors, missed a canonicalize");
+ if (!Duplicates.insert(Factor).second)
+ continue;
unsigned Occ = ++FactorOccurrences[Factor];
if (Occ > MaxOcc) {
MaxOcc = Occ;
@@ -1776,6 +1778,12 @@ Value *ReassociatePass::OptimizeMul(BinaryOperator *I,
return nullptr; // All distinct factors, so nothing left for us to do.
IRBuilder<> Builder(I);
+ // The reassociate transformation for FP operations is performed only
+ // if unsafe algebra is permitted by FastMathFlags. Propagate those flags
+ // to the newly generated operations.
+ if (auto FPI = dyn_cast<FPMathOperator>(I))
+ Builder.setFastMathFlags(FPI->getFastMathFlags());
+
Value *V = buildMinimalMultiplyDAG(Builder, Factors);
if (Ops.empty())
return V;
@@ -1863,6 +1871,8 @@ void ReassociatePass::RecursivelyEraseDeadInsts(
/// Zap the given instruction, adding interesting operands to the work list.
void ReassociatePass::EraseInst(Instruction *I) {
assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
+ DEBUG(dbgs() << "Erasing dead inst: "; I->dump());
+
SmallVector<Value*, 8> Ops(I->op_begin(), I->op_end());
// Erase the dead instruction.
ValueRankMap.erase(I);
@@ -2172,11 +2182,19 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
}
PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
+ // Get the function's basic blocks in Reverse Post Order. This order is used
+ // by BuildRankMap to precalculate ranks correctly. It also excludes dead basic
+ // blocks (it has been seen that the analysis in this pass could hang when
+ // analysing dead basic blocks).
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+
// Calculate the rank map for F.
- BuildRankMap(F);
+ BuildRankMap(F, RPOT);
MadeChange = false;
- for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
+ // Traverse the same blocks that were analysed by BuildRankMap.
+ for (BasicBlock *BI : RPOT) {
+ assert(RankMap.count(&*BI) && "BB should be ranked.");
// Optimize every instruction in the basic block.
for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;)
if (isInstructionTriviallyDead(&*II)) {
@@ -2196,8 +2214,10 @@ PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
// trivially dead instructions have been removed.
while (!ToRedo.empty()) {
Instruction *I = ToRedo.pop_back_val();
- if (isInstructionTriviallyDead(I))
+ if (isInstructionTriviallyDead(I)) {
RecursivelyEraseDeadInsts(I, ToRedo);
+ MadeChange = true;
+ }
}
// Now that we have removed dead instructions, we can reoptimize the
diff --git a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index bab39a3..1de7420 100644
--- a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -453,7 +453,7 @@ static BaseDefiningValueResult findBaseDefiningValue(Value *I) {
if (isa<CallInst>(I) || isa<InvokeInst>(I))
return BaseDefiningValueResult(I, true);
- // I have absolutely no idea how to implement this part yet. It's not
+ // TODO: I have absolutely no idea how to implement this part yet. It's not
// necessarily hard, I just haven't really looked at it yet.
assert(!isa<LandingPadInst>(I) && "Landing Pad is unimplemented");
@@ -676,7 +676,8 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
#ifndef NDEBUG
auto isExpectedBDVType = [](Value *BDV) {
return isa<PHINode>(BDV) || isa<SelectInst>(BDV) ||
- isa<ExtractElementInst>(BDV) || isa<InsertElementInst>(BDV);
+ isa<ExtractElementInst>(BDV) || isa<InsertElementInst>(BDV) ||
+ isa<ShuffleVectorInst>(BDV);
};
#endif
@@ -719,9 +720,11 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
} else if (auto *IE = dyn_cast<InsertElementInst>(Current)) {
visitIncomingValue(IE->getOperand(0)); // vector operand
visitIncomingValue(IE->getOperand(1)); // scalar operand
- } else {
- // There is one known class of instructions we know we don't handle.
- assert(isa<ShuffleVectorInst>(Current));
+ } else if (auto *SV = dyn_cast<ShuffleVectorInst>(Current)) {
+ visitIncomingValue(SV->getOperand(0));
+ visitIncomingValue(SV->getOperand(1));
+ }
+ else {
llvm_unreachable("Unimplemented instruction case");
}
}
@@ -778,12 +781,17 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
// useful in that it drives us to conflict if our input is.
NewState =
meetBDVState(NewState, getStateForInput(EE->getVectorOperand()));
- } else {
+ } else if (auto *IE = dyn_cast<InsertElementInst>(BDV)){
// Given there's a inherent type mismatch between the operands, will
// *always* produce Conflict.
- auto *IE = cast<InsertElementInst>(BDV);
NewState = meetBDVState(NewState, getStateForInput(IE->getOperand(0)));
NewState = meetBDVState(NewState, getStateForInput(IE->getOperand(1)));
+ } else {
+ // The only instance this does not return a Conflict is when both the
+ // vector operands are the same vector.
+ auto *SV = cast<ShuffleVectorInst>(BDV);
+ NewState = meetBDVState(NewState, getStateForInput(SV->getOperand(0)));
+ NewState = meetBDVState(NewState, getStateForInput(SV->getOperand(1)));
}
BDVState OldState = States[BDV];
@@ -855,13 +863,18 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
std::string Name = suffixed_name_or(I, ".base", "base_ee");
return ExtractElementInst::Create(Undef, EE->getIndexOperand(), Name,
EE);
- } else {
- auto *IE = cast<InsertElementInst>(I);
+ } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
UndefValue *VecUndef = UndefValue::get(IE->getOperand(0)->getType());
UndefValue *ScalarUndef = UndefValue::get(IE->getOperand(1)->getType());
std::string Name = suffixed_name_or(I, ".base", "base_ie");
return InsertElementInst::Create(VecUndef, ScalarUndef,
IE->getOperand(2), Name, IE);
+ } else {
+ auto *SV = cast<ShuffleVectorInst>(I);
+ UndefValue *VecUndef = UndefValue::get(SV->getOperand(0)->getType());
+ std::string Name = suffixed_name_or(I, ".base", "base_sv");
+ return new ShuffleVectorInst(VecUndef, VecUndef, SV->getOperand(2),
+ Name, SV);
}
};
Instruction *BaseInst = MakeBaseInstPlaceholder(I);
@@ -963,8 +976,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
// Find the instruction which produces the base for each input. We may
// need to insert a bitcast.
BaseEE->setOperand(0, getBaseForInput(InVal, BaseEE));
- } else {
- auto *BaseIE = cast<InsertElementInst>(State.getBaseValue());
+ } else if (auto *BaseIE = dyn_cast<InsertElementInst>(State.getBaseValue())){
auto *BdvIE = cast<InsertElementInst>(BDV);
auto UpdateOperand = [&](int OperandIdx) {
Value *InVal = BdvIE->getOperand(OperandIdx);
@@ -973,6 +985,16 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
};
UpdateOperand(0); // vector operand
UpdateOperand(1); // scalar operand
+ } else {
+ auto *BaseSV = cast<ShuffleVectorInst>(State.getBaseValue());
+ auto *BdvSV = cast<ShuffleVectorInst>(BDV);
+ auto UpdateOperand = [&](int OperandIdx) {
+ Value *InVal = BdvSV->getOperand(OperandIdx);
+ Value *Base = getBaseForInput(InVal, BaseSV);
+ BaseSV->setOperand(OperandIdx, Base);
+ };
+ UpdateOperand(0); // vector operand
+ UpdateOperand(1); // vector operand
}
}
@@ -1154,7 +1176,7 @@ static void CreateGCRelocates(ArrayRef<Value *> LiveVariables,
return;
auto FindIndex = [](ArrayRef<Value *> LiveVec, Value *Val) {
- auto ValIt = std::find(LiveVec.begin(), LiveVec.end(), Val);
+ auto ValIt = find(LiveVec, Val);
assert(ValIt != LiveVec.end() && "Val not found in LiveVec!");
size_t Index = std::distance(LiveVec.begin(), ValIt);
assert(Index < LiveVec.size() && "Bug in std::find?");
@@ -1273,6 +1295,24 @@ public:
};
}
+static StringRef getDeoptLowering(CallSite CS) {
+ const char *DeoptLowering = "deopt-lowering";
+ if (CS.hasFnAttr(DeoptLowering)) {
+ // FIXME: CallSite has a *really* confusing interface around attributes
+ // with values.
+ const AttributeSet &CSAS = CS.getAttributes();
+ if (CSAS.hasAttribute(AttributeSet::FunctionIndex,
+ DeoptLowering))
+ return CSAS.getAttribute(AttributeSet::FunctionIndex,
+ DeoptLowering).getValueAsString();
+ Function *F = CS.getCalledFunction();
+ assert(F && F->hasFnAttribute(DeoptLowering));
+ return F->getFnAttribute(DeoptLowering).getValueAsString();
+ }
+ return "live-through";
+}
+
static void
makeStatepointExplicitImpl(const CallSite CS, /* to replace */
const SmallVectorImpl<Value *> &BasePtrs,
@@ -1314,6 +1354,14 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */
if (SD.StatepointID)
StatepointID = *SD.StatepointID;
+ // Pass through the requested lowering if any. The default is live-through.
+ StringRef DeoptLowering = getDeoptLowering(CS);
+ if (DeoptLowering.equals("live-in"))
+ Flags |= uint32_t(StatepointFlags::DeoptLiveIn);
+ else {
+ assert(DeoptLowering.equals("live-through") && "Unsupported value!");
+ }
+
Value *CallTarget = CS.getCalledValue();
if (Function *F = dyn_cast<Function>(CallTarget)) {
if (F->getIntrinsicID() == Intrinsic::experimental_deoptimize) {
@@ -1347,7 +1395,7 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */
StatepointID, NumPatchBytes, CallTarget, Flags, CallArgs,
TransitionArgs, DeoptArgs, GCArgs, "safepoint_token");
- Call->setTailCall(ToReplace->isTailCall());
+ Call->setTailCallKind(ToReplace->getTailCallKind());
Call->setCallingConv(ToReplace->getCallingConv());
// Currently we will fail on parameter attributes and on certain
@@ -1740,9 +1788,8 @@ static void relocationViaAlloca(
/// tests in ways which make them less useful in testing fused safepoints.
template <typename T> static void unique_unsorted(SmallVectorImpl<T> &Vec) {
SmallSet<T, 8> Seen;
- Vec.erase(std::remove_if(Vec.begin(), Vec.end(), [&](const T &V) {
- return !Seen.insert(V).second;
- }), Vec.end());
+ Vec.erase(remove_if(Vec, [&](const T &V) { return !Seen.insert(V).second; }),
+ Vec.end());
}
/// Insert holders so that each Value is obviously live through the entire
@@ -1784,38 +1831,33 @@ static void findLiveReferences(
}
// Helper function for the "rematerializeLiveValues". It walks use chain
-// starting from the "CurrentValue" until it meets "BaseValue". Only "simple"
-// values are visited (currently it is GEP's and casts). Returns true if it
-// successfully reached "BaseValue" and false otherwise.
-// Fills "ChainToBase" array with all visited values. "BaseValue" is not
-// recorded.
-static bool findRematerializableChainToBasePointer(
+// starting from the "CurrentValue" until it reaches the root of the chain, i.e.
+// the base or a value it cannot process. Only "simple" values are processed
+// (currently it is GEP's and casts). The returned root is examined by the
+// callers of findRematerializableChainToBasePointer. Fills "ChainToBase" array
+// with all visited values.
+static Value* findRematerializableChainToBasePointer(
SmallVectorImpl<Instruction*> &ChainToBase,
- Value *CurrentValue, Value *BaseValue) {
-
- // We have found a base value
- if (CurrentValue == BaseValue) {
- return true;
- }
+ Value *CurrentValue) {
if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurrentValue)) {
ChainToBase.push_back(GEP);
return findRematerializableChainToBasePointer(ChainToBase,
- GEP->getPointerOperand(),
- BaseValue);
+ GEP->getPointerOperand());
}
if (CastInst *CI = dyn_cast<CastInst>(CurrentValue)) {
if (!CI->isNoopCast(CI->getModule()->getDataLayout()))
- return false;
+ return CI;
ChainToBase.push_back(CI);
return findRematerializableChainToBasePointer(ChainToBase,
- CI->getOperand(0), BaseValue);
+ CI->getOperand(0));
}
- // Not supported instruction in the chain
- return false;
+ // We have reached the root of the chain, which is either equal to the base or
+ // is the first unsupported value along the use chain.
+ return CurrentValue;
}
// Helper function for the "rematerializeLiveValues". Compute cost of the use
@@ -1852,6 +1894,34 @@ chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain,
return Cost;
}
+static bool AreEquivalentPhiNodes(PHINode &OrigRootPhi, PHINode &AlternateRootPhi) {
+
+ unsigned PhiNum = OrigRootPhi.getNumIncomingValues();
+ if (PhiNum != AlternateRootPhi.getNumIncomingValues() ||
+ OrigRootPhi.getParent() != AlternateRootPhi.getParent())
+ return false;
+ // Map of incoming values and their corresponding basic blocks of
+ // OrigRootPhi.
+ SmallDenseMap<Value *, BasicBlock *, 8> CurrentIncomingValues;
+ for (unsigned i = 0; i < PhiNum; i++)
+ CurrentIncomingValues[OrigRootPhi.getIncomingValue(i)] =
+ OrigRootPhi.getIncomingBlock(i);
+
+ // Both current and base PHIs should have same incoming values and
+ // the same basic blocks corresponding to the incoming values.
+ for (unsigned i = 0; i < PhiNum; i++) {
+ auto CIVI =
+ CurrentIncomingValues.find(AlternateRootPhi.getIncomingValue(i));
+ if (CIVI == CurrentIncomingValues.end())
+ return false;
+ BasicBlock *CurrentIncomingBB = CIVI->second;
+ if (CurrentIncomingBB != AlternateRootPhi.getIncomingBlock(i))
+ return false;
+ }
+ return true;
+
+}
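+
+// For example (illustrative IR): the two phis
+//   %phi      = phi i64* [ %a, %left ], [ %b, %right ]
+//   %phi.base = phi i64* [ %a, %left ], [ %b, %right ]
+// have the same incoming values from the same incoming blocks, so
+// AreEquivalentPhiNodes treats them as the same SSA value.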
+
// From the statepoint live set pick values that are cheaper to recompute then
// to relocate. Remove this values from the live set, rematerialize them after
// statepoint and record them in "Info" structure. Note that similar to
@@ -1869,16 +1939,38 @@ static void rematerializeLiveValues(CallSite CS,
// For each live pointer find it's defining chain
SmallVector<Instruction *, 3> ChainToBase;
assert(Info.PointerToBase.count(LiveValue));
- bool FoundChain =
+ Value *RootOfChain =
findRematerializableChainToBasePointer(ChainToBase,
- LiveValue,
- Info.PointerToBase[LiveValue]);
+ LiveValue);
+
// Nothing to do, or chain is too long
- if (!FoundChain ||
- ChainToBase.size() == 0 ||
+ if (ChainToBase.size() == 0 ||
ChainToBase.size() > ChainLengthThreshold)
continue;
+ // Handle the scenario where the RootOfChain is not equal to the
+ // Base Value, but they are essentially the same phi values.
+ if (RootOfChain != Info.PointerToBase[LiveValue]) {
+ PHINode *OrigRootPhi = dyn_cast<PHINode>(RootOfChain);
+ PHINode *AlternateRootPhi = dyn_cast<PHINode>(Info.PointerToBase[LiveValue]);
+ if (!OrigRootPhi || !AlternateRootPhi)
+ continue;
+ // PHI nodes that have the same incoming values, and belong to the same
+ // basic block, are essentially the same SSA value. When the original phi
+ // has incoming values with different base pointers, the original phi is
+ // marked as conflict, and an additional `AlternateRootPhi` with the same
+ // incoming values gets generated by the findBasePointer function. We need
+ // to identify that the newly generated AlternateRootPhi (the .base version
+ // of the phi) and RootOfChain (the original phi node itself) are the same,
+ // so that we can rematerialize the gep and casts. This is a workaround for
+ // the deficiency in the findBasePointer algorithm.
+ if (!AreEquivalentPhiNodes(*OrigRootPhi, *AlternateRootPhi))
+ continue;
+ // Now that the phi nodes are proved to be the same, assert that
+ // findBasePointer's newly generated AlternateRootPhi is present in the
+ // liveset of the call.
+ assert(Info.LiveSet.count(AlternateRootPhi));
+ }
// Compute cost of this chain
unsigned Cost = chainToBasePointerCost(ChainToBase, TTI);
// TODO: We can also account for cases when we will be able to remove some
@@ -1906,7 +1998,8 @@ static void rematerializeLiveValues(CallSite CS,
// Utility function which clones all instructions from "ChainToBase"
// and inserts them before "InsertBefore". Returns rematerialized value
// which should be used after statepoint.
- auto rematerializeChain = [&ChainToBase](Instruction *InsertBefore) {
+ auto rematerializeChain = [&ChainToBase](
+ Instruction *InsertBefore, Value *RootOfChain, Value *AlternateLiveBase) {
Instruction *LastClonedValue = nullptr;
Instruction *LastValue = nullptr;
for (Instruction *Instr: ChainToBase) {
@@ -1926,14 +2019,24 @@ static void rematerializeLiveValues(CallSite CS,
assert(LastValue);
ClonedValue->replaceUsesOfWith(LastValue, LastClonedValue);
#ifndef NDEBUG
- // Assert that cloned instruction does not use any instructions from
- // this chain other than LastClonedValue
for (auto OpValue : ClonedValue->operand_values()) {
- assert(std::find(ChainToBase.begin(), ChainToBase.end(), OpValue) ==
- ChainToBase.end() &&
+ // Assert that cloned instruction does not use any instructions from
+ // this chain other than LastClonedValue
+ assert(!is_contained(ChainToBase, OpValue) &&
"incorrect use in rematerialization chain");
+ // Assert that the cloned instruction does not use the RootOfChain
+ // or the AlternateLiveBase.
+ assert(OpValue != RootOfChain && OpValue != AlternateLiveBase);
}
#endif
+ } else {
+ // For the first instruction, replace the use of unrelocated base i.e.
+ // RootOfChain/OrigRootPhi, with the corresponding PHI present in the
+ // live set. They have been proved to be the same PHI nodes. Note
+ // that the *only* use of the RootOfChain in the ChainToBase list is
+ // the first Value in the list.
+ if (RootOfChain != AlternateLiveBase)
+ ClonedValue->replaceUsesOfWith(RootOfChain, AlternateLiveBase);
}
LastClonedValue = ClonedValue;
@@ -1948,7 +2051,8 @@ static void rematerializeLiveValues(CallSite CS,
if (CS.isCall()) {
Instruction *InsertBefore = CS.getInstruction()->getNextNode();
assert(InsertBefore);
- Instruction *RematerializedValue = rematerializeChain(InsertBefore);
+ Instruction *RematerializedValue = rematerializeChain(
+ InsertBefore, RootOfChain, Info.PointerToBase[LiveValue]);
Info.RematerializedValues[RematerializedValue] = LiveValue;
} else {
InvokeInst *Invoke = cast<InvokeInst>(CS.getInstruction());
@@ -1958,10 +2062,10 @@ static void rematerializeLiveValues(CallSite CS,
Instruction *UnwindInsertBefore =
&*Invoke->getUnwindDest()->getFirstInsertionPt();
- Instruction *NormalRematerializedValue =
- rematerializeChain(NormalInsertBefore);
- Instruction *UnwindRematerializedValue =
- rematerializeChain(UnwindInsertBefore);
+ Instruction *NormalRematerializedValue = rematerializeChain(
+ NormalInsertBefore, RootOfChain, Info.PointerToBase[LiveValue]);
+ Instruction *UnwindRematerializedValue = rematerializeChain(
+ UnwindInsertBefore, RootOfChain, Info.PointerToBase[LiveValue]);
Info.RematerializedValues[NormalRematerializedValue] = LiveValue;
Info.RematerializedValues[UnwindRematerializedValue] = LiveValue;
@@ -2268,8 +2372,7 @@ static bool shouldRewriteStatepointsIn(Function &F) {
void RewriteStatepointsForGC::stripNonValidAttributes(Module &M) {
#ifndef NDEBUG
- assert(std::any_of(M.begin(), M.end(), shouldRewriteStatepointsIn) &&
- "precondition!");
+ assert(any_of(M, shouldRewriteStatepointsIn) && "precondition!");
#endif
for (Function &F : M)
@@ -2546,8 +2649,8 @@ static void findLiveSetAtInst(Instruction *Inst, GCPtrLivenessData &Data,
// call result is not live (normal), nor are it's arguments
// (unless they're used again later). This adjustment is
// specifically what we need to relocate
- BasicBlock::reverse_iterator rend(Inst->getIterator());
- computeLiveInValues(BB->rbegin(), rend, LiveOut);
+ computeLiveInValues(BB->rbegin(), ++Inst->getIterator().getReverse(),
+ LiveOut);
LiveOut.remove(Inst);
Out.insert(LiveOut.begin(), LiveOut.end());
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
index f74f28a..ede381c 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
@@ -242,7 +242,7 @@ public:
/// this method must be called.
void AddTrackedFunction(Function *F) {
// Add an entry, F -> undef.
- if (StructType *STy = dyn_cast<StructType>(F->getReturnType())) {
+ if (auto *STy = dyn_cast<StructType>(F->getReturnType())) {
MRVFunctionsTracked.insert(F);
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
TrackedMultipleRetVals.insert(std::make_pair(std::make_pair(F, i),
@@ -272,7 +272,7 @@ public:
std::vector<LatticeVal> getStructLatticeValueFor(Value *V) const {
std::vector<LatticeVal> StructValues;
- StructType *STy = dyn_cast<StructType>(V->getType());
+ auto *STy = dyn_cast<StructType>(V->getType());
assert(STy && "getStructLatticeValueFor() can be called only on structs");
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
auto I = StructValueState.find(std::make_pair(V, i));
@@ -300,23 +300,44 @@ public:
return TrackedGlobals;
}
+ /// getMRVFunctionsTracked - Get the set of functions which return multiple
+ /// values tracked by the pass.
+ const SmallPtrSet<Function *, 16> getMRVFunctionsTracked() {
+ return MRVFunctionsTracked;
+ }
+
void markOverdefined(Value *V) {
- assert(!V->getType()->isStructTy() && "Should use other method");
+ assert(!V->getType()->isStructTy() &&
+ "structs should use markAnythingOverdefined");
markOverdefined(ValueState[V], V);
}
/// markAnythingOverdefined - Mark the specified value overdefined. This
/// works with both scalars and structs.
void markAnythingOverdefined(Value *V) {
- if (StructType *STy = dyn_cast<StructType>(V->getType()))
+ if (auto *STy = dyn_cast<StructType>(V->getType()))
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
markOverdefined(getStructValueState(V, i), V);
else
markOverdefined(V);
}
+ // isStructLatticeConstant - Return true if all the lattice values
+ // corresponding to elements of the structure are not overdefined,
+ // false otherwise.
+ bool isStructLatticeConstant(Function *F, StructType *STy) {
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ const auto &It = TrackedMultipleRetVals.find(std::make_pair(F, i));
+ assert(It != TrackedMultipleRetVals.end());
+ LatticeVal LV = It->second;
+ if (LV.isOverdefined())
+ return false;
+ }
+ return true;
+ }
+
private:
- // pushToWorkList - Helper for markConstant/markForcedConstant
+ // pushToWorkList - Helper for markConstant/markForcedConstant/markOverdefined
void pushToWorkList(LatticeVal &IV, Value *V) {
if (IV.isOverdefined())
return OverdefinedInstWorkList.push_back(V);
@@ -334,12 +355,12 @@ private:
}
void markConstant(Value *V, Constant *C) {
- assert(!V->getType()->isStructTy() && "Should use other method");
+ assert(!V->getType()->isStructTy() && "structs should use mergeInValue");
markConstant(ValueState[V], V, C);
}
void markForcedConstant(Value *V, Constant *C) {
- assert(!V->getType()->isStructTy() && "Should use other method");
+ assert(!V->getType()->isStructTy() && "structs should use mergeInValue");
LatticeVal &IV = ValueState[V];
IV.markForcedConstant(C);
DEBUG(dbgs() << "markForcedConstant: " << *C << ": " << *V << '\n');
@@ -354,12 +375,12 @@ private:
if (!IV.markOverdefined()) return;
DEBUG(dbgs() << "markOverdefined: ";
- if (Function *F = dyn_cast<Function>(V))
+ if (auto *F = dyn_cast<Function>(V))
dbgs() << "Function '" << F->getName() << "'\n";
else
dbgs() << *V << '\n');
// Only instructions go on the work list
- OverdefinedInstWorkList.push_back(V);
+ pushToWorkList(IV, V);
}
void mergeInValue(LatticeVal &IV, Value *V, LatticeVal MergeWithV) {
@@ -374,7 +395,8 @@ private:
}
void mergeInValue(Value *V, LatticeVal MergeWithV) {
- assert(!V->getType()->isStructTy() && "Should use other method");
+ assert(!V->getType()->isStructTy() &&
+ "non-structs should use markConstant");
mergeInValue(ValueState[V], V, MergeWithV);
}
@@ -392,7 +414,7 @@ private:
if (!I.second)
return LV; // Common case, already in the map.
- if (Constant *C = dyn_cast<Constant>(V)) {
+ if (auto *C = dyn_cast<Constant>(V)) {
// Undef values remain unknown.
if (!isa<UndefValue>(V))
LV.markConstant(C); // Constants are constant
@@ -418,7 +440,7 @@ private:
if (!I.second)
return LV; // Common case, already in the map.
- if (Constant *C = dyn_cast<Constant>(V)) {
+ if (auto *C = dyn_cast<Constant>(V)) {
Constant *Elt = C->getAggregateElement(i);
if (!Elt)
@@ -489,9 +511,6 @@ private:
void visitSelectInst(SelectInst &I);
void visitBinaryOperator(Instruction &I);
void visitCmpInst(CmpInst &I);
- void visitExtractElementInst(ExtractElementInst &I);
- void visitInsertElementInst(InsertElementInst &I);
- void visitShuffleVectorInst(ShuffleVectorInst &I);
void visitExtractValueInst(ExtractValueInst &EVI);
void visitInsertValueInst(InsertValueInst &IVI);
void visitLandingPadInst(LandingPadInst &I) { markAnythingOverdefined(&I); }
@@ -527,7 +546,7 @@ private:
void visitInstruction(Instruction &I) {
// If a new instruction is added to LLVM that we don't handle.
- dbgs() << "SCCP: Don't know how to handle: " << I << '\n';
+ DEBUG(dbgs() << "SCCP: Don't know how to handle: " << I << '\n');
markAnythingOverdefined(&I); // Just in case
}
};
@@ -541,7 +560,7 @@ private:
void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI,
SmallVectorImpl<bool> &Succs) {
Succs.resize(TI.getNumSuccessors());
- if (BranchInst *BI = dyn_cast<BranchInst>(&TI)) {
+ if (auto *BI = dyn_cast<BranchInst>(&TI)) {
if (BI->isUnconditional()) {
Succs[0] = true;
return;
@@ -568,7 +587,7 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI,
return;
}
- if (SwitchInst *SI = dyn_cast<SwitchInst>(&TI)) {
+ if (auto *SI = dyn_cast<SwitchInst>(&TI)) {
if (!SI->getNumCases()) {
Succs[0] = true;
return;
@@ -594,9 +613,7 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI,
return;
}
-#ifndef NDEBUG
- dbgs() << "Unknown terminator instruction: " << TI << '\n';
-#endif
+ DEBUG(dbgs() << "Unknown terminator instruction: " << TI << '\n');
llvm_unreachable("SCCP: Don't know how to handle this terminator!");
}
@@ -612,7 +629,7 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {
// Check to make sure this edge itself is actually feasible now.
TerminatorInst *TI = From->getTerminator();
- if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ if (auto *BI = dyn_cast<BranchInst>(TI)) {
if (BI->isUnconditional())
return true;
@@ -632,7 +649,7 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {
if (TI->isExceptional())
return true;
- if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ if (auto *SI = dyn_cast<SwitchInst>(TI)) {
if (SI->getNumCases() < 1)
return true;
@@ -650,9 +667,7 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {
if (isa<IndirectBrInst>(TI))
return true;
-#ifndef NDEBUG
- dbgs() << "Unknown terminator instruction: " << *TI << '\n';
-#endif
+ DEBUG(dbgs() << "Unknown terminator instruction: " << *TI << '\n');
llvm_unreachable("SCCP: Don't know how to handle this terminator!");
}
@@ -747,7 +762,7 @@ void SCCPSolver::visitReturnInst(ReturnInst &I) {
// Handle functions that return multiple values.
if (!TrackedMultipleRetVals.empty()) {
- if (StructType *STy = dyn_cast<StructType>(ResultOp->getType()))
+ if (auto *STy = dyn_cast<StructType>(ResultOp->getType()))
if (MRVFunctionsTracked.count(F))
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
mergeInValue(TrackedMultipleRetVals[std::make_pair(F, i)], F,
@@ -806,7 +821,7 @@ void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
}
void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) {
- StructType *STy = dyn_cast<StructType>(IVI.getType());
+ auto *STy = dyn_cast<StructType>(IVI.getType());
if (!STy)
return markOverdefined(&IVI);
@@ -898,7 +913,8 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) {
// If this is an AND or OR with 0 or -1, it doesn't matter that the other
// operand is overdefined.
- if (I.getOpcode() == Instruction::And || I.getOpcode() == Instruction::Or) {
+ if (I.getOpcode() == Instruction::And || I.getOpcode() == Instruction::Mul ||
+ I.getOpcode() == Instruction::Or) {
LatticeVal *NonOverdefVal = nullptr;
if (!V1State.isOverdefined())
NonOverdefVal = &V1State;
@@ -906,25 +922,19 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) {
NonOverdefVal = &V2State;
if (NonOverdefVal) {
- if (NonOverdefVal->isUnknown()) {
- // Could annihilate value.
- if (I.getOpcode() == Instruction::And)
- markConstant(IV, &I, Constant::getNullValue(I.getType()));
- else if (VectorType *PT = dyn_cast<VectorType>(I.getType()))
- markConstant(IV, &I, Constant::getAllOnesValue(PT));
- else
- markConstant(IV, &I,
- Constant::getAllOnesValue(I.getType()));
+ if (NonOverdefVal->isUnknown())
return;
- }
- if (I.getOpcode() == Instruction::And) {
+ if (I.getOpcode() == Instruction::And ||
+ I.getOpcode() == Instruction::Mul) {
// X and 0 = 0
+ // X * 0 = 0
if (NonOverdefVal->getConstant()->isNullValue())
return markConstant(IV, &I, NonOverdefVal->getConstant());
} else {
+ // X or -1 = -1
if (ConstantInt *CI = NonOverdefVal->getConstantInt())
- if (CI->isAllOnesValue()) // X or -1 = -1
+ if (CI->isAllOnesValue())
return markConstant(IV, &I, NonOverdefVal->getConstant());
}
}
@@ -957,21 +967,6 @@ void SCCPSolver::visitCmpInst(CmpInst &I) {
markOverdefined(&I);
}
-void SCCPSolver::visitExtractElementInst(ExtractElementInst &I) {
- // TODO : SCCP does not handle vectors properly.
- return markOverdefined(&I);
-}
-
-void SCCPSolver::visitInsertElementInst(InsertElementInst &I) {
- // TODO : SCCP does not handle vectors properly.
- return markOverdefined(&I);
-}
-
-void SCCPSolver::visitShuffleVectorInst(ShuffleVectorInst &I) {
- // TODO : SCCP does not handle vectors properly.
- return markOverdefined(&I);
-}
-
// Handle getelementptr instructions. If all operands are constants then we
// can turn this into a getelementptr ConstantExpr.
//
@@ -1044,7 +1039,7 @@ void SCCPSolver::visitLoadInst(LoadInst &I) {
return;
// Transform load (constant global) into the value loaded.
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) {
+ if (auto *GV = dyn_cast<GlobalVariable>(Ptr)) {
if (!TrackedGlobals.empty()) {
// If we are tracking this global, merge in the known value for it.
DenseMap<GlobalVariable*, LatticeVal>::iterator It =
@@ -1132,7 +1127,7 @@ CallOverdefined:
continue;
}
- if (StructType *STy = dyn_cast<StructType>(AI->getType())) {
+ if (auto *STy = dyn_cast<StructType>(AI->getType())) {
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
LatticeVal CallArg = getStructValueState(*CAI, i);
mergeInValue(getStructValueState(&*AI, i), &*AI, CallArg);
@@ -1144,7 +1139,7 @@ CallOverdefined:
}
// If this is a single/zero retval case, see if we're tracking the function.
- if (StructType *STy = dyn_cast<StructType>(F->getReturnType())) {
+ if (auto *STy = dyn_cast<StructType>(F->getReturnType())) {
if (!MRVFunctionsTracked.count(F))
goto CallOverdefined; // Not tracking this callee.
@@ -1182,7 +1177,7 @@ void SCCPSolver::Solve() {
// Update all of the users of this instruction's value.
//
for (User *U : I->users())
- if (Instruction *UI = dyn_cast<Instruction>(U))
+ if (auto *UI = dyn_cast<Instruction>(U))
OperandChangedState(UI);
}
@@ -1201,7 +1196,7 @@ void SCCPSolver::Solve() {
//
if (I->getType()->isStructTy() || !getValueState(I).isOverdefined())
for (User *U : I->users())
- if (Instruction *UI = dyn_cast<Instruction>(U))
+ if (auto *UI = dyn_cast<Instruction>(U))
OperandChangedState(UI);
}
@@ -1246,7 +1241,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
// Look for instructions which produce undef values.
if (I.getType()->isVoidTy()) continue;
- if (StructType *STy = dyn_cast<StructType>(I.getType())) {
+ if (auto *STy = dyn_cast<StructType>(I.getType())) {
// Only a few things that can be structs matter for undef.
// Tracked calls must never be marked overdefined in ResolvedUndefsIn.
@@ -1386,8 +1381,8 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
break;
}
- // undef >>a X -> all ones
- markForcedConstant(&I, Constant::getAllOnesValue(ITy));
+ // undef >>a X -> 0
+ markForcedConstant(&I, Constant::getNullValue(ITy));
return true;
case Instruction::LShr:
case Instruction::Shl:
@@ -1467,7 +1462,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
// we force the branch to go one way or the other to make the successor
// values live. It doesn't really matter which way we force it.
TerminatorInst *TI = BB.getTerminator();
- if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ if (auto *BI = dyn_cast<BranchInst>(TI)) {
if (!BI->isConditional()) continue;
if (!getValueState(BI->getCondition()).isUnknown())
continue;
@@ -1488,7 +1483,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
return true;
}
- if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ if (auto *SI = dyn_cast<SwitchInst>(TI)) {
if (!SI->getNumCases() || !getValueState(SI->getCondition()).isUnknown())
continue;
@@ -1512,11 +1507,10 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
Constant *Const = nullptr;
if (V->getType()->isStructTy()) {
std::vector<LatticeVal> IVs = Solver.getStructLatticeValueFor(V);
- if (std::any_of(IVs.begin(), IVs.end(),
- [](LatticeVal &LV) { return LV.isOverdefined(); }))
+ if (any_of(IVs, [](const LatticeVal &LV) { return LV.isOverdefined(); }))
return false;
std::vector<Constant *> ConstVals;
- StructType *ST = dyn_cast<StructType>(V->getType());
+ auto *ST = dyn_cast<StructType>(V->getType());
for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
LatticeVal V = IVs[i];
ConstVals.push_back(V.isConstant()
@@ -1599,7 +1593,7 @@ static bool runSCCP(Function &F, const DataLayout &DL,
return MadeChanges;
}
-PreservedAnalyses SCCPPass::run(Function &F, AnalysisManager<Function> &AM) {
+PreservedAnalyses SCCPPass::run(Function &F, FunctionAnalysisManager &AM) {
const DataLayout &DL = F.getParent()->getDataLayout();
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
if (!runSCCP(F, DL, &TLI))
@@ -1657,7 +1651,7 @@ static bool AddressIsTaken(const GlobalValue *GV) {
for (const Use &U : GV->uses()) {
const User *UR = U.getUser();
- if (const StoreInst *SI = dyn_cast<StoreInst>(UR)) {
+ if (const auto *SI = dyn_cast<StoreInst>(UR)) {
if (SI->getOperand(0) == GV || SI->isVolatile())
return true; // Storing addr of GV.
} else if (isa<InvokeInst>(UR) || isa<CallInst>(UR)) {
@@ -1665,7 +1659,7 @@ static bool AddressIsTaken(const GlobalValue *GV) {
ImmutableCallSite CS(cast<Instruction>(UR));
if (!CS.isCallee(&U))
return true;
- } else if (const LoadInst *LI = dyn_cast<LoadInst>(UR)) {
+ } else if (const auto *LI = dyn_cast<LoadInst>(UR)) {
if (LI->isVolatile())
return true;
} else if (isa<BlockAddress>(UR)) {
@@ -1678,6 +1672,19 @@ static bool AddressIsTaken(const GlobalValue *GV) {
return false;
}
+static void findReturnsToZap(Function &F,
+ SmallPtrSet<Function *, 32> &AddressTakenFunctions,
+ SmallVector<ReturnInst *, 8> &ReturnsToZap) {
+ // We can only do this if we know that nothing else can call the function.
+ if (!F.hasLocalLinkage() || AddressTakenFunctions.count(&F))
+ return;
+
+ for (BasicBlock &BB : F)
+ if (auto *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
+ if (!isa<UndefValue>(RI->getOperand(0)))
+ ReturnsToZap.push_back(RI);
+}
+
static bool runIPSCCP(Module &M, const DataLayout &DL,
const TargetLibraryInfo *TLI) {
SCCPSolver Solver(DL, TLI);
@@ -1698,7 +1705,10 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
// If this is an exact definition of this function, then we can propagate
// information about its result into callsites of it.
- if (F.hasExactDefinition())
+ // Don't touch naked functions. They may contain asm returning a
+ // value we don't see, so we may end up interprocedurally propagating
+ // the return value incorrectly.
+ if (F.hasExactDefinition() && !F.hasFnAttribute(Attribute::Naked))
Solver.AddTrackedFunction(&F);
// If this function only has direct calls that we can see, we can track its
@@ -1800,7 +1810,7 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
UI != UE;) {
// Grab the user and then increment the iterator early, as the user
// will be deleted. Step past all adjacent uses from the same user.
- Instruction *I = dyn_cast<Instruction>(*UI);
+ auto *I = dyn_cast<Instruction>(*UI);
do { ++UI; } while (UI != UE && *UI == I);
// Ignore blockaddress users; BasicBlock's dtor will handle them.
@@ -1812,10 +1822,10 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
// if this is a branch or switch on undef. Fold it manually as a
// branch to the first successor.
#ifndef NDEBUG
- if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
+ if (auto *BI = dyn_cast<BranchInst>(I)) {
assert(BI->isConditional() && isa<UndefValue>(BI->getCondition()) &&
"Branch should be foldable!");
- } else if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+ } else if (auto *SI = dyn_cast<SwitchInst>(I)) {
assert(isa<UndefValue>(SI->getCondition()) && "Switch should fold");
} else {
llvm_unreachable("Didn't fold away reference to block!");
@@ -1853,21 +1863,20 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
// whether other functions are optimizable.
SmallVector<ReturnInst*, 8> ReturnsToZap;
- // TODO: Process multiple value ret instructions also.
const DenseMap<Function*, LatticeVal> &RV = Solver.getTrackedRetVals();
for (const auto &I : RV) {
Function *F = I.first;
if (I.second.isOverdefined() || F->getReturnType()->isVoidTy())
continue;
+ findReturnsToZap(*F, AddressTakenFunctions, ReturnsToZap);
+ }
- // We can only do this if we know that nothing else can call the function.
- if (!F->hasLocalLinkage() || AddressTakenFunctions.count(F))
- continue;
-
- for (BasicBlock &BB : *F)
- if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
- if (!isa<UndefValue>(RI->getOperand(0)))
- ReturnsToZap.push_back(RI);
+ for (const auto &F : Solver.getMRVFunctionsTracked()) {
+ assert(F->getReturnType()->isStructTy() &&
+ "The return type should be a struct");
+ StructType *STy = cast<StructType>(F->getReturnType());
+ if (Solver.isStructLatticeConstant(F, STy))
+ findReturnsToZap(*F, AddressTakenFunctions, ReturnsToZap);
}
// Zap all returns which we've identified as zap to change.
@@ -1896,7 +1905,7 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
return MadeChanges;
}
-PreservedAnalyses IPSCCPPass::run(Module &M, AnalysisManager<Module> &AM) {
+PreservedAnalyses IPSCCPPass::run(Module &M, ModuleAnalysisManager &AM) {
const DataLayout &DL = M.getDataLayout();
auto &TLI = AM.getResult<TargetLibraryAnalysis>(M);
if (!runIPSCCP(M, DL, &TLI))
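
The SCCP hunks above extend the And/Or annihilator shortcut to Mul: a binary operator can be folded as soon as one operand is a known annihilating constant, even if the other operand is overdefined. A self-contained toy sketch of that rule, using a hypothetical simplified Lattice type rather than LLVM's LatticeVal:

#include <cassert>
#include <optional>

// Hypothetical, simplified lattice value: unknown, a known constant, or
// overdefined (conflicting information).
struct Lattice {
  enum State { Unknown, Constant, Overdefined } St = Unknown;
  long long Val = 0;
};

// Mirrors the rule in the hunk: for And/Mul a constant 0 operand forces the
// result to 0, for Or a constant -1 operand forces -1, regardless of the
// other operand's state.
std::optional<long long> foldWithAnnihilator(char Op, const Lattice &A,
                                             const Lattice &B) {
  const Lattice *C = A.St == Lattice::Constant ? &A
                   : B.St == Lattice::Constant ? &B
                   : nullptr;
  if (!C)
    return std::nullopt;
  if ((Op == '&' || Op == '*') && C->Val == 0)
    return 0;   // X & 0 = 0, X * 0 = 0
  if (Op == '|' && C->Val == -1)
    return -1;  // X | -1 = -1
  return std::nullopt;
}

int main() {
  Lattice Overdef{Lattice::Overdefined, 0}, Zero{Lattice::Constant, 0};
  assert(foldWithAnnihilator('*', Overdef, Zero) == 0);
  assert(!foldWithAnnihilator('*', Overdef, Lattice{Lattice::Overdefined, 0}));
}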
diff --git a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
index 4ce552f..bfcb155 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -44,12 +44,12 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/Pass.h"
+#include "llvm/Support/Chrono.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/TimeValue.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -432,19 +432,18 @@ class AllocaSlices::partition_iterator
// cannot change the max split slice end because we just checked that
// the prior partition ended prior to that max.
P.SplitTails.erase(
- std::remove_if(
- P.SplitTails.begin(), P.SplitTails.end(),
- [&](Slice *S) { return S->endOffset() <= P.EndOffset; }),
+ remove_if(P.SplitTails,
+ [&](Slice *S) { return S->endOffset() <= P.EndOffset; }),
P.SplitTails.end());
- assert(std::any_of(P.SplitTails.begin(), P.SplitTails.end(),
- [&](Slice *S) {
- return S->endOffset() == MaxSplitSliceEndOffset;
- }) &&
+ assert(any_of(P.SplitTails,
+ [&](Slice *S) {
+ return S->endOffset() == MaxSplitSliceEndOffset;
+ }) &&
"Could not find the current max split slice offset!");
- assert(std::all_of(P.SplitTails.begin(), P.SplitTails.end(),
- [&](Slice *S) {
- return S->endOffset() <= MaxSplitSliceEndOffset;
- }) &&
+ assert(all_of(P.SplitTails,
+ [&](Slice *S) {
+ return S->endOffset() <= MaxSplitSliceEndOffset;
+ }) &&
"Max split slice end offset is not actually the max!");
}
}
@@ -693,7 +692,7 @@ private:
break;
// Handle a struct index, which adds its field offset to the pointer.
- if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+ if (StructType *STy = GTI.getStructTypeOrNull()) {
unsigned ElementIdx = OpC->getZExtValue();
const StructLayout *SL = DL.getStructLayout(STy);
GEPOffset +=
@@ -996,15 +995,13 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
return;
}
- Slices.erase(std::remove_if(Slices.begin(), Slices.end(),
- [](const Slice &S) {
- return S.isDead();
- }),
+ Slices.erase(remove_if(Slices, [](const Slice &S) { return S.isDead(); }),
Slices.end());
#ifndef NDEBUG
if (SROARandomShuffleSlices) {
- std::mt19937 MT(static_cast<unsigned>(sys::TimeValue::now().msec()));
+ std::mt19937 MT(static_cast<unsigned>(
+ std::chrono::system_clock::now().time_since_epoch().count()));
std::shuffle(Slices.begin(), Slices.end(), MT);
}
#endif
@@ -1815,10 +1812,10 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
// do that until all the backends are known to produce good code for all
// integer vector types.
if (!HaveCommonEltTy) {
- CandidateTys.erase(std::remove_if(CandidateTys.begin(), CandidateTys.end(),
- [](VectorType *VTy) {
- return !VTy->getElementType()->isIntegerTy();
- }),
+ CandidateTys.erase(remove_if(CandidateTys,
+ [](VectorType *VTy) {
+ return !VTy->getElementType()->isIntegerTy();
+ }),
CandidateTys.end());
// If there were no integer vector types, give up.
@@ -2486,8 +2483,8 @@ private:
}
V = convertValue(DL, IRB, V, NewAllocaTy);
StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
+ Store->copyMetadata(SI, LLVMContext::MD_mem_parallel_loop_access);
Pass.DeadInsts.insert(&SI);
- (void)Store;
DEBUG(dbgs() << " to: " << *Store << "\n");
return true;
}
@@ -2549,6 +2546,7 @@ private:
NewSI = IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(V->getType()),
SI.isVolatile());
}
+ NewSI->copyMetadata(SI, LLVMContext::MD_mem_parallel_loop_access);
if (SI.isVolatile())
NewSI->setAtomic(SI.getOrdering(), SI.getSynchScope());
Pass.DeadInsts.insert(&SI);
@@ -2878,6 +2876,17 @@ private:
// Record this instruction for deletion.
Pass.DeadInsts.insert(&II);
+ // Lifetime intrinsics are only promotable if they cover the whole alloca.
+ // Therefore, we drop lifetime intrinsics which don't cover the whole
+ // alloca.
+ // (In theory, intrinsics which partially cover an alloca could be
+ // promoted, but PromoteMemToReg doesn't handle that case.)
+ // FIXME: Check whether the alloca is promotable before dropping the
+ // lifetime intrinsics?
+ if (NewBeginOffset != NewAllocaBeginOffset ||
+ NewEndOffset != NewAllocaEndOffset)
+ return true;
+
ConstantInt *Size =
ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
NewEndOffset - NewBeginOffset);
@@ -2890,6 +2899,7 @@ private:
(void)New;
DEBUG(dbgs() << " to: " << *New << "\n");
+
return true;
}
@@ -3209,20 +3219,11 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
return nullptr;
if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) {
- // We can't partition pointers...
- if (SeqTy->isPointerTy())
- return nullptr;
-
Type *ElementTy = SeqTy->getElementType();
uint64_t ElementSize = DL.getTypeAllocSize(ElementTy);
uint64_t NumSkippedElements = Offset / ElementSize;
- if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy)) {
- if (NumSkippedElements >= ArrTy->getNumElements())
- return nullptr;
- } else if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy)) {
- if (NumSkippedElements >= VecTy->getNumElements())
- return nullptr;
- }
+ if (NumSkippedElements >= SeqTy->getNumElements())
+ return nullptr;
Offset -= NumSkippedElements * ElementSize;
// First check if we need to recurse.
@@ -3456,63 +3457,60 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
// match relative to their starting offset. We have to verify this prior to
// any rewriting.
Stores.erase(
- std::remove_if(Stores.begin(), Stores.end(),
- [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
- // Lookup the load we are storing in our map of split
- // offsets.
- auto *LI = cast<LoadInst>(SI->getValueOperand());
- // If it was completely unsplittable, then we're done,
- // and this store can't be pre-split.
- if (UnsplittableLoads.count(LI))
- return true;
-
- auto LoadOffsetsI = SplitOffsetsMap.find(LI);
- if (LoadOffsetsI == SplitOffsetsMap.end())
- return false; // Unrelated loads are definitely safe.
- auto &LoadOffsets = LoadOffsetsI->second;
-
- // Now lookup the store's offsets.
- auto &StoreOffsets = SplitOffsetsMap[SI];
-
- // If the relative offsets of each split in the load and
- // store match exactly, then we can split them and we
- // don't need to remove them here.
- if (LoadOffsets.Splits == StoreOffsets.Splits)
- return false;
-
- DEBUG(dbgs()
- << " Mismatched splits for load and store:\n"
- << " " << *LI << "\n"
- << " " << *SI << "\n");
-
- // We've found a store and load that we need to split
- // with mismatched relative splits. Just give up on them
- // and remove both instructions from our list of
- // candidates.
- UnsplittableLoads.insert(LI);
- return true;
- }),
+ remove_if(Stores,
+ [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
+ // Lookup the load we are storing in our map of split
+ // offsets.
+ auto *LI = cast<LoadInst>(SI->getValueOperand());
+ // If it was completely unsplittable, then we're done,
+ // and this store can't be pre-split.
+ if (UnsplittableLoads.count(LI))
+ return true;
+
+ auto LoadOffsetsI = SplitOffsetsMap.find(LI);
+ if (LoadOffsetsI == SplitOffsetsMap.end())
+ return false; // Unrelated loads are definitely safe.
+ auto &LoadOffsets = LoadOffsetsI->second;
+
+ // Now lookup the store's offsets.
+ auto &StoreOffsets = SplitOffsetsMap[SI];
+
+ // If the relative offsets of each split in the load and
+ // store match exactly, then we can split them and we
+ // don't need to remove them here.
+ if (LoadOffsets.Splits == StoreOffsets.Splits)
+ return false;
+
+ DEBUG(dbgs() << " Mismatched splits for load and store:\n"
+ << " " << *LI << "\n"
+ << " " << *SI << "\n");
+
+ // We've found a store and load that we need to split
+ // with mismatched relative splits. Just give up on them
+ // and remove both instructions from our list of
+ // candidates.
+ UnsplittableLoads.insert(LI);
+ return true;
+ }),
Stores.end());
// Now we have to go *back* through all the stores, because a later store may
// have caused an earlier store's load to become unsplittable and if it is
// unsplittable for the later store, then we can't rely on it being split in
// the earlier store either.
- Stores.erase(std::remove_if(Stores.begin(), Stores.end(),
- [&UnsplittableLoads](StoreInst *SI) {
- auto *LI =
- cast<LoadInst>(SI->getValueOperand());
- return UnsplittableLoads.count(LI);
- }),
+ Stores.erase(remove_if(Stores,
+ [&UnsplittableLoads](StoreInst *SI) {
+ auto *LI = cast<LoadInst>(SI->getValueOperand());
+ return UnsplittableLoads.count(LI);
+ }),
Stores.end());
// Once we've established all the loads that can't be split for some reason,
// filter any that made it into our list out.
- Loads.erase(std::remove_if(Loads.begin(), Loads.end(),
- [&UnsplittableLoads](LoadInst *LI) {
- return UnsplittableLoads.count(LI);
- }),
+ Loads.erase(remove_if(Loads,
+ [&UnsplittableLoads](LoadInst *LI) {
+ return UnsplittableLoads.count(LI);
+ }),
Loads.end());
-
// If no loads or stores are left, there is no pre-splitting to be done for
// this alloca.
if (Loads.empty() && Stores.empty())
@@ -3570,6 +3568,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
PartPtrTy, BasePtr->getName() + "."),
getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
LI->getName());
+ PLoad->copyMetadata(*LI, LLVMContext::MD_mem_parallel_loop_access);
// Append this load onto the list of split loads so we can find it later
// to rewrite the stores.
@@ -3622,7 +3621,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
APInt(DL.getPointerSizeInBits(), PartOffset),
PartPtrTy, StoreBasePtr->getName() + "."),
getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
- (void)PStore;
+ PStore->copyMetadata(*LI, LLVMContext::MD_mem_parallel_loop_access);
DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
}
@@ -3770,9 +3769,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
}
// Remove the killed slices that have been pre-split.
- AS.erase(std::remove_if(AS.begin(), AS.end(), [](const Slice &S) {
- return S.isDead();
- }), AS.end());
+ AS.erase(remove_if(AS, [](const Slice &S) { return S.isDead(); }), AS.end());
// Insert our new slices. This will sort and merge them into the sorted
// sequence.
@@ -3787,8 +3784,8 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
// Finally, don't try to promote any allocas that now require re-splitting.
// They have already been added to the worklist above.
PromotableAllocas.erase(
- std::remove_if(
- PromotableAllocas.begin(), PromotableAllocas.end(),
+ remove_if(
+ PromotableAllocas,
[&](AllocaInst *AI) { return ResplitPromotableAllocas.count(AI); }),
PromotableAllocas.end());
@@ -3985,16 +3982,16 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
if (!IsSorted)
std::sort(AS.begin(), AS.end());
- /// \brief Describes the allocas introduced by rewritePartition
- /// in order to migrate the debug info.
- struct Piece {
+ /// Describes the allocas introduced by rewritePartition in order to migrate
+ /// the debug info.
+ struct Fragment {
AllocaInst *Alloca;
uint64_t Offset;
uint64_t Size;
- Piece(AllocaInst *AI, uint64_t O, uint64_t S)
+ Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
: Alloca(AI), Offset(O), Size(S) {}
};
- SmallVector<Piece, 4> Pieces;
+ SmallVector<Fragment, 4> Fragments;
// Rewrite each partition.
for (auto &P : AS.partitions()) {
@@ -4005,7 +4002,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
uint64_t AllocaSize = DL.getTypeSizeInBits(NewAI->getAllocatedType());
// Don't include any padding.
uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
- Pieces.push_back(Piece(NewAI, P.beginOffset() * SizeOfByte, Size));
+ Fragments.push_back(Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
}
}
++NumPartitions;
@@ -4022,32 +4019,34 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
auto *Expr = DbgDecl->getExpression();
DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
uint64_t AllocaSize = DL.getTypeSizeInBits(AI.getAllocatedType());
- for (auto Piece : Pieces) {
- // Create a piece expression describing the new partition or reuse AI's
+ for (auto Fragment : Fragments) {
+ // Create a fragment expression describing the new partition or reuse AI's
// expression if there is only one partition.
- auto *PieceExpr = Expr;
- if (Piece.Size < AllocaSize || Expr->isBitPiece()) {
+ auto *FragmentExpr = Expr;
+ if (Fragment.Size < AllocaSize || Expr->isFragment()) {
// If this alloca is already a scalar replacement of a larger aggregate,
- // Piece.Offset describes the offset inside the scalar.
- uint64_t Offset = Expr->isBitPiece() ? Expr->getBitPieceOffset() : 0;
- uint64_t Start = Offset + Piece.Offset;
- uint64_t Size = Piece.Size;
- if (Expr->isBitPiece()) {
- uint64_t AbsEnd = Expr->getBitPieceOffset() + Expr->getBitPieceSize();
+ // Fragment.Offset describes the offset inside the scalar.
+ auto ExprFragment = Expr->getFragmentInfo();
+ uint64_t Offset = ExprFragment ? ExprFragment->OffsetInBits : 0;
+ uint64_t Start = Offset + Fragment.Offset;
+ uint64_t Size = Fragment.Size;
+ if (ExprFragment) {
+ uint64_t AbsEnd =
+ ExprFragment->OffsetInBits + ExprFragment->SizeInBits;
if (Start >= AbsEnd)
// No need to describe a SROAed padding.
continue;
Size = std::min(Size, AbsEnd - Start);
}
- PieceExpr = DIB.createBitPieceExpression(Start, Size);
+ FragmentExpr = DIB.createFragmentExpression(Start, Size);
}
// Remove any existing dbg.declare intrinsic describing the same alloca.
- if (DbgDeclareInst *OldDDI = FindAllocaDbgDeclare(Piece.Alloca))
+ if (DbgDeclareInst *OldDDI = FindAllocaDbgDeclare(Fragment.Alloca))
OldDDI->eraseFromParent();
- DIB.insertDeclare(Piece.Alloca, Var, PieceExpr, DbgDecl->getDebugLoc(),
- &AI);
+ DIB.insertDeclare(Fragment.Alloca, Var, FragmentExpr,
+ DbgDecl->getDebugLoc(), &AI);
}
}
return Changed;
@@ -4220,9 +4219,7 @@ PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); };
Worklist.remove_if(IsInSet);
PostPromotionWorklist.remove_if(IsInSet);
- PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(),
- PromotableAllocas.end(),
- IsInSet),
+ PromotableAllocas.erase(remove_if(PromotableAllocas, IsInSet),
PromotableAllocas.end());
DeletedAllocas.clear();
}
@@ -4244,7 +4241,7 @@ PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
return PA;
}
-PreservedAnalyses SROA::run(Function &F, AnalysisManager<Function> &AM) {
+PreservedAnalyses SROA::run(Function &F, FunctionAnalysisManager &AM) {
return runImpl(F, AM.getResult<DominatorTreeAnalysis>(F),
AM.getResult<AssumptionAnalysis>(F));
}
@@ -4277,7 +4274,7 @@ public:
AU.setPreservesCFG();
}
- const char *getPassName() const override { return "SROA"; }
+ StringRef getPassName() const override { return "SROA"; }
static char ID;
};
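
A recurring mechanical change in the SROA diff above is replacing std::remove_if(C.begin(), C.end(), Pred) with the range form remove_if(C, Pred). A minimal stand-alone sketch of such a range helper (the real one lives in llvm/ADT/STLExtras.h; this is only an illustration of the idiom):

#include <algorithm>
#include <cassert>
#include <iterator>
#include <vector>

// Hypothetical stand-in for llvm::remove_if: forwards the whole range so
// call sites drop the begin()/end() boilerplate, as in the SROA hunks above.
template <typename R, typename Pred>
auto remove_if(R &&Range, Pred P) {
  return std::remove_if(std::begin(Range), std::end(Range), P);
}

int main() {
  std::vector<int> Slices = {1, -2, 3, -4};
  // Same erase-remove idiom as Slices.erase(remove_if(Slices, ...), end()).
  Slices.erase(remove_if(Slices, [](int S) { return S < 0; }), Slices.end());
  assert((Slices == std::vector<int>{1, 3}));
}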
diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
index f235b12..afe7483 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -43,14 +43,17 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeDSELegacyPassPass(Registry);
initializeGuardWideningLegacyPassPass(Registry);
initializeGVNLegacyPassPass(Registry);
+ initializeNewGVNPass(Registry);
initializeEarlyCSELegacyPassPass(Registry);
+ initializeEarlyCSEMemSSALegacyPassPass(Registry);
initializeGVNHoistLegacyPassPass(Registry);
initializeFlattenCFGPassPass(Registry);
initializeInductiveRangeCheckEliminationPass(Registry);
initializeIndVarSimplifyLegacyPassPass(Registry);
initializeJumpThreadingPass(Registry);
initializeLegacyLICMPassPass(Registry);
- initializeLoopDataPrefetchPass(Registry);
+ initializeLegacyLoopSinkPassPass(Registry);
+ initializeLoopDataPrefetchLegacyPassPass(Registry);
initializeLoopDeletionLegacyPassPass(Registry);
initializeLoopAccessLegacyAnalysisPass(Registry);
initializeLoopInstSimplifyLegacyPassPass(Registry);
@@ -64,10 +67,10 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeLoopIdiomRecognizeLegacyPassPass(Registry);
initializeLowerAtomicLegacyPassPass(Registry);
initializeLowerExpectIntrinsicPass(Registry);
- initializeLowerGuardIntrinsicPass(Registry);
+ initializeLowerGuardIntrinsicLegacyPassPass(Registry);
initializeMemCpyOptLegacyPassPass(Registry);
initializeMergedLoadStoreMotionLegacyPassPass(Registry);
- initializeNaryReassociatePass(Registry);
+ initializeNaryReassociateLegacyPassPass(Registry);
initializePartiallyInlineLibCallsLegacyPassPass(Registry);
initializeReassociateLegacyPassPass(Registry);
initializeRegToMemPass(Registry);
@@ -80,7 +83,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeSinkingLegacyPassPass(Registry);
initializeTailCallElimPass(Registry);
initializeSeparateConstOffsetFromGEPPass(Registry);
- initializeSpeculativeExecutionPass(Registry);
+ initializeSpeculativeExecutionLegacyPassPass(Registry);
initializeStraightLineStrengthReducePass(Registry);
initializeLoadCombinePass(Registry);
initializePlaceBackedgeSafepointsImplPass(Registry);
@@ -124,6 +127,10 @@ void LLVMAddGVNPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createGVNPass());
}
+void LLVMAddNewGVNPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createNewGVNPass());
+}
+
void LLVMAddMergedLoadStoreMotionPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createMergedLoadStoreMotionPass());
}
@@ -140,6 +147,10 @@ void LLVMAddJumpThreadingPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createJumpThreadingPass());
}
+void LLVMAddLoopSinkPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopSinkPass());
+}
+
void LLVMAddLICMPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createLICMPass());
}
@@ -234,7 +245,11 @@ void LLVMAddCorrelatedValuePropagationPass(LLVMPassManagerRef PM) {
}
void LLVMAddEarlyCSEPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createEarlyCSEPass());
+ unwrap(PM)->add(createEarlyCSEPass(false/*=UseMemorySSA*/));
+}
+
+void LLVMAddEarlyCSEMemSSAPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createEarlyCSEPass(true/*=UseMemorySSA*/));
}
void LLVMAddGVNHoistLegacyPass(LLVMPassManagerRef PM) {
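
The Scalar.cpp hunks above add C-API entry points for the new passes (LLVMAddNewGVNPass, LLVMAddEarlyCSEMemSSAPass, LLVMAddLoopSinkPass). A hedged usage sketch, assuming the matching declarations were added to llvm-c/Transforms/Scalar.h in the same change and an LLVM build that carries this patch:

#include "llvm-c/Core.h"
#include "llvm-c/Transforms/Scalar.h"

// Build a legacy pass pipeline with the newly exposed scalar passes and run
// it over a module.
void runNewScalarPasses(LLVMModuleRef M) {
  LLVMPassManagerRef PM = LLVMCreatePassManager();
  LLVMAddEarlyCSEMemSSAPass(PM); // EarlyCSE backed by MemorySSA
  LLVMAddNewGVNPass(PM);         // the NewGVN implementation
  LLVMAddLoopSinkPass(PM);       // the new LoopSink pass
  LLVMRunPassManager(PM, M);
  LLVMDisposePassManager(PM);
}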
diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp
index aed4a4a..39969e2 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp
@@ -16,6 +16,7 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/Pass.h"
@@ -148,6 +149,7 @@ public:
bool visitPHINode(PHINode &);
bool visitLoadInst(LoadInst &);
bool visitStoreInst(StoreInst &);
+ bool visitCallInst(CallInst &I);
static void registerOptions() {
// This is disabled by default because having separate loads and stores
@@ -169,6 +171,8 @@ private:
template<typename T> bool splitBinary(Instruction &, const T &);
+ bool splitCall(CallInst &CI);
+
ScatterMap Scattered;
GatherList Gathered;
unsigned ParallelLoopAccessMDKind;
@@ -394,6 +398,77 @@ bool Scalarizer::splitBinary(Instruction &I, const Splitter &Split) {
return true;
}
+static bool isTriviallyScalariable(Intrinsic::ID ID) {
+ return isTriviallyVectorizable(ID);
+}
+
+// All of the current scalarizable intrinsics only have one mangled type.
+static Function *getScalarIntrinsicDeclaration(Module *M,
+ Intrinsic::ID ID,
+ VectorType *Ty) {
+ return Intrinsic::getDeclaration(M, ID, { Ty->getScalarType() });
+}
+
+/// If the call is to a vector-typed intrinsic function, split it into one
+/// scalar call per element, when the intrinsic allows it.
+bool Scalarizer::splitCall(CallInst &CI) {
+ VectorType *VT = dyn_cast<VectorType>(CI.getType());
+ if (!VT)
+ return false;
+
+ Function *F = CI.getCalledFunction();
+ if (!F)
+ return false;
+
+ Intrinsic::ID ID = F->getIntrinsicID();
+ if (ID == Intrinsic::not_intrinsic || !isTriviallyScalariable(ID))
+ return false;
+
+ unsigned NumElems = VT->getNumElements();
+ unsigned NumArgs = CI.getNumArgOperands();
+
+ ValueVector ScalarOperands(NumArgs);
+ SmallVector<Scatterer, 8> Scattered(NumArgs);
+
+ Scattered.resize(NumArgs);
+
+ // Assumes that any vector type has the same number of elements as the return
+ // vector type, which is true for all current intrinsics.
+ for (unsigned I = 0; I != NumArgs; ++I) {
+ Value *OpI = CI.getOperand(I);
+ if (OpI->getType()->isVectorTy()) {
+ Scattered[I] = scatter(&CI, OpI);
+ assert(Scattered[I].size() == NumElems && "mismatched call operands");
+ } else {
+ ScalarOperands[I] = OpI;
+ }
+ }
+
+ ValueVector Res(NumElems);
+ ValueVector ScalarCallOps(NumArgs);
+
+ Function *NewIntrin = getScalarIntrinsicDeclaration(F->getParent(), ID, VT);
+ IRBuilder<> Builder(&CI);
+
+ // Perform actual scalarization, taking care to preserve any scalar operands.
+ for (unsigned Elem = 0; Elem < NumElems; ++Elem) {
+ ScalarCallOps.clear();
+
+ for (unsigned J = 0; J != NumArgs; ++J) {
+ if (hasVectorInstrinsicScalarOpd(ID, J))
+ ScalarCallOps.push_back(ScalarOperands[J]);
+ else
+ ScalarCallOps.push_back(Scattered[J][Elem]);
+ }
+
+ Res[Elem] = Builder.CreateCall(NewIntrin, ScalarCallOps,
+ CI.getName() + ".i" + Twine(Elem));
+ }
+
+ gather(&CI, Res);
+ return true;
+}
+
bool Scalarizer::visitSelectInst(SelectInst &SI) {
VectorType *VT = dyn_cast<VectorType>(SI.getType());
if (!VT)
@@ -642,6 +717,10 @@ bool Scalarizer::visitStoreInst(StoreInst &SI) {
return true;
}
+bool Scalarizer::visitCallInst(CallInst &CI) {
+ return splitCall(CI);
+}
+
// Delete the instructions that we scalarized. If a full vector result
// is still needed, recreate it using InsertElements.
bool Scalarizer::finish() {
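
The splitCall addition above scatters each vector operand into per-lane values, emits one scalar intrinsic call per lane, and gathers the lane results back into a vector. A toy, LLVM-free sketch of that scatter/call/gather shape (names and the std::vector stand-in are purely illustrative):

#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// Toy analogue of Scalarizer::splitCall: call a scalar callee once per lane
// and gather the per-lane results back into a "vector".
std::vector<double> scalarizeCall(const std::vector<double> &Vec,
                                  double (*ScalarCallee)(double)) {
  std::vector<double> Res(Vec.size());
  for (std::size_t Elem = 0; Elem != Vec.size(); ++Elem)
    Res[Elem] = ScalarCallee(Vec[Elem]); // one scalar call per element
  return Res;
}

int main() {
  std::vector<double> V = {-1.5, 2.0, -3.25};
  // Roughly what scalarizing a call to llvm.fabs on a 3-wide vector does.
  auto R = scalarizeCall(V, [](double X) { return std::fabs(X); });
  assert(R[0] == 1.5 && R[1] == 2.0 && R[2] == 3.25);
}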
diff --git a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index d6ae186..4d59453 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -722,7 +722,7 @@ bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToPointerSize(
for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end();
I != E; ++I, ++GTI) {
// Skip struct member indices which must be i32.
- if (isa<SequentialType>(*GTI)) {
+ if (GTI.isSequential()) {
if ((*I)->getType() != IntPtrTy) {
*I = CastInst::CreateIntegerCast(*I, IntPtrTy, true, "idxprom", GEP);
Changed = true;
@@ -739,7 +739,7 @@ SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
int64_t AccumulativeByteOffset = 0;
gep_type_iterator GTI = gep_type_begin(*GEP);
for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
- if (isa<SequentialType>(*GTI)) {
+ if (GTI.isSequential()) {
// Tries to extract a constant offset from this GEP index.
int64_t ConstantOffset =
ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP, DT);
@@ -752,7 +752,7 @@ SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
ConstantOffset * DL->getTypeAllocSize(GTI.getIndexedType());
}
} else if (LowerGEP) {
- StructType *StTy = cast<StructType>(*GTI);
+ StructType *StTy = GTI.getStructType();
uint64_t Field = cast<ConstantInt>(GEP->getOperand(I))->getZExtValue();
// Skip field 0 as the offset is always 0.
if (Field != 0) {
@@ -787,7 +787,7 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
// Create an ugly GEP for each sequential index. We don't create GEPs for
// structure indices, as they are accumulated in the constant offset index.
for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
- if (isa<SequentialType>(*GTI)) {
+ if (GTI.isSequential()) {
Value *Idx = Variadic->getOperand(I);
// Skip zero indices.
if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
@@ -848,7 +848,7 @@ SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
// don't create arithmetics for structure indices, as they are accumulated
// in the constant offset index.
for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
- if (isa<SequentialType>(*GTI)) {
+ if (GTI.isSequential()) {
Value *Idx = Variadic->getOperand(I);
// Skip zero indices.
if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
@@ -928,7 +928,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
// handle the constant offset and won't need a new structure index.
gep_type_iterator GTI = gep_type_begin(*GEP);
for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
- if (isa<SequentialType>(*GTI)) {
+ if (GTI.isSequential()) {
// Splits this GEP index into a variadic part and a constant offset, and
// uses the variadic part as the new index.
Value *OldIdx = GEP->getOperand(I);
@@ -1150,8 +1150,7 @@ bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) {
bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) {
bool Changed = false;
DominatingExprs.clear();
- for (auto Node = GraphTraits<DominatorTree *>::nodes_begin(DT);
- Node != GraphTraits<DominatorTree *>::nodes_end(DT); ++Node) {
+ for (const auto Node : depth_first(DT)) {
BasicBlock *BB = Node->getBlock();
for (auto I = BB->begin(); I != BB->end(); ) {
Instruction *Cur = &*I++;
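
The SeparateConstOffsetFromGEP hunks above switch from type-testing *GTI (isa<SequentialType>, cast<StructType>) to the gep_type_iterator queries isSequential()/getStructTypeOrNull(). A sketch of the resulting index walk; it only compiles against an LLVM tree that includes this change, and the helper name is hypothetical:

#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Walk a GEP's indices, asking the iterator itself whether each index steps
// through a sequential type or selects a struct field.
static void walkGEPIndices(GetElementPtrInst *GEP) {
  gep_type_iterator GTI = gep_type_begin(*GEP);
  for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
    if (GTI.isSequential()) {
      // Array/vector/pointer step: scaled by the element's alloc size.
      (void)GTI.getIndexedType();
    } else if (StructType *STy = GTI.getStructTypeOrNull()) {
      // Struct member: the operand is a constant field number.
      (void)STy;
    }
  }
}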
diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 2d0a21d..f2723bd 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -187,7 +187,7 @@ SimplifyCFGPass::SimplifyCFGPass(int BonusInstThreshold)
: BonusInstThreshold(BonusInstThreshold) {}
PreservedAnalyses SimplifyCFGPass::run(Function &F,
- AnalysisManager<Function> &AM) {
+ FunctionAnalysisManager &AM) {
auto &TTI = AM.getResult<TargetIRAnalysis>(F);
auto &AC = AM.getResult<AssumptionAnalysis>(F);
diff --git a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
index d9a296c..c3f14a0 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
@@ -254,7 +254,7 @@ static bool iterativelySinkInstructions(Function &F, DominatorTree &DT,
return EverMadeChange;
}
-PreservedAnalyses SinkingPass::run(Function &F, AnalysisManager<Function> &AM) {
+PreservedAnalyses SinkingPass::run(Function &F, FunctionAnalysisManager &AM) {
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &LI = AM.getResult<LoopAnalysis>(F);
auto &AA = AM.getResult<AAManager>(F);
diff --git a/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
index 9bf2d62..a7c308b 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -61,9 +61,9 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar/SpeculativeExecution.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
@@ -101,58 +101,62 @@ static cl::opt<bool> SpecExecOnlyIfDivergentTarget(
namespace {
-class SpeculativeExecution : public FunctionPass {
- public:
- static char ID;
- explicit SpeculativeExecution(bool OnlyIfDivergentTarget = false)
- : FunctionPass(ID),
- OnlyIfDivergentTarget(OnlyIfDivergentTarget ||
- SpecExecOnlyIfDivergentTarget) {}
-
- void getAnalysisUsage(AnalysisUsage &AU) const override;
- bool runOnFunction(Function &F) override;
-
- const char *getPassName() const override {
- if (OnlyIfDivergentTarget)
- return "Speculatively execute instructions if target has divergent "
- "branches";
- return "Speculatively execute instructions";
- }
-
- private:
- bool runOnBasicBlock(BasicBlock &B);
- bool considerHoistingFromTo(BasicBlock &FromBlock, BasicBlock &ToBlock);
-
- // If true, this pass is a nop unless the target architecture has branch
- // divergence.
+class SpeculativeExecutionLegacyPass : public FunctionPass {
+public:
+ static char ID;
+ explicit SpeculativeExecutionLegacyPass(bool OnlyIfDivergentTarget = false)
+ : FunctionPass(ID), OnlyIfDivergentTarget(OnlyIfDivergentTarget ||
+ SpecExecOnlyIfDivergentTarget),
+ Impl(OnlyIfDivergentTarget) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override {
+ if (OnlyIfDivergentTarget)
+ return "Speculatively execute instructions if target has divergent "
+ "branches";
+ return "Speculatively execute instructions";
+ }
+
+private:
+ // Variable preserved purely for correct name printing.
const bool OnlyIfDivergentTarget;
- const TargetTransformInfo *TTI = nullptr;
+
+ SpeculativeExecutionPass Impl;
};
} // namespace
-char SpeculativeExecution::ID = 0;
-INITIALIZE_PASS_BEGIN(SpeculativeExecution, "speculative-execution",
+char SpeculativeExecutionLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(SpeculativeExecutionLegacyPass, "speculative-execution",
"Speculatively execute instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(SpeculativeExecution, "speculative-execution",
+INITIALIZE_PASS_END(SpeculativeExecutionLegacyPass, "speculative-execution",
"Speculatively execute instructions", false, false)
-void SpeculativeExecution::getAnalysisUsage(AnalysisUsage &AU) const {
+void SpeculativeExecutionLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
}
-bool SpeculativeExecution::runOnFunction(Function &F) {
+bool SpeculativeExecutionLegacyPass::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
- TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ return Impl.runImpl(F, TTI);
+}
+
+namespace llvm {
+
+bool SpeculativeExecutionPass::runImpl(Function &F, TargetTransformInfo *TTI) {
if (OnlyIfDivergentTarget && !TTI->hasBranchDivergence()) {
DEBUG(dbgs() << "Not running SpeculativeExecution because "
"TTI->hasBranchDivergence() is false.\n");
return false;
}
+ this->TTI = TTI;
bool Changed = false;
for (auto& B : F) {
Changed |= runOnBasicBlock(B);
@@ -160,7 +164,7 @@ bool SpeculativeExecution::runOnFunction(Function &F) {
return Changed;
}
-bool SpeculativeExecution::runOnBasicBlock(BasicBlock &B) {
+bool SpeculativeExecutionPass::runOnBasicBlock(BasicBlock &B) {
BranchInst *BI = dyn_cast<BranchInst>(B.getTerminator());
if (BI == nullptr)
return false;
@@ -220,6 +224,24 @@ static unsigned ComputeSpeculationCost(const Instruction *I,
case Instruction::Xor:
case Instruction::ZExt:
case Instruction::SExt:
+ case Instruction::Call:
+ case Instruction::BitCast:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::AddrSpaceCast:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPExt:
+ case Instruction::FPTrunc:
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
return TTI.getUserCost(I);
default:
@@ -227,8 +249,8 @@ static unsigned ComputeSpeculationCost(const Instruction *I,
}
}
-bool SpeculativeExecution::considerHoistingFromTo(BasicBlock &FromBlock,
- BasicBlock &ToBlock) {
+bool SpeculativeExecutionPass::considerHoistingFromTo(
+ BasicBlock &FromBlock, BasicBlock &ToBlock) {
SmallSet<const Instruction *, 8> NotHoisted;
const auto AllPrecedingUsesFromBlockHoisted = [&NotHoisted](User *U) {
for (Value* V : U->operand_values()) {
@@ -270,14 +292,28 @@ bool SpeculativeExecution::considerHoistingFromTo(BasicBlock &FromBlock,
return true;
}
-namespace llvm {
-
FunctionPass *createSpeculativeExecutionPass() {
- return new SpeculativeExecution();
+ return new SpeculativeExecutionLegacyPass();
}
FunctionPass *createSpeculativeExecutionIfHasBranchDivergencePass() {
- return new SpeculativeExecution(/* OnlyIfDivergentTarget = */ true);
+ return new SpeculativeExecutionLegacyPass(/* OnlyIfDivergentTarget = */ true);
}
+SpeculativeExecutionPass::SpeculativeExecutionPass(bool OnlyIfDivergentTarget)
+ : OnlyIfDivergentTarget(OnlyIfDivergentTarget ||
+ SpecExecOnlyIfDivergentTarget) {}
+
+PreservedAnalyses SpeculativeExecutionPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
+
+ bool Changed = runImpl(F, TTI);
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
} // namespace llvm
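
The SpeculativeExecution rewrite above follows the common new-pass-manager porting pattern: the transformation body moves into a pass class with runImpl(), and the legacy pass becomes a thin wrapper that owns an Impl member and forwards to it. A toy, LLVM-free sketch of that wrapper shape (all names hypothetical):

#include <iostream>

// Stand-in for the ported pass: the real logic lives in runImpl().
struct NewPMPass {
  bool runImpl(int &Value) {
    ++Value;      // pretend transformation
    return true;  // report "changed"
  }
};

// Stand-in for the legacy wrapper: owns an Impl and only forwards to it,
// much like SpeculativeExecutionLegacyPass::runOnFunction above.
struct LegacyWrapperPass {
  NewPMPass Impl;
  bool runOnFunction(int &Value) { return Impl.runImpl(Value); }
};

int main() {
  int V = 0;
  LegacyWrapperPass P;
  bool Changed = P.runOnFunction(V);
  std::cout << "changed=" << Changed << " value=" << V << "\n"; // changed=1 value=1
}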
diff --git a/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index 292d040..2be3f5c 100644
--- a/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -55,8 +55,6 @@
//
// - When (i' - i) is constant but i and i' are not, we could still perform
// SLSR.
-#include <vector>
-
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -68,6 +66,8 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
+#include <list>
+#include <vector>
using namespace llvm;
using namespace PatternMatch;
@@ -80,7 +80,7 @@ class StraightLineStrengthReduce : public FunctionPass {
public:
// SLSR candidate. Such a candidate must be in one of the forms described in
// the header comments.
- struct Candidate : public ilist_node<Candidate> {
+ struct Candidate {
enum Kind {
Invalid, // reserved for the default constructor
Add, // B + i * S
@@ -200,7 +200,7 @@ private:
DominatorTree *DT;
ScalarEvolution *SE;
TargetTransformInfo *TTI;
- ilist<Candidate> Candidates;
+ std::list<Candidate> Candidates;
// Temporarily holds all instructions that are unlinked (but not deleted) by
// rewriteCandidateWithBasis. These instructions will be actually removed
// after all rewriting finishes.
@@ -490,8 +490,8 @@ void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP(
IndexExprs.push_back(SE->getSCEV(*I));
gep_type_iterator GTI = gep_type_begin(GEP);
- for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
- if (!isa<SequentialType>(*GTI++))
+ for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+ if (GTI.isStruct())
continue;
const SCEV *OrigIndexExpr = IndexExprs[I - 1];
@@ -499,11 +499,9 @@ void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP(
// The base of this candidate is GEP's base plus the offsets of all
// indices except this current one.
- const SCEV *BaseExpr = SE->getGEPExpr(GEP->getSourceElementType(),
- SE->getSCEV(GEP->getPointerOperand()),
- IndexExprs, GEP->isInBounds());
+ const SCEV *BaseExpr = SE->getGEPExpr(cast<GEPOperator>(GEP), IndexExprs);
Value *ArrayIdx = GEP->getOperand(I);
- uint64_t ElementSize = DL->getTypeAllocSize(*GTI);
+ uint64_t ElementSize = DL->getTypeAllocSize(GTI.getIndexedType());
if (ArrayIdx->getType()->getIntegerBitWidth() <=
DL->getPointerSizeInBits(GEP->getAddressSpace())) {
// Skip factoring if ArrayIdx is wider than the pointer size, because
@@ -674,11 +672,9 @@ bool StraightLineStrengthReduce::runOnFunction(Function &F) {
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
// Traverse the dominator tree in the depth-first order. This order makes sure
// all bases of a candidate are in Candidates when we process it.
- for (auto node = GraphTraits<DominatorTree *>::nodes_begin(DT);
- node != GraphTraits<DominatorTree *>::nodes_end(DT); ++node) {
- for (auto &I : *node->getBlock())
+ for (const auto Node : depth_first(DT))
+ for (auto &I : *(Node->getBlock()))
allocateCandidatesAndFindBasis(&I);
- }
// Rewrite candidates in the reverse depth-first order. This order makes sure
// a candidate being rewritten is not a basis for any other candidate.
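
Both this file and SeparateConstOffsetFromGEP above replace the manual GraphTraits nodes_begin()/nodes_end() loop with depth_first(DT). A short sketch of the idiom against the LLVM headers (the function name is illustrative):

#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"

using namespace llvm;

// Visit every basic block in dominator-tree depth-first order.
static void visitInDomOrder(DominatorTree *DT) {
  for (auto *Node : depth_first(DT)) {
    BasicBlock *BB = Node->getBlock();
    (void)BB; // process BB here
  }
}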
diff --git a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index e9ac39b..49ce026 100644
--- a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -43,77 +43,58 @@ typedef SmallPtrSet<BasicBlock *, 8> BBSet;
typedef MapVector<PHINode *, BBValueVector> PhiMap;
typedef MapVector<BasicBlock *, BBVector> BB2BBVecMap;
-typedef DenseMap<DomTreeNode *, unsigned> DTN2UnsignedMap;
typedef DenseMap<BasicBlock *, PhiMap> BBPhiMap;
typedef DenseMap<BasicBlock *, Value *> BBPredicates;
typedef DenseMap<BasicBlock *, BBPredicates> PredMap;
typedef DenseMap<BasicBlock *, BasicBlock*> BB2BBMap;
// The name for newly created blocks.
-
static const char *const FlowBlockName = "Flow";
-/// @brief Find the nearest common dominator for multiple BasicBlocks
+/// Finds the nearest common dominator of a set of BasicBlocks.
///
-/// Helper class for StructurizeCFG
-/// TODO: Maybe move into common code
+/// For every BB you add to the set, you can specify whether we "remember" the
+/// block. When you get the common dominator, you can also ask whether it's one
+/// of the blocks we remembered.
class NearestCommonDominator {
DominatorTree *DT;
+ BasicBlock *Result = nullptr;
+ bool ResultIsRemembered = false;
- DTN2UnsignedMap IndexMap;
-
- BasicBlock *Result;
- unsigned ResultIndex;
- bool ExplicitMentioned;
-
-public:
- /// \brief Start a new query
- NearestCommonDominator(DominatorTree *DomTree) {
- DT = DomTree;
- Result = nullptr;
- }
-
- /// \brief Add BB to the resulting dominator
- void addBlock(BasicBlock *BB, bool Remember = true) {
- DomTreeNode *Node = DT->getNode(BB);
-
+ /// Add BB to the resulting dominator.
+ void addBlock(BasicBlock *BB, bool Remember) {
if (!Result) {
- unsigned Numbering = 0;
- for (;Node;Node = Node->getIDom())
- IndexMap[Node] = ++Numbering;
Result = BB;
- ResultIndex = 1;
- ExplicitMentioned = Remember;
+ ResultIsRemembered = Remember;
return;
}
- for (;Node;Node = Node->getIDom())
- if (IndexMap.count(Node))
- break;
- else
- IndexMap[Node] = 0;
+ BasicBlock *NewResult = DT->findNearestCommonDominator(Result, BB);
+ if (NewResult != Result)
+ ResultIsRemembered = false;
+ if (NewResult == BB)
+ ResultIsRemembered |= Remember;
+ Result = NewResult;
+ }
- assert(Node && "Dominator tree invalid!");
+public:
+ explicit NearestCommonDominator(DominatorTree *DomTree) : DT(DomTree) {}
- unsigned Numbering = IndexMap[Node];
- if (Numbering > ResultIndex) {
- Result = Node->getBlock();
- ResultIndex = Numbering;
- ExplicitMentioned = Remember && (Result == BB);
- } else if (Numbering == ResultIndex) {
- ExplicitMentioned |= Remember;
- }
+ void addBlock(BasicBlock *BB) {
+ addBlock(BB, /* Remember = */ false);
}
- /// \brief Is "Result" one of the BBs added with "Remember" = True?
- bool wasResultExplicitMentioned() {
- return ExplicitMentioned;
+ void addAndRememberBlock(BasicBlock *BB) {
+ addBlock(BB, /* Remember = */ true);
}
- /// \brief Get the query result
- BasicBlock *getResult() {
- return Result;
- }
+ /// Get the nearest common dominator of all the BBs added via addBlock() and
+ /// addAndRememberBlock().
+ BasicBlock *result() { return Result; }
+
+ /// Is the BB returned by result() one of the blocks we added to the set
+ /// with addAndRememberBlock()?
+ bool resultIsRememberedBlock() { return ResultIsRemembered; }
};
/// @brief Transforms the control flow graph on one single entry/exit region
@@ -141,7 +122,7 @@ public:
/// Control flow is expressed as a branch where the true exit goes into the
/// "Then"/"Else" region, while the false exit skips the region
/// The condition for the optional "Else" region is expressed as a PHI node.
-/// The incomming values of the PHI node are true for the "If" edge and false
+/// The incoming values of the PHI node are true for the "If" edge and false
/// for the "Then" edge.
///
/// Additionally to that even complicated loops look like this:
@@ -163,7 +144,6 @@ public:
/// breaks and the false values express continue states.
class StructurizeCFG : public RegionPass {
bool SkipUniformRegions;
- DivergenceAnalysis *DA;
Type *Boolean;
ConstantInt *BoolTrue;
@@ -176,7 +156,7 @@ class StructurizeCFG : public RegionPass {
DominatorTree *DT;
LoopInfo *LI;
- RNVector Order;
+ SmallVector<RegionNode *, 8> Order;
BBSet Visited;
BBPhiMap DeletedPhis;
@@ -236,29 +216,19 @@ class StructurizeCFG : public RegionPass {
void rebuildSSA();
- bool hasOnlyUniformBranches(const Region *R);
-
public:
static char ID;
- StructurizeCFG() :
- RegionPass(ID), SkipUniformRegions(false) {
- initializeStructurizeCFGPass(*PassRegistry::getPassRegistry());
- }
-
- StructurizeCFG(bool SkipUniformRegions) :
- RegionPass(ID), SkipUniformRegions(SkipUniformRegions) {
+ explicit StructurizeCFG(bool SkipUniformRegions = false)
+ : RegionPass(ID), SkipUniformRegions(SkipUniformRegions) {
initializeStructurizeCFGPass(*PassRegistry::getPassRegistry());
}
- using Pass::doInitialization;
bool doInitialization(Region *R, RGPassManager &RGM) override;
bool runOnRegion(Region *R, RGPassManager &RGM) override;
- const char *getPassName() const override {
- return "Structurize control flow";
- }
+ StringRef getPassName() const override { return "Structurize control flow"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
if (SkipUniformRegions)
@@ -266,6 +236,7 @@ public:
AU.addRequiredID(LowerSwitchID);
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
+
AU.addPreserved<DominatorTreeWrapperPass>();
RegionPass::getAnalysisUsage(AU);
}
@@ -298,17 +269,13 @@ bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
/// \brief Build up the general order of nodes
void StructurizeCFG::orderNodes() {
- RNVector TempOrder;
ReversePostOrderTraversal<Region*> RPOT(ParentRegion);
- TempOrder.append(RPOT.begin(), RPOT.end());
-
- std::map<Loop*, unsigned> LoopBlocks;
-
+ SmallDenseMap<Loop*, unsigned, 8> LoopBlocks;
// The reverse post-order traversal of the list gives us an ordering close
// to what we want. The only problem with it is that sometimes backedges
// for outer loops will be visited before backedges for inner loops.
- for (RegionNode *RN : TempOrder) {
+ for (RegionNode *RN : RPOT) {
BasicBlock *BB = RN->getEntry();
Loop *Loop = LI->getLoopFor(BB);
++LoopBlocks[Loop];
@@ -316,19 +283,18 @@ void StructurizeCFG::orderNodes() {
unsigned CurrentLoopDepth = 0;
Loop *CurrentLoop = nullptr;
- BBSet TempVisited;
- for (RNVector::iterator I = TempOrder.begin(), E = TempOrder.end(); I != E; ++I) {
+ for (auto I = RPOT.begin(), E = RPOT.end(); I != E; ++I) {
BasicBlock *BB = (*I)->getEntry();
unsigned LoopDepth = LI->getLoopDepth(BB);
- if (std::find(Order.begin(), Order.end(), *I) != Order.end())
+ if (is_contained(Order, *I))
continue;
if (LoopDepth < CurrentLoopDepth) {
// Make sure we have visited all blocks in this loop before moving back to
// the outer loop.
- RNVector::iterator LoopI = I;
+ auto LoopI = I;
while (unsigned &BlockCount = LoopBlocks[CurrentLoop]) {
LoopI++;
BasicBlock *LoopBB = (*LoopI)->getEntry();
@@ -340,9 +306,8 @@ void StructurizeCFG::orderNodes() {
}
CurrentLoop = LI->getLoopFor(BB);
- if (CurrentLoop) {
+ if (CurrentLoop)
LoopBlocks[CurrentLoop]--;
- }
CurrentLoopDepth = LoopDepth;
Order.push_back(*I);
@@ -426,46 +391,40 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {
BBPredicates &Pred = Predicates[BB];
BBPredicates &LPred = LoopPreds[BB];
- for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
- PI != PE; ++PI) {
-
+ for (BasicBlock *P : predecessors(BB)) {
// Ignore it if it's a branch from outside into our region entry
- if (!ParentRegion->contains(*PI))
+ if (!ParentRegion->contains(P))
continue;
- Region *R = RI->getRegionFor(*PI);
+ Region *R = RI->getRegionFor(P);
if (R == ParentRegion) {
-
// It's a top level block in our region
- BranchInst *Term = cast<BranchInst>((*PI)->getTerminator());
+ BranchInst *Term = cast<BranchInst>(P->getTerminator());
for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
BasicBlock *Succ = Term->getSuccessor(i);
if (Succ != BB)
continue;
- if (Visited.count(*PI)) {
+ if (Visited.count(P)) {
// Normal forward edge
if (Term->isConditional()) {
// Try to treat it like an ELSE block
BasicBlock *Other = Term->getSuccessor(!i);
if (Visited.count(Other) && !Loops.count(Other) &&
- !Pred.count(Other) && !Pred.count(*PI)) {
+ !Pred.count(Other) && !Pred.count(P)) {
Pred[Other] = BoolFalse;
- Pred[*PI] = BoolTrue;
+ Pred[P] = BoolTrue;
continue;
}
}
- Pred[*PI] = buildCondition(Term, i, false);
-
+ Pred[P] = buildCondition(Term, i, false);
} else {
// Back edge
- LPred[*PI] = buildCondition(Term, i, true);
+ LPred[P] = buildCondition(Term, i, true);
}
}
-
} else {
-
// It's an exit from a sub region
while (R->getParent() != ParentRegion)
R = R->getParent();
@@ -496,7 +455,6 @@ void StructurizeCFG::collectInfos() {
Visited.clear();
for (RegionNode *RN : reverse(Order)) {
-
DEBUG(dbgs() << "Visiting: "
<< (RN->isSubRegion() ? "SubRegion with entry: " : "")
<< RN->getEntry()->getName() << " Loop Depth: "
@@ -533,25 +491,26 @@ void StructurizeCFG::insertConditions(bool Loops) {
BBPredicates &Preds = Loops ? LoopPreds[SuccFalse] : Predicates[SuccTrue];
NearestCommonDominator Dominator(DT);
- Dominator.addBlock(Parent, false);
+ Dominator.addBlock(Parent);
Value *ParentValue = nullptr;
- for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end();
- PI != PE; ++PI) {
+ for (std::pair<BasicBlock *, Value *> BBAndPred : Preds) {
+ BasicBlock *BB = BBAndPred.first;
+ Value *Pred = BBAndPred.second;
- if (PI->first == Parent) {
- ParentValue = PI->second;
+ if (BB == Parent) {
+ ParentValue = Pred;
break;
}
- PhiInserter.AddAvailableValue(PI->first, PI->second);
- Dominator.addBlock(PI->first);
+ PhiInserter.AddAvailableValue(BB, Pred);
+ Dominator.addAndRememberBlock(BB);
}
if (ParentValue) {
Term->setCondition(ParentValue);
} else {
- if (!Dominator.wasResultExplicitMentioned())
- PhiInserter.AddAvailableValue(Dominator.getResult(), Default);
+ if (!Dominator.resultIsRememberedBlock())
+ PhiInserter.AddAvailableValue(Dominator.result(), Default);
Term->setCondition(PhiInserter.GetValueInMiddleOfBlock(Parent));
}
@@ -562,10 +521,10 @@ void StructurizeCFG::insertConditions(bool Loops) {
/// them in DeletedPhis
void StructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) {
PhiMap &Map = DeletedPhis[To];
- for (BasicBlock::iterator I = To->begin(), E = To->end();
- I != E && isa<PHINode>(*I);) {
-
- PHINode &Phi = cast<PHINode>(*I++);
+ for (Instruction &I : *To) {
+ if (!isa<PHINode>(I))
+ break;
+ PHINode &Phi = cast<PHINode>(I);
while (Phi.getBasicBlockIndex(From) != -1) {
Value *Deleted = Phi.removeIncomingValue(From, false);
Map[&Phi].push_back(std::make_pair(From, Deleted));
@@ -575,10 +534,10 @@ void StructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) {
/// \brief Add a dummy PHI value as soon as we know the new predecessor
void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) {
- for (BasicBlock::iterator I = To->begin(), E = To->end();
- I != E && isa<PHINode>(*I);) {
-
- PHINode &Phi = cast<PHINode>(*I++);
+ for (Instruction &I : *To) {
+ if (!isa<PHINode>(I))
+ break;
+ PHINode &Phi = cast<PHINode>(I);
Value *Undef = UndefValue::get(Phi.getType());
Phi.addIncoming(Undef, From);
}
@@ -589,7 +548,6 @@ void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) {
void StructurizeCFG::setPhiValues() {
SSAUpdater Updater;
for (const auto &AddedPhi : AddedPhis) {
-
BasicBlock *To = AddedPhi.first;
const BBVector &From = AddedPhi.second;
@@ -598,7 +556,6 @@ void StructurizeCFG::setPhiValues() {
PhiMap &Map = DeletedPhis[To];
for (const auto &PI : Map) {
-
PHINode *Phi = PI.first;
Value *Undef = UndefValue::get(Phi->getType());
Updater.Initialize(Phi->getType(), "");
@@ -606,18 +563,16 @@ void StructurizeCFG::setPhiValues() {
Updater.AddAvailableValue(To, Undef);
NearestCommonDominator Dominator(DT);
- Dominator.addBlock(To, false);
+ Dominator.addBlock(To);
for (const auto &VI : PI.second) {
-
Updater.AddAvailableValue(VI.first, VI.second);
- Dominator.addBlock(VI.first);
+ Dominator.addAndRememberBlock(VI.first);
}
- if (!Dominator.wasResultExplicitMentioned())
- Updater.AddAvailableValue(Dominator.getResult(), Undef);
+ if (!Dominator.resultIsRememberedBlock())
+ Updater.AddAvailableValue(Dominator.result(), Undef);
for (BasicBlock *FI : From) {
-
int Idx = Phi->getBasicBlockIndex(FI);
assert(Idx != -1);
Phi->setIncomingValue(Idx, Updater.GetValueAtEndOfBlock(FI));
@@ -636,10 +591,8 @@ void StructurizeCFG::killTerminator(BasicBlock *BB) {
return;
for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB);
- SI != SE; ++SI) {
-
+ SI != SE; ++SI)
delPhiValues(BB, *SI);
- }
Term->eraseFromParent();
}
@@ -653,10 +606,10 @@ void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit,
BasicBlock *Dominator = nullptr;
// Find all the edges from the sub region to the exit
- for (pred_iterator I = pred_begin(OldExit), E = pred_end(OldExit);
- I != E;) {
+ for (auto BBI = pred_begin(OldExit), E = pred_end(OldExit); BBI != E;) {
+ // Increment BBI before mucking with BB's terminator.
+ BasicBlock *BB = *BBI++;
- BasicBlock *BB = *I++;
if (!SubRegion->contains(BB))
continue;
@@ -680,7 +633,6 @@ void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit,
// Update the region info
SubRegion->replaceExit(NewExit);
-
} else {
BasicBlock *BB = Node->getNodeAs<BasicBlock>();
killTerminator(BB);
@@ -711,7 +663,6 @@ BasicBlock *StructurizeCFG::needPrefix(bool NeedEmpty) {
killTerminator(Entry);
if (!NeedEmpty || Entry->getFirstInsertionPt() == Entry->end())
return Entry;
-
}
// create a new flow node
@@ -726,13 +677,13 @@ BasicBlock *StructurizeCFG::needPrefix(bool NeedEmpty) {
/// \brief Returns the region exit if possible, otherwise just a new flow node
BasicBlock *StructurizeCFG::needPostfix(BasicBlock *Flow,
bool ExitUseAllowed) {
- if (Order.empty() && ExitUseAllowed) {
- BasicBlock *Exit = ParentRegion->getExit();
- DT->changeImmediateDominator(Exit, Flow);
- addPhiValues(Flow, Exit);
- return Exit;
- }
- return getNextFlow(Flow);
+ if (!Order.empty() || !ExitUseAllowed)
+ return getNextFlow(Flow);
+
+ BasicBlock *Exit = ParentRegion->getExit();
+ DT->changeImmediateDominator(Exit, Flow);
+ addPhiValues(Flow, Exit);
+ return Exit;
}
/// \brief Set the previous node
@@ -741,16 +692,12 @@ void StructurizeCFG::setPrevNode(BasicBlock *BB) {
: nullptr;
}
-/// \brief Does BB dominate all the predicates of Node ?
+/// \brief Does BB dominate all the predicates of Node?
bool StructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) {
BBPredicates &Preds = Predicates[Node->getEntry()];
- for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end();
- PI != PE; ++PI) {
-
- if (!DT->dominates(BB, PI->first))
- return false;
- }
- return true;
+ return llvm::all_of(Preds, [&](std::pair<BasicBlock *, Value *> Pred) {
+ return DT->dominates(BB, Pred.first);
+ });
}
/// \brief Can we predict that this node will always be called?
@@ -762,13 +709,14 @@ bool StructurizeCFG::isPredictableTrue(RegionNode *Node) {
if (!PrevNode)
return true;
- for (BBPredicates::iterator I = Preds.begin(), E = Preds.end();
- I != E; ++I) {
+ for (std::pair<BasicBlock*, Value*> Pred : Preds) {
+ BasicBlock *BB = Pred.first;
+ Value *V = Pred.second;
- if (I->second != BoolTrue)
+ if (V != BoolTrue)
return false;
- if (!Dominated && DT->dominates(I->first, PrevNode->getEntry()))
+ if (!Dominated && DT->dominates(BB, PrevNode->getEntry()))
Dominated = true;
}
@@ -844,6 +792,7 @@ void StructurizeCFG::handleLoops(bool ExitUseAllowed,
LoopFunc,
LoopStart);
BranchInst::Create(LoopStart, NewEntry);
+ DT->setNewRoot(NewEntry);
}
// Create an extra loop end node
@@ -883,30 +832,29 @@ void StructurizeCFG::createFlow() {
/// no longer dominate all their uses. Not sure if this is really necessary
void StructurizeCFG::rebuildSSA() {
SSAUpdater Updater;
- for (auto *BB : ParentRegion->blocks())
- for (BasicBlock::iterator II = BB->begin(), IE = BB->end();
- II != IE; ++II) {
-
+ for (BasicBlock *BB : ParentRegion->blocks())
+ for (Instruction &I : *BB) {
bool Initialized = false;
- for (auto I = II->use_begin(), E = II->use_end(); I != E;) {
- Use &U = *I++;
+ // We may modify the use list as we iterate over it, so be careful to
+ // compute the next element in the use list at the top of the loop.
+ for (auto UI = I.use_begin(), E = I.use_end(); UI != E;) {
+ Use &U = *UI++;
Instruction *User = cast<Instruction>(U.getUser());
if (User->getParent() == BB) {
continue;
-
} else if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
if (UserPN->getIncomingBlock(U) == BB)
continue;
}
- if (DT->dominates(&*II, User))
+ if (DT->dominates(&I, User))
continue;
if (!Initialized) {
- Value *Undef = UndefValue::get(II->getType());
- Updater.Initialize(II->getType(), "");
+ Value *Undef = UndefValue::get(I.getType());
+ Updater.Initialize(I.getType(), "");
Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
- Updater.AddAvailableValue(BB, &*II);
+ Updater.AddAvailableValue(BB, &I);
Initialized = true;
}
Updater.RewriteUseAfterInsertions(U);
@@ -914,13 +862,14 @@ void StructurizeCFG::rebuildSSA() {
}
}
-bool StructurizeCFG::hasOnlyUniformBranches(const Region *R) {
+static bool hasOnlyUniformBranches(const Region *R,
+ const DivergenceAnalysis &DA) {
for (const BasicBlock *BB : R->blocks()) {
const BranchInst *Br = dyn_cast<BranchInst>(BB->getTerminator());
if (!Br || !Br->isConditional())
continue;
- if (!DA->isUniform(Br->getCondition()))
+ if (!DA.isUniform(Br->getCondition()))
return false;
DEBUG(dbgs() << "BB: " << BB->getName() << " has uniform terminator\n");
}
@@ -933,9 +882,9 @@ bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
return false;
if (SkipUniformRegions) {
- DA = &getAnalysis<DivergenceAnalysis>();
// TODO: We could probably be smarter here with how we handle sub-regions.
- if (hasOnlyUniformBranches(R)) {
+ auto &DA = getAnalysis<DivergenceAnalysis>();
+ if (hasOnlyUniformBranches(R, DA)) {
DEBUG(dbgs() << "Skipping region with uniform control flow: " << *R << '\n');
// Mark all direct child block terminators as having been treated as
@@ -943,12 +892,11 @@ bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
// sub-regions are treated more cleverly, indirect children are not
// marked as uniform.
MDNode *MD = MDNode::get(R->getEntry()->getParent()->getContext(), {});
- Region::element_iterator E = R->element_end();
- for (Region::element_iterator I = R->element_begin(); I != E; ++I) {
- if (I->isSubRegion())
+ for (RegionNode *E : R->elements()) {
+ if (E->isSubRegion())
continue;
- if (Instruction *Term = I->getEntry()->getTerminator())
+ if (Instruction *Term = E->getEntry()->getTerminator())
Term->setMetadata("structurizecfg.uniform", MD);
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
index d5ff997..a6b9fee 100644
--- a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -236,7 +236,7 @@ static bool markTails(Function &F, bool &AllCallsAreTailCalls) {
if (!CI || CI->isTailCall())
continue;
- bool IsNoTail = CI->isNoTailCall();
+ bool IsNoTail = CI->isNoTailCall() || CI->hasOperandBundles();
if (!IsNoTail && CI->doesNotAccessMemory()) {
// A call to a readnone function whose arguments are all things computed
@@ -347,7 +347,7 @@ static bool canMoveAboveCall(Instruction *I, CallInst *CI) {
// return value of the call, it must only use things that are defined before
// the call, or movable instructions between the call and the instruction
// itself.
- return std::find(I->op_begin(), I->op_end(), CI) == I->op_end();
+ return !is_contained(I->operands(), CI);
}
/// Return true if the specified value is the same when the return would exit
diff --git a/contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp b/contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp
index 7e50d4b..df9d5da 100644
--- a/contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp
@@ -12,7 +12,9 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/ASanStackFrameLayout.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/ScopedPrinter.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
@@ -47,64 +49,102 @@ static size_t VarAndRedzoneSize(size_t Size, size_t Alignment) {
return alignTo(Res, Alignment);
}
-void
+ASanStackFrameLayout
ComputeASanStackFrameLayout(SmallVectorImpl<ASanStackVariableDescription> &Vars,
- size_t Granularity, size_t MinHeaderSize,
- ASanStackFrameLayout *Layout) {
+ size_t Granularity, size_t MinHeaderSize) {
assert(Granularity >= 8 && Granularity <= 64 &&
(Granularity & (Granularity - 1)) == 0);
assert(MinHeaderSize >= 16 && (MinHeaderSize & (MinHeaderSize - 1)) == 0 &&
MinHeaderSize >= Granularity);
- size_t NumVars = Vars.size();
+ const size_t NumVars = Vars.size();
assert(NumVars > 0);
for (size_t i = 0; i < NumVars; i++)
Vars[i].Alignment = std::max(Vars[i].Alignment, kMinAlignment);
std::stable_sort(Vars.begin(), Vars.end(), CompareVars);
- SmallString<2048> StackDescriptionStorage;
- raw_svector_ostream StackDescription(StackDescriptionStorage);
- StackDescription << NumVars;
- Layout->FrameAlignment = std::max(Granularity, Vars[0].Alignment);
- SmallVector<uint8_t, 64> &SB(Layout->ShadowBytes);
- SB.clear();
+
+ ASanStackFrameLayout Layout;
+ Layout.Granularity = Granularity;
+ Layout.FrameAlignment = std::max(Granularity, Vars[0].Alignment);
size_t Offset = std::max(std::max(MinHeaderSize, Granularity),
Vars[0].Alignment);
assert((Offset % Granularity) == 0);
- SB.insert(SB.end(), Offset / Granularity, kAsanStackLeftRedzoneMagic);
for (size_t i = 0; i < NumVars; i++) {
bool IsLast = i == NumVars - 1;
size_t Alignment = std::max(Granularity, Vars[i].Alignment);
(void)Alignment; // Used only in asserts.
size_t Size = Vars[i].Size;
- const char *Name = Vars[i].Name;
assert((Alignment & (Alignment - 1)) == 0);
- assert(Layout->FrameAlignment >= Alignment);
+ assert(Layout.FrameAlignment >= Alignment);
assert((Offset % Alignment) == 0);
assert(Size > 0);
- StackDescription << " " << Offset << " " << Size << " " << strlen(Name)
- << " " << Name;
size_t NextAlignment = IsLast ? Granularity
: std::max(Granularity, Vars[i + 1].Alignment);
- size_t SizeWithRedzone = VarAndRedzoneSize(Vars[i].Size, NextAlignment);
- SB.insert(SB.end(), Size / Granularity, 0);
- if (Size % Granularity)
- SB.insert(SB.end(), Size % Granularity);
- SB.insert(SB.end(), (SizeWithRedzone - Size) / Granularity,
- IsLast ? kAsanStackRightRedzoneMagic
- : kAsanStackMidRedzoneMagic);
+ size_t SizeWithRedzone = VarAndRedzoneSize(Size, NextAlignment);
Vars[i].Offset = Offset;
Offset += SizeWithRedzone;
}
if (Offset % MinHeaderSize) {
- size_t ExtraRedzone = MinHeaderSize - (Offset % MinHeaderSize);
- SB.insert(SB.end(), ExtraRedzone / Granularity,
- kAsanStackRightRedzoneMagic);
- Offset += ExtraRedzone;
+ Offset += MinHeaderSize - (Offset % MinHeaderSize);
+ }
+ Layout.FrameSize = Offset;
+ assert((Layout.FrameSize % MinHeaderSize) == 0);
+ return Layout;
+}
+
+SmallString<64> ComputeASanStackFrameDescription(
+ const SmallVectorImpl<ASanStackVariableDescription> &Vars) {
+ SmallString<2048> StackDescriptionStorage;
+ raw_svector_ostream StackDescription(StackDescriptionStorage);
+ StackDescription << Vars.size();
+
+ for (const auto &Var : Vars) {
+ std::string Name = Var.Name;
+ if (Var.Line) {
+ Name += ":";
+ Name += to_string(Var.Line);
+ }
+ StackDescription << " " << Var.Offset << " " << Var.Size << " "
+ << Name.size() << " " << Name;
}
- Layout->DescriptionString = StackDescription.str();
- Layout->FrameSize = Offset;
- assert((Layout->FrameSize % MinHeaderSize) == 0);
- assert(Layout->FrameSize / Granularity == Layout->ShadowBytes.size());
+ return StackDescription.str();
+}
+
+SmallVector<uint8_t, 64>
+GetShadowBytes(const SmallVectorImpl<ASanStackVariableDescription> &Vars,
+ const ASanStackFrameLayout &Layout) {
+ assert(Vars.size() > 0);
+ SmallVector<uint8_t, 64> SB;
+ SB.clear();
+ const size_t Granularity = Layout.Granularity;
+ SB.resize(Vars[0].Offset / Granularity, kAsanStackLeftRedzoneMagic);
+ for (const auto &Var : Vars) {
+ SB.resize(Var.Offset / Granularity, kAsanStackMidRedzoneMagic);
+
+ SB.resize(SB.size() + Var.Size / Granularity, 0);
+ if (Var.Size % Granularity)
+ SB.push_back(Var.Size % Granularity);
+ }
+ SB.resize(Layout.FrameSize / Granularity, kAsanStackRightRedzoneMagic);
+ return SB;
+}
+
+SmallVector<uint8_t, 64> GetShadowBytesAfterScope(
+ const SmallVectorImpl<ASanStackVariableDescription> &Vars,
+ const ASanStackFrameLayout &Layout) {
+ SmallVector<uint8_t, 64> SB = GetShadowBytes(Vars, Layout);
+ const size_t Granularity = Layout.Granularity;
+
+ for (const auto &Var : Vars) {
+ assert(Var.LifetimeSize <= Var.Size);
+ const size_t LifetimeShadowSize =
+ (Var.LifetimeSize + Granularity - 1) / Granularity;
+ const size_t Offset = Var.Offset / Granularity;
+ std::fill(SB.begin() + Offset, SB.begin() + Offset + LifetimeShadowSize,
+ kAsanStackUseAfterScopeMagic);
+ }
+
+ return SB;
}
} // llvm namespace
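The GetShadowBytes helper added above encodes the final frame layout into ASan shadow memory: left-redzone bytes up to the first variable's offset, then per variable a run of fully addressable (0) granules plus one partial byte for any remainder, mid-redzone bytes in the gaps, and right-redzone bytes out to FrameSize. A minimal standalone sketch of that computation, using std::vector and made-up offsets, sizes, and magic values in place of the real kAsanStack* constants:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

struct Var { size_t Offset, Size; };                     // sorted by Offset

int main() {
  const size_t Granularity = 8, FrameSize = 80;          // hypothetical layout
  const uint8_t Left = 0xf1, Mid = 0xf2, Right = 0xf3;   // stand-in magic bytes
  std::vector<Var> Vars = {{32, 10}, {56, 4}};

  std::vector<uint8_t> SB;
  SB.resize(Vars[0].Offset / Granularity, Left);         // left redzone
  for (const Var &V : Vars) {
    SB.resize(V.Offset / Granularity, Mid);              // redzone between vars
    SB.resize(SB.size() + V.Size / Granularity, 0);      // fully addressable
    if (V.Size % Granularity)
      SB.push_back(V.Size % Granularity);                // partial last granule
  }
  SB.resize(FrameSize / Granularity, Right);             // right redzone

  for (uint8_t B : SB)
    std::printf("%02x ", B);          // f1 f1 f1 f1 00 02 f2 04 f3 f3
  std::printf("\n");
  return 0;
}

GetShadowBytesAfterScope then overwrites each variable's granules with the use-after-scope magic, which is why it can start from the result of GetShadowBytes.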
diff --git a/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp b/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp
index d034905..2e95926 100644
--- a/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp
@@ -57,12 +57,10 @@
#include "llvm/ADT/DenseSet.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
-#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -159,20 +157,14 @@ static bool addDiscriminators(Function &F) {
// If the function has debug information, but the user has disabled
// discriminators, do nothing.
// Similarly, if the function has no debug info, do nothing.
- // Finally, if this module is built with dwarf versions earlier than 4,
- // do nothing (discriminator support is a DWARF 4 feature).
- if (NoDiscriminators || !F.getSubprogram() ||
- F.getParent()->getDwarfVersion() < 4)
+ if (NoDiscriminators || !F.getSubprogram())
return false;
bool Changed = false;
- Module *M = F.getParent();
- LLVMContext &Ctx = M->getContext();
- DIBuilder Builder(*M, /*AllowUnresolved*/ false);
typedef std::pair<StringRef, unsigned> Location;
- typedef DenseMap<const BasicBlock *, Metadata *> BBScopeMap;
- typedef DenseMap<Location, BBScopeMap> LocationBBMap;
+ typedef DenseSet<const BasicBlock *> BBSet;
+ typedef DenseMap<Location, BBSet> LocationBBMap;
typedef DenseMap<Location, unsigned> LocationDiscriminatorMap;
typedef DenseSet<Location> LocationSet;
@@ -184,32 +176,25 @@ static bool addDiscriminators(Function &F) {
// discriminator for this instruction.
for (BasicBlock &B : F) {
for (auto &I : B.getInstList()) {
- if (isa<DbgInfoIntrinsic>(&I))
+ if (isa<IntrinsicInst>(&I))
continue;
const DILocation *DIL = I.getDebugLoc();
if (!DIL)
continue;
Location L = std::make_pair(DIL->getFilename(), DIL->getLine());
auto &BBMap = LBM[L];
- auto R = BBMap.insert(std::make_pair(&B, (Metadata *)nullptr));
+ auto R = BBMap.insert(&B);
if (BBMap.size() == 1)
continue;
- bool InsertSuccess = R.second;
- Metadata *&NewScope = R.first->second;
- // If we could insert a different block in the same location, a
+ // If we could insert more than one block with the same line+file, a
// discriminator is needed to distinguish both instructions.
- if (InsertSuccess) {
- auto *Scope = DIL->getScope();
- auto *File =
- Builder.createFile(DIL->getFilename(), Scope->getDirectory());
- NewScope = Builder.createLexicalBlockFile(Scope, File, ++LDM[L]);
- }
- I.setDebugLoc(DILocation::get(Ctx, DIL->getLine(), DIL->getColumn(),
- NewScope, DIL->getInlinedAt()));
+ // Only the lowest 7 bits are used to represent a discriminator to fit
+ // it in 1 byte ULEB128 representation.
+ unsigned Discriminator = (R.second ? ++LDM[L] : LDM[L]) & 0x7f;
+ I.setDebugLoc(DIL->cloneWithDiscriminator(Discriminator));
DEBUG(dbgs() << DIL->getFilename() << ":" << DIL->getLine() << ":"
- << DIL->getColumn() << ":"
- << dyn_cast<DILexicalBlockFile>(NewScope)->getDiscriminator()
- << I << "\n");
+ << DIL->getColumn() << ":" << Discriminator << " " << I
+ << "\n");
Changed = true;
}
}
@@ -222,7 +207,7 @@ static bool addDiscriminators(Function &F) {
LocationSet CallLocations;
for (auto &I : B.getInstList()) {
CallInst *Current = dyn_cast<CallInst>(&I);
- if (!Current || isa<DbgInfoIntrinsic>(&I))
+ if (!Current || isa<IntrinsicInst>(&I))
continue;
DILocation *CurrentDIL = Current->getDebugLoc();
@@ -231,13 +216,8 @@ static bool addDiscriminators(Function &F) {
Location L =
std::make_pair(CurrentDIL->getFilename(), CurrentDIL->getLine());
if (!CallLocations.insert(L).second) {
- auto *Scope = CurrentDIL->getScope();
- auto *File = Builder.createFile(CurrentDIL->getFilename(),
- Scope->getDirectory());
- auto *NewScope = Builder.createLexicalBlockFile(Scope, File, ++LDM[L]);
- Current->setDebugLoc(DILocation::get(Ctx, CurrentDIL->getLine(),
- CurrentDIL->getColumn(), NewScope,
- CurrentDIL->getInlinedAt()));
+ Current->setDebugLoc(
+ CurrentDIL->cloneWithDiscriminator((++LDM[L]) & 0x7f));
Changed = true;
}
}
@@ -249,7 +229,7 @@ bool AddDiscriminatorsLegacyPass::runOnFunction(Function &F) {
return addDiscriminators(F);
}
PreservedAnalyses AddDiscriminatorsPass::run(Function &F,
- AnalysisManager<Function> &AM) {
+ FunctionAnalysisManager &AM) {
if (!addDiscriminators(F))
return PreservedAnalyses::all();
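The rewrite above drops the DIBuilder/lexical-block-file machinery: each (file, line) location keeps a running counter, a basic block other than the first one to carry that location bumps the counter once, and the value is masked to 7 bits so it still fits in a one-byte ULEB128 before being attached via cloneWithDiscriminator. A standalone sketch of that bookkeeping, with ordinary containers and hypothetical block ids standing in for the LLVM data structures:

#include <cstdio>
#include <map>
#include <set>
#include <string>
#include <utility>

using Location = std::pair<std::string, unsigned>;   // (file, line)

int main() {
  std::map<Location, std::set<int>> BlocksPerLoc;    // blocks seen per location
  std::map<Location, unsigned> Counter;              // last discriminator issued

  auto assign = [&](const Location &L, int Block) -> unsigned {
    auto &Blocks = BlocksPerLoc[L];
    bool NewBlock = Blocks.insert(Block).second;
    if (Blocks.size() == 1)
      return 0;                                      // first block keeps 0
    return (NewBlock ? ++Counter[L] : Counter[L]) & 0x7f;
  };

  Location L{"foo.c", 42};
  unsigned D0 = assign(L, /*block*/0);
  unsigned D1 = assign(L, /*block*/1);
  unsigned D2 = assign(L, /*block*/1);
  std::printf("%u %u %u\n", D0, D1, D2);             // prints "0 1 1"
  return 0;
}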
diff --git a/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
index 49b646a..175cbd2 100644
--- a/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -15,7 +15,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BreakCriticalEdges.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -23,10 +23,10 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;
@@ -72,6 +72,20 @@ FunctionPass *llvm::createBreakCriticalEdgesPass() {
return new BreakCriticalEdges();
}
+PreservedAnalyses BreakCriticalEdgesPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
+ auto *LI = AM.getCachedResult<LoopAnalysis>(F);
+ unsigned N = SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions(DT, LI));
+ NumBroken += N;
+ if (N == 0)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LoopAnalysis>();
+ return PA;
+}
+
//===----------------------------------------------------------------------===//
// Implementation of the external critical edge manipulation functions
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
index f4260a9..e61b04f 100644
--- a/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -250,6 +250,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setDoesNotCapture(F, 2);
return Changed;
case LibFunc::memcpy:
+ case LibFunc::mempcpy:
case LibFunc::memccpy:
case LibFunc::memmove:
Changed |= setDoesNotThrow(F);
diff --git a/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp b/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
index 42287d3..bc2cef2 100644
--- a/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -20,6 +20,7 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -82,13 +83,17 @@ static bool insertFastDiv(Instruction *I, IntegerType *BypassType,
Value *Dividend = I->getOperand(0);
Value *Divisor = I->getOperand(1);
- if (isa<ConstantInt>(Divisor) ||
- (isa<ConstantInt>(Dividend) && isa<ConstantInt>(Divisor))) {
- // Operations with immediate values should have
- // been solved and replaced during compile time.
+ if (isa<ConstantInt>(Divisor)) {
+ // Division by a constant should have been solved and replaced earlier
+ // in the pipeline.
return false;
}
+ // If the numerator is a constant, bail if it doesn't fit into BypassType.
+ if (ConstantInt *ConstDividend = dyn_cast<ConstantInt>(Dividend))
+ if (ConstDividend->getValue().getActiveBits() > BypassType->getBitWidth())
+ return false;
+
// Basic Block is split before divide
BasicBlock *MainBB = &*I->getParent();
BasicBlock *SuccessorBB = MainBB->splitBasicBlock(I);
@@ -120,8 +125,7 @@ static bool insertFastDiv(Instruction *I, IntegerType *BypassType,
BypassType);
// udiv/urem because optimization only handles positive numbers
- Value *ShortQuotientV = FastBuilder.CreateExactUDiv(ShortDividendV,
- ShortDivisorV);
+ Value *ShortQuotientV = FastBuilder.CreateUDiv(ShortDividendV, ShortDivisorV);
Value *ShortRemainderV = FastBuilder.CreateURem(ShortDividendV,
ShortDivisorV);
Value *FastQuotientV = FastBuilder.CreateCast(Instruction::ZExt,
@@ -151,7 +155,17 @@ static bool insertFastDiv(Instruction *I, IntegerType *BypassType,
// Combine operands into a single value with OR for value testing below
MainBB->getInstList().back().eraseFromParent();
IRBuilder<> MainBuilder(MainBB, MainBB->end());
- Value *OrV = MainBuilder.CreateOr(Dividend, Divisor);
+
+ // We should have bailed out above if the divisor is a constant, but the
+ // dividend may still be a constant. Set OrV to our non-constant operands
+ // OR'ed together.
+ assert(!isa<ConstantInt>(Divisor));
+
+ Value *OrV;
+ if (!isa<ConstantInt>(Dividend))
+ OrV = MainBuilder.CreateOr(Dividend, Divisor);
+ else
+ OrV = Divisor;
// BitMask is inverted to check if the operands are
// larger than the bypass type
@@ -247,5 +261,12 @@ bool llvm::bypassSlowDivision(
MadeChange |= reuseOrInsertFastDiv(I, BT, UseDivOp, UseSignedOp, DivCache);
}
+ // Above we eagerly create divs and rems, as pairs, so that we can efficiently
+ // create divrem machine instructions. Now erase any unused divs / rems so we
+ // don't leave extra instructions sitting around.
+ for (auto &KV : DivCache)
+ for (Instruction *Phi : {KV.second.Quotient, KV.second.Remainder})
+ RecursivelyDeleteTriviallyDeadInstructions(Phi);
+
return MadeChange;
}
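The transformed IR keeps the dispatch at run time: OR the non-constant operands together, test whether any bit above the bypass width is set, and branch to either the narrow unsigned divide or the original wide one. A standalone sketch of that test for a 64-bit division bypassed through a 32-bit type (plain C++ mirroring the emitted control flow, not the pass itself):

#include <cassert>
#include <cstdint>

static uint64_t bypassedUDiv(uint64_t Dividend, uint64_t Divisor) {
  const uint64_t BitMask = ~uint64_t(0xffffffffu);   // bits above the bypass type
  if (((Dividend | Divisor) & BitMask) == 0) {
    // Fast path: both operands fit in 32 bits, so a narrow udiv/urem suffices.
    return uint64_t(uint32_t(Dividend) / uint32_t(Divisor));
  }
  // Slow path: full 64-bit division.
  return Dividend / Divisor;
}

int main() {
  assert(bypassedUDiv(100, 7) == 100 / 7);
  assert(bypassedUDiv(uint64_t(1) << 40, 3) == (uint64_t(1) << 40) / 3);
  return 0;
}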
diff --git a/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp b/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp
index 17e34c4..7ebeb61 100644
--- a/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp
@@ -119,6 +119,11 @@ std::unique_ptr<Module> llvm::CloneModule(
}
if (I->hasInitializer())
GV->setInitializer(MapValue(I->getInitializer(), VMap));
+
+ SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
+ I->getAllMetadata(MDs);
+ for (auto MD : MDs)
+ GV->addMetadata(MD.first, *MapMetadata(MD.second, VMap));
}
// Similarly, copy over function bodies now...
diff --git a/contrib/llvm/lib/Transforms/Utils/CmpInstAnalysis.cpp b/contrib/llvm/lib/Transforms/Utils/CmpInstAnalysis.cpp
index 3b15a0a..60ae374 100644
--- a/contrib/llvm/lib/Transforms/Utils/CmpInstAnalysis.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/CmpInstAnalysis.cpp
@@ -18,29 +18,6 @@
using namespace llvm;
-/// getICmpCode - Encode a icmp predicate into a three bit mask. These bits
-/// are carefully arranged to allow folding of expressions such as:
-///
-/// (A < B) | (A > B) --> (A != B)
-///
-/// Note that this is only valid if the first and second predicates have the
-/// same sign. Is illegal to do: (A u< B) | (A s> B)
-///
-/// Three bits are used to represent the condition, as follows:
-/// 0 A > B
-/// 1 A == B
-/// 2 A < B
-///
-/// <=> Value Definition
-/// 000 0 Always false
-/// 001 1 A > B
-/// 010 2 A == B
-/// 011 3 A >= B
-/// 100 4 A < B
-/// 101 5 A != B
-/// 110 6 A <= B
-/// 111 7 Always true
-///
unsigned llvm::getICmpCode(const ICmpInst *ICI, bool InvertPred) {
ICmpInst::Predicate Pred = InvertPred ? ICI->getInversePredicate()
: ICI->getPredicate();
@@ -62,13 +39,6 @@ unsigned llvm::getICmpCode(const ICmpInst *ICI, bool InvertPred) {
}
}
-/// getICmpValue - This is the complement of getICmpCode, which turns an
-/// opcode and two operands into either a constant true or false, or the
-/// predicate for a new ICmp instruction. The sign is passed in to determine
-/// which kind of predicate to use in the new icmp instruction.
-/// Non-NULL return value will be a true or false constant.
-/// NULL return means a new ICmp is needed. The predicate for which is
-/// output in NewICmpPred.
Value *llvm::getICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS,
CmpInst::Predicate &NewICmpPred) {
switch (Code) {
@@ -87,10 +57,52 @@ Value *llvm::getICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS,
return nullptr;
}
-/// PredicatesFoldable - Return true if both predicates match sign or if at
-/// least one of them is an equality comparison (which is signless).
bool llvm::PredicatesFoldable(ICmpInst::Predicate p1, ICmpInst::Predicate p2) {
return (CmpInst::isSigned(p1) == CmpInst::isSigned(p2)) ||
(CmpInst::isSigned(p1) && ICmpInst::isEquality(p2)) ||
(CmpInst::isSigned(p2) && ICmpInst::isEquality(p1));
}
+
+bool llvm::decomposeBitTestICmp(const ICmpInst *I, CmpInst::Predicate &Pred,
+ Value *&X, Value *&Y, Value *&Z) {
+ ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1));
+ if (!C)
+ return false;
+
+ switch (I->getPredicate()) {
+ default:
+ return false;
+ case ICmpInst::ICMP_SLT:
+ // X < 0 is equivalent to (X & SignBit) != 0.
+ if (!C->isZero())
+ return false;
+ Y = ConstantInt::get(I->getContext(), APInt::getSignBit(C->getBitWidth()));
+ Pred = ICmpInst::ICMP_NE;
+ break;
+ case ICmpInst::ICMP_SGT:
+ // X > -1 is equivalent to (X & SignBit) == 0.
+ if (!C->isAllOnesValue())
+ return false;
+ Y = ConstantInt::get(I->getContext(), APInt::getSignBit(C->getBitWidth()));
+ Pred = ICmpInst::ICMP_EQ;
+ break;
+ case ICmpInst::ICMP_ULT:
+ // X <u 2^n is equivalent to (X & ~(2^n-1)) == 0.
+ if (!C->getValue().isPowerOf2())
+ return false;
+ Y = ConstantInt::get(I->getContext(), -C->getValue());
+ Pred = ICmpInst::ICMP_EQ;
+ break;
+ case ICmpInst::ICMP_UGT:
+ // X >u 2^n-1 is equivalent to (X & ~(2^n-1)) != 0.
+ if (!(C->getValue() + 1).isPowerOf2())
+ return false;
+ Y = ConstantInt::get(I->getContext(), ~C->getValue());
+ Pred = ICmpInst::ICMP_NE;
+ break;
+ }
+
+ X = I->getOperand(0);
+ Z = ConstantInt::getNullValue(C->getType());
+ return true;
+}
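decomposeBitTestICmp above rewrites four compare-with-constant forms as a masked test (X & Y) pred Z. The underlying equivalences can be checked on plain integers; a small self-contained sketch (illustrative 32-bit values, n = 3 for the power-of-two cases):

#include <cassert>
#include <cstdint>

int main() {
  for (int64_t x = -4; x <= 4; ++x) {
    uint32_t X = static_cast<uint32_t>(x);
    // X <s 0     <=>  (X & SignBit) != 0
    assert((static_cast<int32_t>(X) < 0) == ((X & 0x80000000u) != 0));
    // X >s -1    <=>  (X & SignBit) == 0
    assert((static_cast<int32_t>(X) > -1) == ((X & 0x80000000u) == 0));
    // X <u 2^n   <=>  (X & ~(2^n - 1)) == 0
    assert((X < 8u) == ((X & ~7u) == 0));
    // X >u 2^n-1 <=>  (X & ~(2^n - 1)) != 0
    assert((X > 7u) == ((X & ~7u) != 0));
  }
  return 0;
}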
diff --git a/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 9f2181f..c514c9c 100644
--- a/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -17,6 +17,9 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/RegionInfo.h"
#include "llvm/Analysis/RegionIterator.h"
@@ -26,9 +29,11 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
+#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -49,7 +54,7 @@ AggregateArgsOpt("aggregate-extracted-args", cl::Hidden,
cl::desc("Aggregate arguments to code-extracted functions"));
/// \brief Test whether a block is valid for extraction.
-static bool isBlockValidForExtraction(const BasicBlock &BB) {
+bool CodeExtractor::isBlockValidForExtraction(const BasicBlock &BB) {
// Landing pads must be in the function where they were inserted for cleanup.
if (BB.isEHPad())
return false;
@@ -81,7 +86,7 @@ static SetVector<BasicBlock *> buildExtractionBlockSet(IteratorT BBBegin,
if (!Result.insert(*BBBegin))
llvm_unreachable("Repeated basic blocks in extraction input");
- if (!isBlockValidForExtraction(**BBBegin)) {
+ if (!CodeExtractor::isBlockValidForExtraction(**BBBegin)) {
Result.clear();
return Result;
}
@@ -119,23 +124,30 @@ buildExtractionBlockSet(const RegionNode &RN) {
return buildExtractionBlockSet(R.block_begin(), R.block_end());
}
-CodeExtractor::CodeExtractor(BasicBlock *BB, bool AggregateArgs)
- : DT(nullptr), AggregateArgs(AggregateArgs||AggregateArgsOpt),
- Blocks(buildExtractionBlockSet(BB)), NumExitBlocks(~0U) {}
+CodeExtractor::CodeExtractor(BasicBlock *BB, bool AggregateArgs,
+ BlockFrequencyInfo *BFI,
+ BranchProbabilityInfo *BPI)
+ : DT(nullptr), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),
+ BPI(BPI), Blocks(buildExtractionBlockSet(BB)), NumExitBlocks(~0U) {}
CodeExtractor::CodeExtractor(ArrayRef<BasicBlock *> BBs, DominatorTree *DT,
- bool AggregateArgs)
- : DT(DT), AggregateArgs(AggregateArgs||AggregateArgsOpt),
- Blocks(buildExtractionBlockSet(BBs)), NumExitBlocks(~0U) {}
-
-CodeExtractor::CodeExtractor(DominatorTree &DT, Loop &L, bool AggregateArgs)
- : DT(&DT), AggregateArgs(AggregateArgs||AggregateArgsOpt),
- Blocks(buildExtractionBlockSet(L.getBlocks())), NumExitBlocks(~0U) {}
+ bool AggregateArgs, BlockFrequencyInfo *BFI,
+ BranchProbabilityInfo *BPI)
+ : DT(DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),
+ BPI(BPI), Blocks(buildExtractionBlockSet(BBs)), NumExitBlocks(~0U) {}
+
+CodeExtractor::CodeExtractor(DominatorTree &DT, Loop &L, bool AggregateArgs,
+ BlockFrequencyInfo *BFI,
+ BranchProbabilityInfo *BPI)
+ : DT(&DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),
+ BPI(BPI), Blocks(buildExtractionBlockSet(L.getBlocks())),
+ NumExitBlocks(~0U) {}
CodeExtractor::CodeExtractor(DominatorTree &DT, const RegionNode &RN,
- bool AggregateArgs)
- : DT(&DT), AggregateArgs(AggregateArgs||AggregateArgsOpt),
- Blocks(buildExtractionBlockSet(RN)), NumExitBlocks(~0U) {}
+ bool AggregateArgs, BlockFrequencyInfo *BFI,
+ BranchProbabilityInfo *BPI)
+ : DT(&DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),
+ BPI(BPI), Blocks(buildExtractionBlockSet(RN)), NumExitBlocks(~0U) {}
/// definedInRegion - Return true if the specified value is defined in the
/// extracted region.
@@ -339,7 +351,22 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
// If the old function is no-throw, so is the new one.
if (oldFunction->doesNotThrow())
newFunction->setDoesNotThrow();
-
+
+ // Inherit the uwtable attribute if we need to.
+ if (oldFunction->hasUWTable())
+ newFunction->setHasUWTable();
+
+ // Inherit all of the target dependent attributes.
+ // (e.g. if the extracted region contains a call to an x86.sse
+ // instruction we need to make sure that the extracted region has the
+ // "target-features" attribute allowing it to be lowered).
+ // FIXME: This should be changed to check to see if a specific
+ // attribute can not be inherited.
+ AttributeSet OldFnAttrs = oldFunction->getAttributes().getFnAttributes();
+ AttrBuilder AB(OldFnAttrs, AttributeSet::FunctionIndex);
+ for (auto Attr : AB.td_attrs())
+ newFunction->addFnAttr(Attr.first, Attr.second);
+
newFunction->getBasicBlockList().push_back(newRootNode);
// Create an iterator to name all of the arguments we inserted.
@@ -672,6 +699,51 @@ void CodeExtractor::moveCodeToFunction(Function *newFunction) {
}
}
+void CodeExtractor::calculateNewCallTerminatorWeights(
+ BasicBlock *CodeReplacer,
+ DenseMap<BasicBlock *, BlockFrequency> &ExitWeights,
+ BranchProbabilityInfo *BPI) {
+ typedef BlockFrequencyInfoImplBase::Distribution Distribution;
+ typedef BlockFrequencyInfoImplBase::BlockNode BlockNode;
+
+ // Update the branch weights for the exit block.
+ TerminatorInst *TI = CodeReplacer->getTerminator();
+ SmallVector<unsigned, 8> BranchWeights(TI->getNumSuccessors(), 0);
+
+ // Block Frequency distribution with dummy node.
+ Distribution BranchDist;
+
+ // Add each of the frequencies of the successors.
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i < e; ++i) {
+ BlockNode ExitNode(i);
+ uint64_t ExitFreq = ExitWeights[TI->getSuccessor(i)].getFrequency();
+ if (ExitFreq != 0)
+ BranchDist.addExit(ExitNode, ExitFreq);
+ else
+ BPI->setEdgeProbability(CodeReplacer, i, BranchProbability::getZero());
+ }
+
+ // Check for no total weight.
+ if (BranchDist.Total == 0)
+ return;
+
+ // Normalize the distribution so that they can fit in unsigned.
+ BranchDist.normalize();
+
+ // Create normalized branch weights and set the metadata.
+ for (unsigned I = 0, E = BranchDist.Weights.size(); I < E; ++I) {
+ const auto &Weight = BranchDist.Weights[I];
+
+ // Get the weight and update the current BFI.
+ BranchWeights[Weight.TargetNode.Index] = Weight.Amount;
+ BranchProbability BP(Weight.Amount, BranchDist.Total);
+ BPI->setEdgeProbability(CodeReplacer, Weight.TargetNode.Index, BP);
+ }
+ TI->setMetadata(
+ LLVMContext::MD_prof,
+ MDBuilder(TI->getContext()).createBranchWeights(BranchWeights));
+}
+
Function *CodeExtractor::extractCodeRegion() {
if (!isEligible())
return nullptr;
@@ -682,6 +754,19 @@ Function *CodeExtractor::extractCodeRegion() {
// block in the region.
BasicBlock *header = *Blocks.begin();
+ // Calculate the entry frequency of the new function before we change the root
+ // block.
+ BlockFrequency EntryFreq;
+ if (BFI) {
+ assert(BPI && "Both BPI and BFI are required to preserve profile info");
+ for (BasicBlock *Pred : predecessors(header)) {
+ if (Blocks.count(Pred))
+ continue;
+ EntryFreq +=
+ BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, header);
+ }
+ }
+
// If we have to split PHI nodes or the entry block, do so now.
severSplitPHINodes(header);
@@ -705,12 +790,23 @@ Function *CodeExtractor::extractCodeRegion() {
// Find inputs to, outputs from the code region.
findInputsOutputs(inputs, outputs);
+ // Calculate the exit blocks for the extracted region and the total exit
+ // weights for each of those blocks.
+ DenseMap<BasicBlock *, BlockFrequency> ExitWeights;
SmallPtrSet<BasicBlock *, 1> ExitBlocks;
- for (BasicBlock *Block : Blocks)
+ for (BasicBlock *Block : Blocks) {
for (succ_iterator SI = succ_begin(Block), SE = succ_end(Block); SI != SE;
- ++SI)
- if (!Blocks.count(*SI))
+ ++SI) {
+ if (!Blocks.count(*SI)) {
+ // Update the branch weight for this successor.
+ if (BFI) {
+ BlockFrequency &BF = ExitWeights[*SI];
+ BF += BFI->getBlockFreq(Block) * BPI->getEdgeProbability(Block, *SI);
+ }
ExitBlocks.insert(*SI);
+ }
+ }
+ }
NumExitBlocks = ExitBlocks.size();
// Construct new function based on inputs/outputs & add allocas for all defs.
@@ -719,10 +815,23 @@ Function *CodeExtractor::extractCodeRegion() {
codeReplacer, oldFunction,
oldFunction->getParent());
+ // Update the entry count of the function.
+ if (BFI) {
+ Optional<uint64_t> EntryCount =
+ BFI->getProfileCountFromFreq(EntryFreq.getFrequency());
+ if (EntryCount.hasValue())
+ newFunction->setEntryCount(EntryCount.getValue());
+ BFI->setBlockFreq(codeReplacer, EntryFreq.getFrequency());
+ }
+
emitCallAndSwitchStatement(newFunction, codeReplacer, inputs, outputs);
moveCodeToFunction(newFunction);
+ // Update the branch weights for the exit block.
+ if (BFI && NumExitBlocks > 1)
+ calculateNewCallTerminatorWeights(codeReplacer, ExitWeights, BPI);
+
// Loop over all of the PHI nodes in the header block, and change any
// references to the old incoming edge to be the new incoming edge.
for (BasicBlock::iterator I = header->begin(); isa<PHINode>(I); ++I) {
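calculateNewCallTerminatorWeights above funnels the accumulated exit-block frequencies through BlockFrequencyInfoImplBase::Distribution so they can be re-attached to the new call site's terminator as !prof branch weights. The essential step is proportional scaling into a 32-bit range; a standalone sketch with made-up frequencies (plain arithmetic, not the Distribution class itself):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical raw exit frequencies, too large to use as weights directly.
  std::vector<uint64_t> ExitFreqs = {3000000000000ULL, 1000000000000ULL};

  uint64_t MaxFreq = *std::max_element(ExitFreqs.begin(), ExitFreqs.end());
  uint64_t Scale = MaxFreq / UINT32_MAX + 1;     // bring the largest into range

  std::vector<uint32_t> Weights;
  for (uint64_t F : ExitFreqs)
    Weights.push_back(static_cast<uint32_t>(F / Scale));

  for (uint32_t W : Weights)
    std::printf("%u\n", W);                      // ratio stays roughly 3 : 1
  return 0;
}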
diff --git a/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp b/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp
index b56ff68..6642a97 100644
--- a/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp
@@ -71,8 +71,8 @@ std::vector<Function *> parseGlobalCtors(GlobalVariable *GV) {
ConstantArray *CA = cast<ConstantArray>(GV->getInitializer());
std::vector<Function *> Result;
Result.reserve(CA->getNumOperands());
- for (User::op_iterator i = CA->op_begin(), e = CA->op_end(); i != e; ++i) {
- ConstantStruct *CS = cast<ConstantStruct>(*i);
+ for (auto &V : CA->operands()) {
+ ConstantStruct *CS = cast<ConstantStruct>(V);
Result.push_back(dyn_cast<Function>(CS->getOperand(1)));
}
return Result;
@@ -94,10 +94,10 @@ GlobalVariable *findGlobalCtors(Module &M) {
return GV;
ConstantArray *CA = cast<ConstantArray>(GV->getInitializer());
- for (User::op_iterator i = CA->op_begin(), e = CA->op_end(); i != e; ++i) {
- if (isa<ConstantAggregateZero>(*i))
+ for (auto &V : CA->operands()) {
+ if (isa<ConstantAggregateZero>(V))
continue;
- ConstantStruct *CS = cast<ConstantStruct>(*i);
+ ConstantStruct *CS = cast<ConstantStruct>(V);
if (isa<ConstantPointerNull>(CS->getOperand(1)))
continue;
diff --git a/contrib/llvm/lib/Transforms/Utils/EscapeEnumerator.cpp b/contrib/llvm/lib/Transforms/Utils/EscapeEnumerator.cpp
new file mode 100644
index 0000000..8c23865
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/EscapeEnumerator.cpp
@@ -0,0 +1,96 @@
+//===- EscapeEnumerator.cpp -----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines a helper class that enumerates all possible exits from a function,
+// including exception handling.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/EscapeEnumerator.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+static Constant *getDefaultPersonalityFn(Module *M) {
+ LLVMContext &C = M->getContext();
+ Triple T(M->getTargetTriple());
+ EHPersonality Pers = getDefaultEHPersonality(T);
+ return M->getOrInsertFunction(getEHPersonalityName(Pers),
+ FunctionType::get(Type::getInt32Ty(C), true));
+}
+
+IRBuilder<> *EscapeEnumerator::Next() {
+ if (Done)
+ return nullptr;
+
+ // Find all 'return', 'resume', and 'unwind' instructions.
+ while (StateBB != StateE) {
+ BasicBlock *CurBB = &*StateBB++;
+
+ // Branches and invokes do not escape, only unwind, resume, and return
+ // do.
+ TerminatorInst *TI = CurBB->getTerminator();
+ if (!isa<ReturnInst>(TI) && !isa<ResumeInst>(TI))
+ continue;
+
+ Builder.SetInsertPoint(TI);
+ return &Builder;
+ }
+
+ Done = true;
+
+ if (!HandleExceptions)
+ return nullptr;
+
+ if (F.doesNotThrow())
+ return nullptr;
+
+ // Find all 'call' instructions that may throw.
+ SmallVector<Instruction *, 16> Calls;
+ for (BasicBlock &BB : F)
+ for (Instruction &II : BB)
+ if (CallInst *CI = dyn_cast<CallInst>(&II))
+ if (!CI->doesNotThrow())
+ Calls.push_back(CI);
+
+ if (Calls.empty())
+ return nullptr;
+
+ // Create a cleanup block.
+ LLVMContext &C = F.getContext();
+ BasicBlock *CleanupBB = BasicBlock::Create(C, CleanupBBName, &F);
+ Type *ExnTy =
+ StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C), nullptr);
+ if (!F.hasPersonalityFn()) {
+ Constant *PersFn = getDefaultPersonalityFn(F.getParent());
+ F.setPersonalityFn(PersFn);
+ }
+
+ if (isFuncletEHPersonality(classifyEHPersonality(F.getPersonalityFn()))) {
+ report_fatal_error("Funclet EH not supported");
+ }
+
+ LandingPadInst *LPad =
+ LandingPadInst::Create(ExnTy, 1, "cleanup.lpad", CleanupBB);
+ LPad->setCleanup(true);
+ ResumeInst *RI = ResumeInst::Create(LPad, CleanupBB);
+
+ // Transform the 'call' instructions into 'invoke's branching to the
+ // cleanup block. Go in reverse order to make prettier BB names.
+ SmallVector<Value *, 16> Args;
+ for (unsigned I = Calls.size(); I != 0;) {
+ CallInst *CI = cast<CallInst>(Calls[--I]);
+ changeToInvokeAndSplitBasicBlock(CI, CleanupBB);
+ }
+
+ Builder.SetInsertPoint(RI);
+ return &Builder;
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/Evaluator.cpp b/contrib/llvm/lib/Transforms/Utils/Evaluator.cpp
index cd130ab..4adf175 100644
--- a/contrib/llvm/lib/Transforms/Utils/Evaluator.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/Evaluator.cpp
@@ -203,9 +203,9 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
return false; // no volatile/atomic accesses.
}
Constant *Ptr = getVal(SI->getOperand(1));
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
+ if (auto *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI)) {
DEBUG(dbgs() << "Folding constant ptr expression: " << *Ptr);
- Ptr = ConstantFoldConstantExpression(CE, DL, TLI);
+ Ptr = FoldedPtr;
DEBUG(dbgs() << "; To: " << *Ptr << "\n");
}
if (!isSimpleEnoughPointerToCommit(Ptr)) {
@@ -249,8 +249,8 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
Constant * const IdxList[] = {IdxZero, IdxZero};
Ptr = ConstantExpr::getGetElementPtr(nullptr, Ptr, IdxList);
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
- Ptr = ConstantFoldConstantExpression(CE, DL, TLI);
+ if (auto *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI))
+ Ptr = FoldedPtr;
// If we can't improve the situation by introspecting NewTy,
// we have to give up.
@@ -324,8 +324,8 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
}
Constant *Ptr = getVal(LI->getOperand(0));
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
- Ptr = ConstantFoldConstantExpression(CE, DL, TLI);
+ if (auto *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI)) {
+ Ptr = FoldedPtr;
DEBUG(dbgs() << "Found a constant pointer expression, constant "
"folding: " << *Ptr << "\n");
}
@@ -512,8 +512,8 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
}
if (!CurInst->use_empty()) {
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(InstResult))
- InstResult = ConstantFoldConstantExpression(CE, DL, TLI);
+ if (auto *FoldedInstResult = ConstantFoldConstant(InstResult, DL, TLI))
+ InstResult = FoldedInstResult;
setVal(&*CurInst, InstResult);
}
@@ -537,7 +537,7 @@ bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal,
const SmallVectorImpl<Constant*> &ActualArgs) {
// Check to see if this function is already executing (recursion). If so,
// bail out. TODO: we might want to accept limited recursion.
- if (std::find(CallStack.begin(), CallStack.end(), F) != CallStack.end())
+ if (is_contained(CallStack, F))
return false;
CallStack.push_back(F);
diff --git a/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp b/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp
index 492ae9f..7b96fbb 100644
--- a/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp
@@ -463,19 +463,14 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) {
}
bool FlattenCFGOpt::run(BasicBlock *BB) {
- bool Changed = false;
assert(BB && BB->getParent() && "Block not embedded in function!");
assert(BB->getTerminator() && "Degenerate basic block encountered!");
IRBuilder<> Builder(BB);
- if (FlattenParallelAndOr(BB, Builder))
+ if (FlattenParallelAndOr(BB, Builder) || MergeIfRegion(BB, Builder))
return true;
-
- if (MergeIfRegion(BB, Builder))
- return true;
-
- return Changed;
+ return false;
}
/// FlattenCFG - This function is used to flatten a CFG. For
diff --git a/contrib/llvm/lib/Transforms/Utils/FunctionComparator.cpp b/contrib/llvm/lib/Transforms/Utils/FunctionComparator.cpp
new file mode 100644
index 0000000..81a7c4c
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/FunctionComparator.cpp
@@ -0,0 +1,919 @@
+//===- FunctionComparator.cpp - Function Comparator -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the FunctionComparator and GlobalNumberState classes
+// which are used by the MergeFunctions pass for comparing functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/FunctionComparator.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "functioncomparator"
+
+int FunctionComparator::cmpNumbers(uint64_t L, uint64_t R) const {
+ if (L < R) return -1;
+ if (L > R) return 1;
+ return 0;
+}
+
+int FunctionComparator::cmpOrderings(AtomicOrdering L, AtomicOrdering R) const {
+ if ((int)L < (int)R) return -1;
+ if ((int)L > (int)R) return 1;
+ return 0;
+}
+
+int FunctionComparator::cmpAPInts(const APInt &L, const APInt &R) const {
+ if (int Res = cmpNumbers(L.getBitWidth(), R.getBitWidth()))
+ return Res;
+ if (L.ugt(R)) return 1;
+ if (R.ugt(L)) return -1;
+ return 0;
+}
+
+int FunctionComparator::cmpAPFloats(const APFloat &L, const APFloat &R) const {
+ // Floats are ordered first by semantics (i.e. float, double, half, etc.),
+ // then by value interpreted as a bitstring (aka APInt).
+ const fltSemantics &SL = L.getSemantics(), &SR = R.getSemantics();
+ if (int Res = cmpNumbers(APFloat::semanticsPrecision(SL),
+ APFloat::semanticsPrecision(SR)))
+ return Res;
+ if (int Res = cmpNumbers(APFloat::semanticsMaxExponent(SL),
+ APFloat::semanticsMaxExponent(SR)))
+ return Res;
+ if (int Res = cmpNumbers(APFloat::semanticsMinExponent(SL),
+ APFloat::semanticsMinExponent(SR)))
+ return Res;
+ if (int Res = cmpNumbers(APFloat::semanticsSizeInBits(SL),
+ APFloat::semanticsSizeInBits(SR)))
+ return Res;
+ return cmpAPInts(L.bitcastToAPInt(), R.bitcastToAPInt());
+}
+
+int FunctionComparator::cmpMem(StringRef L, StringRef R) const {
+ // Prevent heavy comparison, compare sizes first.
+ if (int Res = cmpNumbers(L.size(), R.size()))
+ return Res;
+
+ // Compare strings lexicographically only when it is necessary: only when
+ // strings are equal in size.
+ return L.compare(R);
+}
+
+int FunctionComparator::cmpAttrs(const AttributeSet L,
+ const AttributeSet R) const {
+ if (int Res = cmpNumbers(L.getNumSlots(), R.getNumSlots()))
+ return Res;
+
+ for (unsigned i = 0, e = L.getNumSlots(); i != e; ++i) {
+ AttributeSet::iterator LI = L.begin(i), LE = L.end(i), RI = R.begin(i),
+ RE = R.end(i);
+ for (; LI != LE && RI != RE; ++LI, ++RI) {
+ Attribute LA = *LI;
+ Attribute RA = *RI;
+ if (LA < RA)
+ return -1;
+ if (RA < LA)
+ return 1;
+ }
+ if (LI != LE)
+ return 1;
+ if (RI != RE)
+ return -1;
+ }
+ return 0;
+}
+
+int FunctionComparator::cmpRangeMetadata(const MDNode *L,
+ const MDNode *R) const {
+ if (L == R)
+ return 0;
+ if (!L)
+ return -1;
+ if (!R)
+ return 1;
+ // Range metadata is a sequence of numbers. Make sure they are the same
+ // sequence.
+ // TODO: Note that as this is metadata, it is possible to drop and/or merge
+ // this data when considering functions to merge. Thus this comparison would
+ // return 0 (i.e. equivalent), but merging would become more complicated
+ // because the ranges would need to be unioned. It is not likely that
+ // functions differ ONLY in this metadata if they are actually the same
+ // function semantically.
+ if (int Res = cmpNumbers(L->getNumOperands(), R->getNumOperands()))
+ return Res;
+ for (size_t I = 0; I < L->getNumOperands(); ++I) {
+ ConstantInt *LLow = mdconst::extract<ConstantInt>(L->getOperand(I));
+ ConstantInt *RLow = mdconst::extract<ConstantInt>(R->getOperand(I));
+ if (int Res = cmpAPInts(LLow->getValue(), RLow->getValue()))
+ return Res;
+ }
+ return 0;
+}
+
+int FunctionComparator::cmpOperandBundlesSchema(const Instruction *L,
+ const Instruction *R) const {
+ ImmutableCallSite LCS(L);
+ ImmutableCallSite RCS(R);
+
+ assert(LCS && RCS && "Must be calls or invokes!");
+ assert(LCS.isCall() == RCS.isCall() && "Can't compare otherwise!");
+
+ if (int Res =
+ cmpNumbers(LCS.getNumOperandBundles(), RCS.getNumOperandBundles()))
+ return Res;
+
+ for (unsigned i = 0, e = LCS.getNumOperandBundles(); i != e; ++i) {
+ auto OBL = LCS.getOperandBundleAt(i);
+ auto OBR = RCS.getOperandBundleAt(i);
+
+ if (int Res = OBL.getTagName().compare(OBR.getTagName()))
+ return Res;
+
+ if (int Res = cmpNumbers(OBL.Inputs.size(), OBR.Inputs.size()))
+ return Res;
+ }
+
+ return 0;
+}
+
+/// Constants comparison:
+/// 1. Check whether type of L constant could be losslessly bitcasted to R
+/// type.
+/// 2. Compare constant contents.
+/// For more details see declaration comments.
+int FunctionComparator::cmpConstants(const Constant *L,
+ const Constant *R) const {
+
+ Type *TyL = L->getType();
+ Type *TyR = R->getType();
+
+ // Check whether types are bitcastable. This part is just re-factored
+ // Type::canLosslesslyBitCastTo method, but instead of returning true/false,
+ // we also pack into result which type is "less" for us.
+ int TypesRes = cmpTypes(TyL, TyR);
+ if (TypesRes != 0) {
+ // Types are different, but check whether we can bitcast them.
+ if (!TyL->isFirstClassType()) {
+ if (TyR->isFirstClassType())
+ return -1;
+ // Neither TyL nor TyR are values of first class type. Return the result
+ // of comparing the types
+ return TypesRes;
+ }
+ if (!TyR->isFirstClassType()) {
+ if (TyL->isFirstClassType())
+ return 1;
+ return TypesRes;
+ }
+
+ // Vector -> Vector conversions are always lossless if the two vector types
+ // have the same size, otherwise not.
+ unsigned TyLWidth = 0;
+ unsigned TyRWidth = 0;
+
+ if (auto *VecTyL = dyn_cast<VectorType>(TyL))
+ TyLWidth = VecTyL->getBitWidth();
+ if (auto *VecTyR = dyn_cast<VectorType>(TyR))
+ TyRWidth = VecTyR->getBitWidth();
+
+ if (TyLWidth != TyRWidth)
+ return cmpNumbers(TyLWidth, TyRWidth);
+
+ // Zero bit-width means neither TyL nor TyR are vectors.
+ if (!TyLWidth) {
+ PointerType *PTyL = dyn_cast<PointerType>(TyL);
+ PointerType *PTyR = dyn_cast<PointerType>(TyR);
+ if (PTyL && PTyR) {
+ unsigned AddrSpaceL = PTyL->getAddressSpace();
+ unsigned AddrSpaceR = PTyR->getAddressSpace();
+ if (int Res = cmpNumbers(AddrSpaceL, AddrSpaceR))
+ return Res;
+ }
+ if (PTyL)
+ return 1;
+ if (PTyR)
+ return -1;
+
+ // TyL and TyR aren't vectors, nor pointers. We don't know how to
+ // bitcast them.
+ return TypesRes;
+ }
+ }
+
+ // OK, types are bitcastable, now check constant contents.
+
+ if (L->isNullValue() && R->isNullValue())
+ return TypesRes;
+ if (L->isNullValue() && !R->isNullValue())
+ return 1;
+ if (!L->isNullValue() && R->isNullValue())
+ return -1;
+
+ auto GlobalValueL = const_cast<GlobalValue*>(dyn_cast<GlobalValue>(L));
+ auto GlobalValueR = const_cast<GlobalValue*>(dyn_cast<GlobalValue>(R));
+ if (GlobalValueL && GlobalValueR) {
+ return cmpGlobalValues(GlobalValueL, GlobalValueR);
+ }
+
+ if (int Res = cmpNumbers(L->getValueID(), R->getValueID()))
+ return Res;
+
+ if (const auto *SeqL = dyn_cast<ConstantDataSequential>(L)) {
+ const auto *SeqR = cast<ConstantDataSequential>(R);
+ // This handles ConstantDataArray and ConstantDataVector. Note that we
+ // compare the two raw data arrays, which might differ depending on the host
+ // endianness. This isn't a problem though, because the endianness of a module
+ // will affect the order of the constants, but this order is the same
+ // for a given input module and host platform.
+ return cmpMem(SeqL->getRawDataValues(), SeqR->getRawDataValues());
+ }
+
+ switch (L->getValueID()) {
+ case Value::UndefValueVal:
+ case Value::ConstantTokenNoneVal:
+ return TypesRes;
+ case Value::ConstantIntVal: {
+ const APInt &LInt = cast<ConstantInt>(L)->getValue();
+ const APInt &RInt = cast<ConstantInt>(R)->getValue();
+ return cmpAPInts(LInt, RInt);
+ }
+ case Value::ConstantFPVal: {
+ const APFloat &LAPF = cast<ConstantFP>(L)->getValueAPF();
+ const APFloat &RAPF = cast<ConstantFP>(R)->getValueAPF();
+ return cmpAPFloats(LAPF, RAPF);
+ }
+ case Value::ConstantArrayVal: {
+ const ConstantArray *LA = cast<ConstantArray>(L);
+ const ConstantArray *RA = cast<ConstantArray>(R);
+ uint64_t NumElementsL = cast<ArrayType>(TyL)->getNumElements();
+ uint64_t NumElementsR = cast<ArrayType>(TyR)->getNumElements();
+ if (int Res = cmpNumbers(NumElementsL, NumElementsR))
+ return Res;
+ for (uint64_t i = 0; i < NumElementsL; ++i) {
+ if (int Res = cmpConstants(cast<Constant>(LA->getOperand(i)),
+ cast<Constant>(RA->getOperand(i))))
+ return Res;
+ }
+ return 0;
+ }
+ case Value::ConstantStructVal: {
+ const ConstantStruct *LS = cast<ConstantStruct>(L);
+ const ConstantStruct *RS = cast<ConstantStruct>(R);
+ unsigned NumElementsL = cast<StructType>(TyL)->getNumElements();
+ unsigned NumElementsR = cast<StructType>(TyR)->getNumElements();
+ if (int Res = cmpNumbers(NumElementsL, NumElementsR))
+ return Res;
+ for (unsigned i = 0; i != NumElementsL; ++i) {
+ if (int Res = cmpConstants(cast<Constant>(LS->getOperand(i)),
+ cast<Constant>(RS->getOperand(i))))
+ return Res;
+ }
+ return 0;
+ }
+ case Value::ConstantVectorVal: {
+ const ConstantVector *LV = cast<ConstantVector>(L);
+ const ConstantVector *RV = cast<ConstantVector>(R);
+ unsigned NumElementsL = cast<VectorType>(TyL)->getNumElements();
+ unsigned NumElementsR = cast<VectorType>(TyR)->getNumElements();
+ if (int Res = cmpNumbers(NumElementsL, NumElementsR))
+ return Res;
+ for (uint64_t i = 0; i < NumElementsL; ++i) {
+ if (int Res = cmpConstants(cast<Constant>(LV->getOperand(i)),
+ cast<Constant>(RV->getOperand(i))))
+ return Res;
+ }
+ return 0;
+ }
+ case Value::ConstantExprVal: {
+ const ConstantExpr *LE = cast<ConstantExpr>(L);
+ const ConstantExpr *RE = cast<ConstantExpr>(R);
+ unsigned NumOperandsL = LE->getNumOperands();
+ unsigned NumOperandsR = RE->getNumOperands();
+ if (int Res = cmpNumbers(NumOperandsL, NumOperandsR))
+ return Res;
+ for (unsigned i = 0; i < NumOperandsL; ++i) {
+ if (int Res = cmpConstants(cast<Constant>(LE->getOperand(i)),
+ cast<Constant>(RE->getOperand(i))))
+ return Res;
+ }
+ return 0;
+ }
+ case Value::BlockAddressVal: {
+ const BlockAddress *LBA = cast<BlockAddress>(L);
+ const BlockAddress *RBA = cast<BlockAddress>(R);
+ if (int Res = cmpValues(LBA->getFunction(), RBA->getFunction()))
+ return Res;
+ if (LBA->getFunction() == RBA->getFunction()) {
+ // They are BBs in the same function. Order by which comes first in the
+ // BB order of the function. This order is deterministic.
+ Function* F = LBA->getFunction();
+ BasicBlock *LBB = LBA->getBasicBlock();
+ BasicBlock *RBB = RBA->getBasicBlock();
+ if (LBB == RBB)
+ return 0;
+ for(BasicBlock &BB : F->getBasicBlockList()) {
+ if (&BB == LBB) {
+ assert(&BB != RBB);
+ return -1;
+ }
+ if (&BB == RBB)
+ return 1;
+ }
+ llvm_unreachable("Basic Block Address does not point to a basic block in "
+ "its function.");
+ return -1;
+ } else {
+ // cmpValues said the functions are the same. So because they aren't
+ // literally the same pointer, they must respectively be the left and
+ // right functions.
+ assert(LBA->getFunction() == FnL && RBA->getFunction() == FnR);
+ // cmpValues will tell us if these are equivalent BasicBlocks, in the
+ // context of their respective functions.
+ return cmpValues(LBA->getBasicBlock(), RBA->getBasicBlock());
+ }
+ }
+ default: // Unknown constant, abort.
+ DEBUG(dbgs() << "Looking at valueID " << L->getValueID() << "\n");
+ llvm_unreachable("Constant ValueID not recognized.");
+ return -1;
+ }
+}
+
+int FunctionComparator::cmpGlobalValues(GlobalValue *L, GlobalValue *R) const {
+ uint64_t LNumber = GlobalNumbers->getNumber(L);
+ uint64_t RNumber = GlobalNumbers->getNumber(R);
+ return cmpNumbers(LNumber, RNumber);
+}
+
+/// cmpType - compares two types,
+/// defines total ordering among the types set.
+/// See method declaration comments for more details.
+int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const {
+ PointerType *PTyL = dyn_cast<PointerType>(TyL);
+ PointerType *PTyR = dyn_cast<PointerType>(TyR);
+
+ const DataLayout &DL = FnL->getParent()->getDataLayout();
+ if (PTyL && PTyL->getAddressSpace() == 0)
+ TyL = DL.getIntPtrType(TyL);
+ if (PTyR && PTyR->getAddressSpace() == 0)
+ TyR = DL.getIntPtrType(TyR);
+
+ if (TyL == TyR)
+ return 0;
+
+ if (int Res = cmpNumbers(TyL->getTypeID(), TyR->getTypeID()))
+ return Res;
+
+ switch (TyL->getTypeID()) {
+ default:
+ llvm_unreachable("Unknown type!");
+ // Fall through in Release mode.
+ LLVM_FALLTHROUGH;
+ case Type::IntegerTyID:
+ return cmpNumbers(cast<IntegerType>(TyL)->getBitWidth(),
+ cast<IntegerType>(TyR)->getBitWidth());
+ // TyL == TyR would have returned true earlier, because types are uniqued.
+ case Type::VoidTyID:
+ case Type::FloatTyID:
+ case Type::DoubleTyID:
+ case Type::X86_FP80TyID:
+ case Type::FP128TyID:
+ case Type::PPC_FP128TyID:
+ case Type::LabelTyID:
+ case Type::MetadataTyID:
+ case Type::TokenTyID:
+ return 0;
+
+ case Type::PointerTyID: {
+ assert(PTyL && PTyR && "Both types must be pointers here.");
+ return cmpNumbers(PTyL->getAddressSpace(), PTyR->getAddressSpace());
+ }
+
+ case Type::StructTyID: {
+ StructType *STyL = cast<StructType>(TyL);
+ StructType *STyR = cast<StructType>(TyR);
+ if (STyL->getNumElements() != STyR->getNumElements())
+ return cmpNumbers(STyL->getNumElements(), STyR->getNumElements());
+
+ if (STyL->isPacked() != STyR->isPacked())
+ return cmpNumbers(STyL->isPacked(), STyR->isPacked());
+
+ for (unsigned i = 0, e = STyL->getNumElements(); i != e; ++i) {
+ if (int Res = cmpTypes(STyL->getElementType(i), STyR->getElementType(i)))
+ return Res;
+ }
+ return 0;
+ }
+
+ case Type::FunctionTyID: {
+ FunctionType *FTyL = cast<FunctionType>(TyL);
+ FunctionType *FTyR = cast<FunctionType>(TyR);
+ if (FTyL->getNumParams() != FTyR->getNumParams())
+ return cmpNumbers(FTyL->getNumParams(), FTyR->getNumParams());
+
+ if (FTyL->isVarArg() != FTyR->isVarArg())
+ return cmpNumbers(FTyL->isVarArg(), FTyR->isVarArg());
+
+ if (int Res = cmpTypes(FTyL->getReturnType(), FTyR->getReturnType()))
+ return Res;
+
+ for (unsigned i = 0, e = FTyL->getNumParams(); i != e; ++i) {
+ if (int Res = cmpTypes(FTyL->getParamType(i), FTyR->getParamType(i)))
+ return Res;
+ }
+ return 0;
+ }
+
+ case Type::ArrayTyID:
+ case Type::VectorTyID: {
+ auto *STyL = cast<SequentialType>(TyL);
+ auto *STyR = cast<SequentialType>(TyR);
+ if (STyL->getNumElements() != STyR->getNumElements())
+ return cmpNumbers(STyL->getNumElements(), STyR->getNumElements());
+ return cmpTypes(STyL->getElementType(), STyR->getElementType());
+ }
+ }
+}
+
+// Determine whether the two operations are the same except that pointer-to-A
+// and pointer-to-B are equivalent. This should be kept in sync with
+// Instruction::isSameOperationAs.
+// Read method declaration comments for more details.
+int FunctionComparator::cmpOperations(const Instruction *L,
+ const Instruction *R,
+ bool &needToCmpOperands) const {
+ needToCmpOperands = true;
+ if (int Res = cmpValues(L, R))
+ return Res;
+
+ // Differences from Instruction::isSameOperationAs:
+ // * replace type comparison with calls to cmpTypes.
+ // * we test for I->getRawSubclassOptionalData (nuw/nsw/tail) at the top.
+ // * because of the above, we don't test for the tail bit on calls later on.
+ if (int Res = cmpNumbers(L->getOpcode(), R->getOpcode()))
+ return Res;
+
+ if (const GetElementPtrInst *GEPL = dyn_cast<GetElementPtrInst>(L)) {
+ needToCmpOperands = false;
+ const GetElementPtrInst *GEPR = cast<GetElementPtrInst>(R);
+ if (int Res =
+ cmpValues(GEPL->getPointerOperand(), GEPR->getPointerOperand()))
+ return Res;
+ return cmpGEPs(GEPL, GEPR);
+ }
+
+ if (int Res = cmpNumbers(L->getNumOperands(), R->getNumOperands()))
+ return Res;
+
+ if (int Res = cmpTypes(L->getType(), R->getType()))
+ return Res;
+
+ if (int Res = cmpNumbers(L->getRawSubclassOptionalData(),
+ R->getRawSubclassOptionalData()))
+ return Res;
+
+ // We have two instructions of identical opcode and #operands. Check to see
+ // if all operands are the same type
+ for (unsigned i = 0, e = L->getNumOperands(); i != e; ++i) {
+ if (int Res =
+ cmpTypes(L->getOperand(i)->getType(), R->getOperand(i)->getType()))
+ return Res;
+ }
+
+ // Check special state that is a part of some instructions.
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(L)) {
+ if (int Res = cmpTypes(AI->getAllocatedType(),
+ cast<AllocaInst>(R)->getAllocatedType()))
+ return Res;
+ return cmpNumbers(AI->getAlignment(), cast<AllocaInst>(R)->getAlignment());
+ }
+ if (const LoadInst *LI = dyn_cast<LoadInst>(L)) {
+ if (int Res = cmpNumbers(LI->isVolatile(), cast<LoadInst>(R)->isVolatile()))
+ return Res;
+ if (int Res =
+ cmpNumbers(LI->getAlignment(), cast<LoadInst>(R)->getAlignment()))
+ return Res;
+ if (int Res =
+ cmpOrderings(LI->getOrdering(), cast<LoadInst>(R)->getOrdering()))
+ return Res;
+ if (int Res =
+ cmpNumbers(LI->getSynchScope(), cast<LoadInst>(R)->getSynchScope()))
+ return Res;
+ return cmpRangeMetadata(LI->getMetadata(LLVMContext::MD_range),
+ cast<LoadInst>(R)->getMetadata(LLVMContext::MD_range));
+ }
+ if (const StoreInst *SI = dyn_cast<StoreInst>(L)) {
+ if (int Res =
+ cmpNumbers(SI->isVolatile(), cast<StoreInst>(R)->isVolatile()))
+ return Res;
+ if (int Res =
+ cmpNumbers(SI->getAlignment(), cast<StoreInst>(R)->getAlignment()))
+ return Res;
+ if (int Res =
+ cmpOrderings(SI->getOrdering(), cast<StoreInst>(R)->getOrdering()))
+ return Res;
+ return cmpNumbers(SI->getSynchScope(), cast<StoreInst>(R)->getSynchScope());
+ }
+ if (const CmpInst *CI = dyn_cast<CmpInst>(L))
+ return cmpNumbers(CI->getPredicate(), cast<CmpInst>(R)->getPredicate());
+ if (const CallInst *CI = dyn_cast<CallInst>(L)) {
+ if (int Res = cmpNumbers(CI->getCallingConv(),
+ cast<CallInst>(R)->getCallingConv()))
+ return Res;
+ if (int Res =
+ cmpAttrs(CI->getAttributes(), cast<CallInst>(R)->getAttributes()))
+ return Res;
+ if (int Res = cmpOperandBundlesSchema(CI, R))
+ return Res;
+ return cmpRangeMetadata(
+ CI->getMetadata(LLVMContext::MD_range),
+ cast<CallInst>(R)->getMetadata(LLVMContext::MD_range));
+ }
+ if (const InvokeInst *II = dyn_cast<InvokeInst>(L)) {
+ if (int Res = cmpNumbers(II->getCallingConv(),
+ cast<InvokeInst>(R)->getCallingConv()))
+ return Res;
+ if (int Res =
+ cmpAttrs(II->getAttributes(), cast<InvokeInst>(R)->getAttributes()))
+ return Res;
+ if (int Res = cmpOperandBundlesSchema(II, R))
+ return Res;
+ return cmpRangeMetadata(
+ II->getMetadata(LLVMContext::MD_range),
+ cast<InvokeInst>(R)->getMetadata(LLVMContext::MD_range));
+ }
+ if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(L)) {
+ ArrayRef<unsigned> LIndices = IVI->getIndices();
+ ArrayRef<unsigned> RIndices = cast<InsertValueInst>(R)->getIndices();
+ if (int Res = cmpNumbers(LIndices.size(), RIndices.size()))
+ return Res;
+ for (size_t i = 0, e = LIndices.size(); i != e; ++i) {
+ if (int Res = cmpNumbers(LIndices[i], RIndices[i]))
+ return Res;
+ }
+ return 0;
+ }
+ if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(L)) {
+ ArrayRef<unsigned> LIndices = EVI->getIndices();
+ ArrayRef<unsigned> RIndices = cast<ExtractValueInst>(R)->getIndices();
+ if (int Res = cmpNumbers(LIndices.size(), RIndices.size()))
+ return Res;
+ for (size_t i = 0, e = LIndices.size(); i != e; ++i) {
+ if (int Res = cmpNumbers(LIndices[i], RIndices[i]))
+ return Res;
+ }
+ }
+ if (const FenceInst *FI = dyn_cast<FenceInst>(L)) {
+ if (int Res =
+ cmpOrderings(FI->getOrdering(), cast<FenceInst>(R)->getOrdering()))
+ return Res;
+ return cmpNumbers(FI->getSynchScope(), cast<FenceInst>(R)->getSynchScope());
+ }
+ if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(L)) {
+ if (int Res = cmpNumbers(CXI->isVolatile(),
+ cast<AtomicCmpXchgInst>(R)->isVolatile()))
+ return Res;
+ if (int Res = cmpNumbers(CXI->isWeak(),
+ cast<AtomicCmpXchgInst>(R)->isWeak()))
+ return Res;
+ if (int Res =
+ cmpOrderings(CXI->getSuccessOrdering(),
+ cast<AtomicCmpXchgInst>(R)->getSuccessOrdering()))
+ return Res;
+ if (int Res =
+ cmpOrderings(CXI->getFailureOrdering(),
+ cast<AtomicCmpXchgInst>(R)->getFailureOrdering()))
+ return Res;
+ return cmpNumbers(CXI->getSynchScope(),
+ cast<AtomicCmpXchgInst>(R)->getSynchScope());
+ }
+ if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(L)) {
+ if (int Res = cmpNumbers(RMWI->getOperation(),
+ cast<AtomicRMWInst>(R)->getOperation()))
+ return Res;
+ if (int Res = cmpNumbers(RMWI->isVolatile(),
+ cast<AtomicRMWInst>(R)->isVolatile()))
+ return Res;
+ if (int Res = cmpOrderings(RMWI->getOrdering(),
+ cast<AtomicRMWInst>(R)->getOrdering()))
+ return Res;
+ return cmpNumbers(RMWI->getSynchScope(),
+ cast<AtomicRMWInst>(R)->getSynchScope());
+ }
+ if (const PHINode *PNL = dyn_cast<PHINode>(L)) {
+ const PHINode *PNR = cast<PHINode>(R);
+ // Ensure that in addition to the incoming values being identical
+ // (checked by the caller of this function), the incoming blocks
+ // are also identical.
+ for (unsigned i = 0, e = PNL->getNumIncomingValues(); i != e; ++i) {
+ if (int Res =
+ cmpValues(PNL->getIncomingBlock(i), PNR->getIncomingBlock(i)))
+ return Res;
+ }
+ }
+ return 0;
+}
+
+// Determine whether two GEP operations perform the same underlying arithmetic.
+// Read method declaration comments for more details.
+int FunctionComparator::cmpGEPs(const GEPOperator *GEPL,
+ const GEPOperator *GEPR) const {
+
+ unsigned int ASL = GEPL->getPointerAddressSpace();
+ unsigned int ASR = GEPR->getPointerAddressSpace();
+
+ if (int Res = cmpNumbers(ASL, ASR))
+ return Res;
+
+ // When we have target data, we can reduce the GEP down to the value in bytes
+ // added to the address.
+ const DataLayout &DL = FnL->getParent()->getDataLayout();
+ unsigned BitWidth = DL.getPointerSizeInBits(ASL);
+ APInt OffsetL(BitWidth, 0), OffsetR(BitWidth, 0);
+ if (GEPL->accumulateConstantOffset(DL, OffsetL) &&
+ GEPR->accumulateConstantOffset(DL, OffsetR))
+ return cmpAPInts(OffsetL, OffsetR);
+ if (int Res = cmpTypes(GEPL->getSourceElementType(),
+ GEPR->getSourceElementType()))
+ return Res;
+
+ if (int Res = cmpNumbers(GEPL->getNumOperands(), GEPR->getNumOperands()))
+ return Res;
+
+ for (unsigned i = 0, e = GEPL->getNumOperands(); i != e; ++i) {
+ if (int Res = cmpValues(GEPL->getOperand(i), GEPR->getOperand(i)))
+ return Res;
+ }
+
+ return 0;
+}
+
+int FunctionComparator::cmpInlineAsm(const InlineAsm *L,
+ const InlineAsm *R) const {
+ // InlineAsm's are uniqued. If they are the same pointer, obviously they are
+ // the same, otherwise compare the fields.
+ if (L == R)
+ return 0;
+ if (int Res = cmpTypes(L->getFunctionType(), R->getFunctionType()))
+ return Res;
+ if (int Res = cmpMem(L->getAsmString(), R->getAsmString()))
+ return Res;
+ if (int Res = cmpMem(L->getConstraintString(), R->getConstraintString()))
+ return Res;
+ if (int Res = cmpNumbers(L->hasSideEffects(), R->hasSideEffects()))
+ return Res;
+ if (int Res = cmpNumbers(L->isAlignStack(), R->isAlignStack()))
+ return Res;
+ if (int Res = cmpNumbers(L->getDialect(), R->getDialect()))
+ return Res;
+ llvm_unreachable("InlineAsm blocks were not uniqued.");
+ return 0;
+}
+
+/// Compare two values used by the two functions under pair-wise comparison. If
+/// this is the first time the values are seen, they're added to the mapping so
+/// that we will detect mismatches on next use.
+/// See comments in declaration for more details.
+int FunctionComparator::cmpValues(const Value *L, const Value *R) const {
+ // Catch self-reference case.
+ if (L == FnL) {
+ if (R == FnR)
+ return 0;
+ return -1;
+ }
+ if (R == FnR) {
+ if (L == FnL)
+ return 0;
+ return 1;
+ }
+
+ const Constant *ConstL = dyn_cast<Constant>(L);
+ const Constant *ConstR = dyn_cast<Constant>(R);
+ if (ConstL && ConstR) {
+ if (L == R)
+ return 0;
+ return cmpConstants(ConstL, ConstR);
+ }
+
+ if (ConstL)
+ return 1;
+ if (ConstR)
+ return -1;
+
+ const InlineAsm *InlineAsmL = dyn_cast<InlineAsm>(L);
+ const InlineAsm *InlineAsmR = dyn_cast<InlineAsm>(R);
+
+ if (InlineAsmL && InlineAsmR)
+ return cmpInlineAsm(InlineAsmL, InlineAsmR);
+ if (InlineAsmL)
+ return 1;
+ if (InlineAsmR)
+ return -1;
+
+ auto LeftSN = sn_mapL.insert(std::make_pair(L, sn_mapL.size())),
+ RightSN = sn_mapR.insert(std::make_pair(R, sn_mapR.size()));
+
+ return cmpNumbers(LeftSN.first->second, RightSN.first->second);
+}
+
+// Test whether two basic blocks have equivalent behaviour.
+int FunctionComparator::cmpBasicBlocks(const BasicBlock *BBL,
+ const BasicBlock *BBR) const {
+ BasicBlock::const_iterator InstL = BBL->begin(), InstLE = BBL->end();
+ BasicBlock::const_iterator InstR = BBR->begin(), InstRE = BBR->end();
+
+ do {
+ bool needToCmpOperands = true;
+ if (int Res = cmpOperations(&*InstL, &*InstR, needToCmpOperands))
+ return Res;
+ if (needToCmpOperands) {
+ assert(InstL->getNumOperands() == InstR->getNumOperands());
+
+ for (unsigned i = 0, e = InstL->getNumOperands(); i != e; ++i) {
+ Value *OpL = InstL->getOperand(i);
+ Value *OpR = InstR->getOperand(i);
+ if (int Res = cmpValues(OpL, OpR))
+ return Res;
+ // cmpValues should ensure this is true.
+ assert(cmpTypes(OpL->getType(), OpR->getType()) == 0);
+ }
+ }
+
+ ++InstL;
+ ++InstR;
+ } while (InstL != InstLE && InstR != InstRE);
+
+ if (InstL != InstLE && InstR == InstRE)
+ return 1;
+ if (InstL == InstLE && InstR != InstRE)
+ return -1;
+ return 0;
+}
+
+int FunctionComparator::compareSignature() const {
+ if (int Res = cmpAttrs(FnL->getAttributes(), FnR->getAttributes()))
+ return Res;
+
+ if (int Res = cmpNumbers(FnL->hasGC(), FnR->hasGC()))
+ return Res;
+
+ if (FnL->hasGC()) {
+ if (int Res = cmpMem(FnL->getGC(), FnR->getGC()))
+ return Res;
+ }
+
+ if (int Res = cmpNumbers(FnL->hasSection(), FnR->hasSection()))
+ return Res;
+
+ if (FnL->hasSection()) {
+ if (int Res = cmpMem(FnL->getSection(), FnR->getSection()))
+ return Res;
+ }
+
+ if (int Res = cmpNumbers(FnL->isVarArg(), FnR->isVarArg()))
+ return Res;
+
+ // TODO: if it's internal and only used in direct calls, we could handle this
+ // case too.
+ if (int Res = cmpNumbers(FnL->getCallingConv(), FnR->getCallingConv()))
+ return Res;
+
+ if (int Res = cmpTypes(FnL->getFunctionType(), FnR->getFunctionType()))
+ return Res;
+
+ assert(FnL->arg_size() == FnR->arg_size() &&
+ "Identically typed functions have different numbers of args!");
+
+ // Visit the arguments so that they get enumerated in the order they're
+ // passed in.
+ for (Function::const_arg_iterator ArgLI = FnL->arg_begin(),
+ ArgRI = FnR->arg_begin(),
+ ArgLE = FnL->arg_end();
+ ArgLI != ArgLE; ++ArgLI, ++ArgRI) {
+ if (cmpValues(&*ArgLI, &*ArgRI) != 0)
+ llvm_unreachable("Arguments repeat!");
+ }
+ return 0;
+}
+
+// Test whether the two functions have equivalent behaviour.
+int FunctionComparator::compare() {
+ beginCompare();
+
+ if (int Res = compareSignature())
+ return Res;
+
+ // We do a CFG-ordered walk since the actual ordering of the blocks in the
+ // linked list is immaterial. Our walk starts at the entry block for both
+ // functions, then takes each block from each terminator in order. As an
+ // artifact, this also means that unreachable blocks are ignored.
+ SmallVector<const BasicBlock *, 8> FnLBBs, FnRBBs;
+ SmallPtrSet<const BasicBlock *, 32> VisitedBBs; // in terms of F1.
+
+ FnLBBs.push_back(&FnL->getEntryBlock());
+ FnRBBs.push_back(&FnR->getEntryBlock());
+
+ VisitedBBs.insert(FnLBBs[0]);
+ while (!FnLBBs.empty()) {
+ const BasicBlock *BBL = FnLBBs.pop_back_val();
+ const BasicBlock *BBR = FnRBBs.pop_back_val();
+
+ if (int Res = cmpValues(BBL, BBR))
+ return Res;
+
+ if (int Res = cmpBasicBlocks(BBL, BBR))
+ return Res;
+
+ const TerminatorInst *TermL = BBL->getTerminator();
+ const TerminatorInst *TermR = BBR->getTerminator();
+
+ assert(TermL->getNumSuccessors() == TermR->getNumSuccessors());
+ for (unsigned i = 0, e = TermL->getNumSuccessors(); i != e; ++i) {
+ if (!VisitedBBs.insert(TermL->getSuccessor(i)).second)
+ continue;
+
+ FnLBBs.push_back(TermL->getSuccessor(i));
+ FnRBBs.push_back(TermR->getSuccessor(i));
+ }
+ }
+ return 0;
+}
+
+namespace {
+
+// Accumulate the hash of a sequence of 64-bit integers. This is similar to
+// hashing a buffer of 64-bit ints, but the entire input does not need to be
+// available at once. This interface is necessary for functionHash because it
+// needs to accumulate the hash as the structure of the function is traversed
+// without saving these values to an intermediate buffer. This form of hashing
+// is not often needed, as usually the object to hash is just read from a
+// buffer.
+class HashAccumulator64 {
+ uint64_t Hash;
+public:
+ // Initialize to random constant, so the state isn't zero.
+ HashAccumulator64() { Hash = 0x6acaa36bef8325c5ULL; }
+ void add(uint64_t V) {
+ Hash = llvm::hashing::detail::hash_16_bytes(Hash, V);
+ }
+ // No finishing is required, because the entire hash value is used.
+ uint64_t getHash() { return Hash; }
+};
+} // end anonymous namespace
+
+// A function hash is calculated by considering only the number of arguments and
+// whether a function is varargs, the order of basic blocks (given by the
+// successors of each basic block in depth first order), and the order of
+// opcodes of each instruction within each of these basic blocks. This mirrors
+// the strategy compare() uses to compare functions by walking the BBs in depth
+// first order and comparing each instruction in sequence. Because this hash
+// does not look at the operands, it is insensitive to things such as the
+// target of calls and the constants used in the function, which makes it useful
+// when possibly merging functions which are the same modulo constants and call
+// targets.
+FunctionComparator::FunctionHash FunctionComparator::functionHash(Function &F) {
+ HashAccumulator64 H;
+ H.add(F.isVarArg());
+ H.add(F.arg_size());
+
+ SmallVector<const BasicBlock *, 8> BBs;
+ SmallSet<const BasicBlock *, 16> VisitedBBs;
+
+ // Walk the blocks in the same order as FunctionComparator::cmpBasicBlocks(),
+ // accumulating the hash of the function "structure" (BB and opcode sequence).
+ BBs.push_back(&F.getEntryBlock());
+ VisitedBBs.insert(BBs[0]);
+ while (!BBs.empty()) {
+ const BasicBlock *BB = BBs.pop_back_val();
+ // This random value acts as a block header, as otherwise the partition of
+ // opcodes into BBs wouldn't affect the hash, only the order of the opcodes
+ H.add(45798);
+ for (auto &Inst : *BB) {
+ H.add(Inst.getOpcode());
+ }
+ const TerminatorInst *Term = BB->getTerminator();
+ for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
+ if (!VisitedBBs.insert(Term->getSuccessor(i)).second)
+ continue;
+ BBs.push_back(Term->getSuccessor(i));
+ }
+ }
+ return H.getHash();
+}
+
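
All of the cmp* routines above share one idiom: each field comparison is reduced to a -1/0/1 result (via cmpNumbers and friends), and the first non-zero result decides, which composes into a total order over functions. A minimal self-contained sketch of that idiom follows; the names are made up and nothing beyond the pattern itself is taken from the patch.

    #include <cstdint>

    // Three-way comparison of two unsigned values: -1, 0, or 1.
    static int cmpNumbersSketch(uint64_t L, uint64_t R) {
      if (L < R)
        return -1;
      return L == R ? 0 : 1;
    }

    struct TripleSketch { uint64_t A, B, C; };

    // Lexicographic total order: the first differing field decides.
    static int cmpTriples(const TripleSketch &L, const TripleSketch &R) {
      if (int Res = cmpNumbersSketch(L.A, R.A))
        return Res;
      if (int Res = cmpNumbersSketch(L.B, R.B))
        return Res;
      return cmpNumbersSketch(L.C, R.C);
    }
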
+
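
One plausible use of functionHash, sketched here and not part of this patch, is as a cheap pre-filter: bucket functions by their structure hash so the expensive pairwise FunctionComparator::compare() only runs on functions whose hashes collide. The header path llvm/Transforms/Utils/FunctionComparator.h is assumed; FunctionComparator::FunctionHash and functionHash() are the declarations shown above.

    #include "llvm/IR/Function.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/Utils/FunctionComparator.h"
    #include <map>
    #include <vector>

    // Hypothetical pre-filter: only functions sharing a structure hash become
    // candidates for the full pairwise comparison.
    static std::map<llvm::FunctionComparator::FunctionHash,
                    std::vector<llvm::Function *>>
    bucketByHash(llvm::Module &M) {
      std::map<llvm::FunctionComparator::FunctionHash,
               std::vector<llvm::Function *>> Buckets;
      for (llvm::Function &F : M)
        if (!F.isDeclaration())
          Buckets[llvm::FunctionComparator::functionHash(F)].push_back(&F);
      return Buckets;
    }
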
diff --git a/contrib/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp b/contrib/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp
index fcb25ba..9844190 100644
--- a/contrib/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp
@@ -48,7 +48,7 @@ bool FunctionImportGlobalProcessing::doImportAsDefinition(
GlobalsToImport);
}
-bool FunctionImportGlobalProcessing::doPromoteLocalToGlobal(
+bool FunctionImportGlobalProcessing::shouldPromoteLocalToGlobal(
const GlobalValue *SGV) {
assert(SGV->hasLocalLinkage());
// Both the imported references and the original local variable must
@@ -56,36 +56,57 @@ bool FunctionImportGlobalProcessing::doPromoteLocalToGlobal(
if (!isPerformingImport() && !isModuleExporting())
return false;
- // Local const variables never need to be promoted unless they are address
- // taken. The imported uses can simply use the clone created in this module.
- // For now we are conservative in determining which variables are not
- // address taken by checking the unnamed addr flag. To be more aggressive,
- // the address taken information must be checked earlier during parsing
- // of the module and recorded in the summary index for use when importing
- // from that module.
- auto *GVar = dyn_cast<GlobalVariable>(SGV);
- if (GVar && GVar->isConstant() && GVar->hasGlobalUnnamedAddr())
- return false;
+ if (isPerformingImport()) {
+ assert((!GlobalsToImport->count(SGV) || !isNonRenamableLocal(*SGV)) &&
+ "Attempting to promote non-renamable local");
+ // We don't know for sure yet if we are importing this value (as either
+ // a reference or a def), since we are simply walking all values in the
+ // module. But by necessity if we end up importing it and it is local,
+ // it must be promoted, so unconditionally promote all values in the
+ // importing module.
+ return true;
+ }
- if (GVar && GVar->hasSection())
- // Some sections like "__DATA,__cfstring" are "magic" and promotion is not
- // allowed. Just disable promotion on any GVar with sections right now.
- return false;
+ // When exporting, consult the index. We can have more than one local
+ // with the same GUID, in the case of same-named locals in different but
+ // same-named source files that were compiled in their respective directories
+ // (so the source file name and resulting GUID is the same). Find the one
+ // in this module.
+ auto Summary = ImportIndex.findSummaryInModule(
+ SGV->getGUID(), SGV->getParent()->getModuleIdentifier());
+ assert(Summary && "Missing summary for global value when exporting");
+ auto Linkage = Summary->linkage();
+ if (!GlobalValue::isLocalLinkage(Linkage)) {
+ assert(!isNonRenamableLocal(*SGV) &&
+ "Attempting to promote non-renamable local");
+ return true;
+ }
- // Eventually we only need to promote functions in the exporting module that
- // are referenced by a potentially exported function (i.e. one that is in the
- // summary index).
- return true;
+ return false;
}
-std::string FunctionImportGlobalProcessing::getName(const GlobalValue *SGV) {
+#ifndef NDEBUG
+bool FunctionImportGlobalProcessing::isNonRenamableLocal(
+ const GlobalValue &GV) const {
+ if (!GV.hasLocalLinkage())
+ return false;
+ // This needs to stay in sync with the logic in buildModuleSummaryIndex.
+ if (GV.hasSection())
+ return true;
+ if (Used.count(const_cast<GlobalValue *>(&GV)))
+ return true;
+ return false;
+}
+#endif
+
+std::string FunctionImportGlobalProcessing::getName(const GlobalValue *SGV,
+ bool DoPromote) {
// For locals that must be promoted to global scope, ensure that
// the promoted name uniquely identifies the copy in the original module,
// using the ID assigned during combined index creation. When importing,
// we rename all locals (not just those that are promoted) in order to
// avoid naming conflicts between locals imported from different modules.
- if (SGV->hasLocalLinkage() &&
- (doPromoteLocalToGlobal(SGV) || isPerformingImport()))
+ if (SGV->hasLocalLinkage() && (DoPromote || isPerformingImport()))
return ModuleSummaryIndex::getGlobalNameForLocal(
SGV->getName(),
ImportIndex.getModuleHash(SGV->getParent()->getModuleIdentifier()));
@@ -93,13 +114,14 @@ std::string FunctionImportGlobalProcessing::getName(const GlobalValue *SGV) {
}
GlobalValue::LinkageTypes
-FunctionImportGlobalProcessing::getLinkage(const GlobalValue *SGV) {
+FunctionImportGlobalProcessing::getLinkage(const GlobalValue *SGV,
+ bool DoPromote) {
// Any local variable that is referenced by an exported function needs
// to be promoted to global scope. Since we don't currently know which
// functions reference which local variables/functions, we must treat
// all as potentially exported if this module is exporting anything.
if (isModuleExporting()) {
- if (SGV->hasLocalLinkage() && doPromoteLocalToGlobal(SGV))
+ if (SGV->hasLocalLinkage() && DoPromote)
return GlobalValue::ExternalLinkage;
return SGV->getLinkage();
}
@@ -164,7 +186,7 @@ FunctionImportGlobalProcessing::getLinkage(const GlobalValue *SGV) {
case GlobalValue::PrivateLinkage:
// If we are promoting the local to global scope, it is handled
// similarly to a normal externally visible global.
- if (doPromoteLocalToGlobal(SGV)) {
+ if (DoPromote) {
if (doImportAsDefinition(SGV) && !dyn_cast<GlobalAlias>(SGV))
return GlobalValue::AvailableExternallyLinkage;
else
@@ -190,14 +212,19 @@ FunctionImportGlobalProcessing::getLinkage(const GlobalValue *SGV) {
}
void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) {
+ bool DoPromote = false;
if (GV.hasLocalLinkage() &&
- (doPromoteLocalToGlobal(&GV) || isPerformingImport())) {
- GV.setName(getName(&GV));
- GV.setLinkage(getLinkage(&GV));
+ ((DoPromote = shouldPromoteLocalToGlobal(&GV)) || isPerformingImport())) {
+ // Once we change the name or linkage it is difficult to determine
+ // again whether we should promote since shouldPromoteLocalToGlobal needs
+ // to locate the summary (based on GUID from name and linkage). Therefore,
+ // use DoPromote result saved above.
+ GV.setName(getName(&GV, DoPromote));
+ GV.setLinkage(getLinkage(&GV, DoPromote));
if (!GV.hasLocalLinkage())
GV.setVisibility(GlobalValue::HiddenVisibility);
} else
- GV.setLinkage(getLinkage(&GV));
+ GV.setLinkage(getLinkage(&GV, /* DoPromote */ false));
// Remove functions imported as available externally defs from comdats,
// as this is a declaration for the linker, and will be dropped eventually.
@@ -214,14 +241,6 @@ void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) {
}
void FunctionImportGlobalProcessing::processGlobalsForThinLTO() {
- if (!moduleCanBeRenamedForThinLTO(M)) {
- // We would have blocked importing from this module by suppressing index
- // generation. We still may be able to import into this module though.
- assert(!isPerformingImport() &&
- "Should have blocked importing from module with local used in ASM");
- return;
- }
-
for (GlobalVariable &GV : M.globals())
processGlobalForThinLTO(GV);
for (Function &SF : M)
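
The renaming above goes through ModuleSummaryIndex::getGlobalNameForLocal with the source module's hash. A hedged sketch of the idea only; the exact suffix format is an assumption and the helper name is made up, not taken from this patch.

    #include <cstdint>
    #include <string>

    // Illustrative only: a promoted local keeps its original name plus a suffix
    // derived from a hash of its defining module, so identically named locals
    // imported from different modules cannot collide.
    static std::string promotedNameSketch(const std::string &LocalName,
                                          uint64_t ModuleHash) {
      return LocalName + ".llvm." + std::to_string(ModuleHash);
    }
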
diff --git a/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp b/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp
index 266be41..74ebcda 100644
--- a/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp
@@ -20,9 +20,8 @@ using namespace llvm;
/// and release, then return AcquireRelease.
///
static AtomicOrdering strongerOrdering(AtomicOrdering X, AtomicOrdering Y) {
- if (X == AtomicOrdering::Acquire && Y == AtomicOrdering::Release)
- return AtomicOrdering::AcquireRelease;
- if (Y == AtomicOrdering::Acquire && X == AtomicOrdering::Release)
+ if ((X == AtomicOrdering::Acquire && Y == AtomicOrdering::Release) ||
+ (Y == AtomicOrdering::Acquire && X == AtomicOrdering::Release))
return AtomicOrdering::AcquireRelease;
return (AtomicOrdering)std::max((unsigned)X, (unsigned)Y);
}
@@ -35,7 +34,7 @@ bool llvm::isSafeToDestroyConstant(const Constant *C) {
if (isa<GlobalValue>(C))
return false;
- if (isa<ConstantInt>(C) || isa<ConstantFP>(C))
+ if (isa<ConstantData>(C))
return false;
for (const User *U : C->users())
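
A hypothetical caller of the simplified strongerOrdering above: fold the orderings of a global's atomic loads and stores into the strongest one observed. The combiner is assumed to be in scope, and the traversal is illustrative rather than the GlobalStatus code itself.

    #include "llvm/IR/GlobalVariable.h"
    #include "llvm/IR/Instructions.h"

    // Combine the orderings of all atomic accesses to GV; strongest wins.
    static llvm::AtomicOrdering
    strongestAccessOrdering(const llvm::GlobalVariable &GV) {
      llvm::AtomicOrdering Result = llvm::AtomicOrdering::NotAtomic;
      for (const llvm::User *U : GV.users()) {
        if (auto *LI = llvm::dyn_cast<llvm::LoadInst>(U))
          Result = strongerOrdering(Result, LI->getOrdering());
        else if (auto *SI = llvm::dyn_cast<llvm::StoreInst>(U))
          Result = strongerOrdering(Result, SI->getOrdering());
      }
      return Result;
    }
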
diff --git a/contrib/llvm/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp b/contrib/llvm/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp
new file mode 100644
index 0000000..ed018bb
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp
@@ -0,0 +1,203 @@
+//===-- ImportedFunctionsInliningStats.cpp ----------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Generating inliner statistics for imported functions, mostly useful for
+// ThinLTO.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/ImportedFunctionsInliningStatistics.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <iomanip>
+#include <sstream>
+using namespace llvm;
+
+ImportedFunctionsInliningStatistics::InlineGraphNode &
+ImportedFunctionsInliningStatistics::createInlineGraphNode(const Function &F) {
+
+ auto &ValueLookup = NodesMap[F.getName()];
+ if (!ValueLookup) {
+ ValueLookup = llvm::make_unique<InlineGraphNode>();
+ ValueLookup->Imported = F.getMetadata("thinlto_src_module") != nullptr;
+ }
+ return *ValueLookup;
+}
+
+void ImportedFunctionsInliningStatistics::recordInline(const Function &Caller,
+ const Function &Callee) {
+
+ InlineGraphNode &CallerNode = createInlineGraphNode(Caller);
+ InlineGraphNode &CalleeNode = createInlineGraphNode(Callee);
+ CalleeNode.NumberOfInlines++;
+
+ if (!CallerNode.Imported && !CalleeNode.Imported) {
+ // Direct inline from a non-imported callee into a non-imported caller, so
+ // we don't have to add this edge to the graph. This case still matters when
+ // collecting inliner statistics in a compile step with no imported
+ // functions, where the graph would otherwise be empty.
+ CalleeNode.NumberOfRealInlines++;
+ return;
+ }
+
+ CallerNode.InlinedCallees.push_back(&CalleeNode);
+ if (!CallerNode.Imported) {
+ // We could avoid the second lookup, but it would make the code much uglier.
+ auto It = NodesMap.find(Caller.getName());
+ assert(It != NodesMap.end() && "The node should be already there.");
+ // Save Caller as a starting node for traversal. The string has to be one
+ // from the map, because Caller can disappear (and its name with it).
+ NonImportedCallers.push_back(It->first());
+ }
+}
+
+void ImportedFunctionsInliningStatistics::setModuleInfo(const Module &M) {
+ ModuleName = M.getName();
+ for (const auto &F : M.functions()) {
+ AllFunctions++;
+ ImportedFunctions += int(F.getMetadata("thinlto_src_module") != nullptr);
+ }
+}
+static std::string getStatString(const char *Msg, int32_t Fraction, int32_t All,
+ const char *PercentageOfMsg,
+ bool LineEnd = true) {
+ double Result = 0;
+ if (All != 0)
+ Result = 100 * static_cast<double>(Fraction) / All;
+
+ std::stringstream Str;
+ Str << std::setprecision(4) << Msg << ": " << Fraction << " [" << Result
+ << "% of " << PercentageOfMsg << "]";
+ if (LineEnd)
+ Str << "\n";
+ return Str.str();
+}
+
+void ImportedFunctionsInliningStatistics::dump(const bool Verbose) {
+ calculateRealInlines();
+ NonImportedCallers.clear();
+
+ int32_t InlinedImportedFunctionsCount = 0;
+ int32_t InlinedNotImportedFunctionsCount = 0;
+
+ int32_t InlinedImportedFunctionsToImportingModuleCount = 0;
+ int32_t InlinedNotImportedFunctionsToImportingModuleCount = 0;
+
+ const auto SortedNodes = getSortedNodes();
+ std::string Out;
+ Out.reserve(5000);
+ raw_string_ostream Ostream(Out);
+
+ Ostream << "------- Dumping inliner stats for [" << ModuleName
+ << "] -------\n";
+
+ if (Verbose)
+ Ostream << "-- List of inlined functions:\n";
+
+ for (const auto &Node : SortedNodes) {
+ assert(Node->second->NumberOfInlines >= Node->second->NumberOfRealInlines);
+ if (Node->second->NumberOfInlines == 0)
+ continue;
+
+ if (Node->second->Imported) {
+ InlinedImportedFunctionsCount++;
+ InlinedImportedFunctionsToImportingModuleCount +=
+ int(Node->second->NumberOfRealInlines > 0);
+ } else {
+ InlinedNotImportedFunctionsCount++;
+ InlinedNotImportedFunctionsToImportingModuleCount +=
+ int(Node->second->NumberOfRealInlines > 0);
+ }
+
+ if (Verbose)
+ Ostream << "Inlined "
+ << (Node->second->Imported ? "imported " : "not imported ")
+ << "function [" << Node->first() << "]"
+ << ": #inlines = " << Node->second->NumberOfInlines
+ << ", #inlines_to_importing_module = "
+ << Node->second->NumberOfRealInlines << "\n";
+ }
+
+ auto InlinedFunctionsCount =
+ InlinedImportedFunctionsCount + InlinedNotImportedFunctionsCount;
+ auto NotImportedFuncCount = AllFunctions - ImportedFunctions;
+ auto ImportedNotInlinedIntoModule =
+ ImportedFunctions - InlinedImportedFunctionsToImportingModuleCount;
+
+ Ostream << "-- Summary:\n"
+ << "All functions: " << AllFunctions
+ << ", imported functions: " << ImportedFunctions << "\n"
+ << getStatString("inlined functions", InlinedFunctionsCount,
+ AllFunctions, "all functions")
+ << getStatString("imported functions inlined anywhere",
+ InlinedImportedFunctionsCount, ImportedFunctions,
+ "imported functions")
+ << getStatString("imported functions inlined into importing module",
+ InlinedImportedFunctionsToImportingModuleCount,
+ ImportedFunctions, "imported functions",
+ /*LineEnd=*/false)
+ << getStatString(", remaining", ImportedNotInlinedIntoModule,
+ ImportedFunctions, "imported functions")
+ << getStatString("non-imported functions inlined anywhere",
+ InlinedNotImportedFunctionsCount,
+ NotImportedFuncCount, "non-imported functions")
+ << getStatString(
+ "non-imported functions inlined into importing module",
+ InlinedNotImportedFunctionsToImportingModuleCount,
+ NotImportedFuncCount, "non-imported functions");
+ Ostream.flush();
+ dbgs() << Out;
+}
+
+void ImportedFunctionsInliningStatistics::calculateRealInlines() {
+ // Removing duplicated Callers.
+ std::sort(NonImportedCallers.begin(), NonImportedCallers.end());
+ NonImportedCallers.erase(
+ std::unique(NonImportedCallers.begin(), NonImportedCallers.end()),
+ NonImportedCallers.end());
+
+ for (const auto &Name : NonImportedCallers) {
+ auto &Node = *NodesMap[Name];
+ if (!Node.Visited)
+ dfs(Node);
+ }
+}
+
+void ImportedFunctionsInliningStatistics::dfs(InlineGraphNode &GraphNode) {
+ assert(!GraphNode.Visited);
+ GraphNode.Visited = true;
+ for (auto *const InlinedFunctionNode : GraphNode.InlinedCallees) {
+ InlinedFunctionNode->NumberOfRealInlines++;
+ if (!InlinedFunctionNode->Visited)
+ dfs(*InlinedFunctionNode);
+ }
+}
+
+ImportedFunctionsInliningStatistics::SortedNodesTy
+ImportedFunctionsInliningStatistics::getSortedNodes() {
+ SortedNodesTy SortedNodes;
+ SortedNodes.reserve(NodesMap.size());
+ for (const NodesMapTy::value_type& Node : NodesMap)
+ SortedNodes.push_back(&Node);
+
+ std::sort(
+ SortedNodes.begin(), SortedNodes.end(),
+ [&](const SortedNodesTy::value_type &Lhs,
+ const SortedNodesTy::value_type &Rhs) {
+ if (Lhs->second->NumberOfInlines != Rhs->second->NumberOfInlines)
+ return Lhs->second->NumberOfInlines > Rhs->second->NumberOfInlines;
+ if (Lhs->second->NumberOfRealInlines != Rhs->second->NumberOfRealInlines)
+ return Lhs->second->NumberOfRealInlines >
+ Rhs->second->NumberOfRealInlines;
+ return Lhs->first() < Rhs->first();
+ });
+ return SortedNodes;
+}
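
A hypothetical wiring of the statistics class introduced above, using only the public API defined in this file (setModuleInfo, recordInline, dump); the surrounding inliner loop is assumed and only sketched in comments.

    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/Utils/ImportedFunctionsInliningStatistics.h"

    // Seed per-module counts, record each successful inline, then dump the
    // aggregated report for this module.
    static void reportInlining(llvm::Module &M) {
      llvm::ImportedFunctionsInliningStatistics Stats;
      Stats.setModuleInfo(M);
      // ... inside the inliner, after each call site is inlined:
      //   Stats.recordInline(Caller, Callee);
      Stats.dump(/*Verbose=*/true);
    }
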
diff --git a/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp
index e82c07f..a40079c 100644
--- a/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -14,6 +14,7 @@
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
@@ -228,7 +229,7 @@ static Value *getUnwindDestTokenHelper(Instruction *EHPad,
Instruction *ChildPad = cast<Instruction>(Child);
auto Memo = MemoMap.find(ChildPad);
if (Memo == MemoMap.end()) {
- // Haven't figure out this child pad yet; queue it.
+ // Haven't figured out this child pad yet; queue it.
Worklist.push_back(ChildPad);
continue;
}
@@ -366,6 +367,10 @@ static Value *getUnwindDestToken(Instruction *EHPad,
// search up the chain to try to find a funclet with information. Put
// null entries in the memo map to avoid re-processing as we go up.
MemoMap[EHPad] = nullptr;
+#ifndef NDEBUG
+ SmallPtrSet<Instruction *, 4> TempMemos;
+ TempMemos.insert(EHPad);
+#endif
Instruction *LastUselessPad = EHPad;
Value *AncestorToken;
for (AncestorToken = getParentPad(EHPad);
@@ -374,6 +379,13 @@ static Value *getUnwindDestToken(Instruction *EHPad,
// Skip over catchpads since they just follow their catchswitches.
if (isa<CatchPadInst>(AncestorPad))
continue;
+ // If the MemoMap had an entry mapping AncestorPad to nullptr, since we
+ // haven't yet called getUnwindDestTokenHelper for AncestorPad in this
+ // call to getUnwindDestToken, that would mean that AncestorPad had no
+ // information in itself, its descendants, or its ancestors. If that
+ // were the case, then we should also have recorded the lack of information
+ // for the descendant that we're coming from. So assert that we don't
+ // find a null entry in the MemoMap for AncestorPad.
assert(!MemoMap.count(AncestorPad) || MemoMap[AncestorPad]);
auto AncestorMemo = MemoMap.find(AncestorPad);
if (AncestorMemo == MemoMap.end()) {
@@ -384,25 +396,85 @@ static Value *getUnwindDestToken(Instruction *EHPad,
if (UnwindDestToken)
break;
LastUselessPad = AncestorPad;
+ MemoMap[LastUselessPad] = nullptr;
+#ifndef NDEBUG
+ TempMemos.insert(LastUselessPad);
+#endif
}
- // Since the whole tree under LastUselessPad has no information, it all must
- // match UnwindDestToken; record that to avoid repeating the search.
+ // We know that getUnwindDestTokenHelper was called on LastUselessPad and
+ // returned nullptr (and likewise for EHPad and any of its ancestors up to
+ // LastUselessPad), so LastUselessPad has no information from below. Since
+ // getUnwindDestTokenHelper must investigate all downward paths through
+ // no-information nodes to prove that a node has no information like this,
+ // and since any time it finds information it records it in the MemoMap for
+ // not just the immediately-containing funclet but also any ancestors also
+ // exited, it must be the case that, walking downward from LastUselessPad,
+ // visiting just those nodes which have not been mapped to an unwind dest
+ // by getUnwindDestTokenHelper (the nullptr TempMemos notwithstanding, since
+ // they are just used to keep getUnwindDestTokenHelper from repeating work),
+ // any node visited must have been exhaustively searched with no information
+ // for it found.
SmallVector<Instruction *, 8> Worklist(1, LastUselessPad);
while (!Worklist.empty()) {
Instruction *UselessPad = Worklist.pop_back_val();
- assert(!MemoMap.count(UselessPad) || MemoMap[UselessPad] == nullptr);
+ auto Memo = MemoMap.find(UselessPad);
+ if (Memo != MemoMap.end() && Memo->second) {
+ // Here the name 'UselessPad' is a bit of a misnomer, because we've found
+ // that it is a funclet that does have information about unwinding to
+ // a particular destination; its parent was a useless pad.
+ // Since its parent has no information, the unwind edge must not escape
+ // the parent, and must target a sibling of this pad. This local unwind
+ // gives us no information about EHPad. Leave it and the subtree rooted
+ // at it alone.
+ assert(getParentPad(Memo->second) == getParentPad(UselessPad));
+ continue;
+ }
+ // We know we don't have information for UselessPad. If it has an entry in
+ // the MemoMap (mapping it to nullptr), it must be one of the TempMemos
+ // added on this invocation of getUnwindDestToken; if a previous invocation
+ // recorded nullptr, it would have had to prove that the ancestors of
+ // UselessPad, which include LastUselessPad, had no information, and that
+ // in turn would have required proving that the descendants of
+ // LastUselessPad, which include EHPad, have no information about
+ // LastUselessPad, which would imply that EHPad was mapped to nullptr in
+ // the MemoMap on that invocation, which isn't the case if we got here.
+ assert(!MemoMap.count(UselessPad) || TempMemos.count(UselessPad));
+ // Assert as we enumerate users that 'UselessPad' doesn't have any unwind
+ // information that we'd be contradicting by making a map entry for it
+ // (which is something that getUnwindDestTokenHelper must have proved for
+ // us to get here). Just assert on its direct users here; the checks in
+ // this downward walk at its descendants will verify that they don't have
+ // any unwind edges that exit 'UselessPad' either (i.e. they either have no
+ // unwind edges or unwind to a sibling).
MemoMap[UselessPad] = UnwindDestToken;
if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(UselessPad)) {
- for (BasicBlock *HandlerBlock : CatchSwitch->handlers())
- for (User *U : HandlerBlock->getFirstNonPHI()->users())
+ assert(CatchSwitch->getUnwindDest() == nullptr && "Expected useless pad");
+ for (BasicBlock *HandlerBlock : CatchSwitch->handlers()) {
+ auto *CatchPad = HandlerBlock->getFirstNonPHI();
+ for (User *U : CatchPad->users()) {
+ assert(
+ (!isa<InvokeInst>(U) ||
+ (getParentPad(
+ cast<InvokeInst>(U)->getUnwindDest()->getFirstNonPHI()) ==
+ CatchPad)) &&
+ "Expected useless pad");
if (isa<CatchSwitchInst>(U) || isa<CleanupPadInst>(U))
Worklist.push_back(cast<Instruction>(U));
+ }
+ }
} else {
assert(isa<CleanupPadInst>(UselessPad));
- for (User *U : UselessPad->users())
+ for (User *U : UselessPad->users()) {
+ assert(!isa<CleanupReturnInst>(U) && "Expected useless pad");
+ assert((!isa<InvokeInst>(U) ||
+ (getParentPad(
+ cast<InvokeInst>(U)->getUnwindDest()->getFirstNonPHI()) ==
+ UselessPad)) &&
+ "Expected useless pad");
if (isa<CatchSwitchInst>(U) || isa<CleanupPadInst>(U))
Worklist.push_back(cast<Instruction>(U));
+ }
}
}
@@ -463,37 +535,7 @@ static BasicBlock *HandleCallsInBlockInlinedThroughInvoke(
#endif // NDEBUG
}
- // Convert this function call into an invoke instruction. First, split the
- // basic block.
- BasicBlock *Split =
- BB->splitBasicBlock(CI->getIterator(), CI->getName() + ".noexc");
-
- // Delete the unconditional branch inserted by splitBasicBlock
- BB->getInstList().pop_back();
-
- // Create the new invoke instruction.
- SmallVector<Value*, 8> InvokeArgs(CI->arg_begin(), CI->arg_end());
- SmallVector<OperandBundleDef, 1> OpBundles;
-
- CI->getOperandBundlesAsDefs(OpBundles);
-
- // Note: we're round tripping operand bundles through memory here, and that
- // can potentially be avoided with a cleverer API design that we do not have
- // as of this time.
-
- InvokeInst *II =
- InvokeInst::Create(CI->getCalledValue(), Split, UnwindEdge, InvokeArgs,
- OpBundles, CI->getName(), BB);
- II->setDebugLoc(CI->getDebugLoc());
- II->setCallingConv(CI->getCallingConv());
- II->setAttributes(CI->getAttributes());
-
- // Make sure that anything using the call now uses the invoke! This also
- // updates the CallGraph if present, because it uses a WeakVH.
- CI->replaceAllUsesWith(II);
-
- // Delete the original call
- Split->getInstList().pop_front();
+ changeToInvokeAndSplitBasicBlock(CI, UnwindEdge);
return BB;
}
return nullptr;
@@ -718,7 +760,7 @@ static void PropagateParallelLoopAccessMetadata(CallSite CS,
/// When inlining a function that contains noalias scope metadata,
/// this metadata needs to be cloned so that the inlined blocks
-/// have different "unqiue scopes" at every call site. Were this not done, then
+/// have different "unique scopes" at every call site. Were this not done, then
/// aliasing scopes from a function inlined into a caller multiple times could
/// not be differentiated (and this would lead to miscompiles because the
/// non-aliasing property communicated by the metadata could have
@@ -1053,8 +1095,10 @@ static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap,
/// If the inlined function has non-byval align arguments, then
/// add @llvm.assume-based alignment assumptions to preserve this information.
static void AddAlignmentAssumptions(CallSite CS, InlineFunctionInfo &IFI) {
- if (!PreserveAlignmentAssumptions)
+ if (!PreserveAlignmentAssumptions || !IFI.GetAssumptionCache)
return;
+
+ AssumptionCache *AC = &(*IFI.GetAssumptionCache)(*CS.getCaller());
auto &DL = CS.getCaller()->getParent()->getDataLayout();
// To avoid inserting redundant assumptions, we should check for assumptions
@@ -1077,13 +1121,12 @@ static void AddAlignmentAssumptions(CallSite CS, InlineFunctionInfo &IFI) {
// If we can already prove the asserted alignment in the context of the
// caller, then don't bother inserting the assumption.
Value *Arg = CS.getArgument(I->getArgNo());
- if (getKnownAlignment(Arg, DL, CS.getInstruction(),
- &IFI.ACT->getAssumptionCache(*CS.getCaller()),
- &DT) >= Align)
+ if (getKnownAlignment(Arg, DL, CS.getInstruction(), AC, &DT) >= Align)
continue;
- IRBuilder<>(CS.getInstruction())
- .CreateAlignmentAssumption(DL, Arg, Align);
+ CallInst *NewAssumption = IRBuilder<>(CS.getInstruction())
+ .CreateAlignmentAssumption(DL, Arg, Align);
+ AC->registerAssumption(NewAssumption);
}
}
}
@@ -1194,12 +1237,13 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall,
if (ByValAlignment <= 1) // 0 = unspecified, 1 = no particular alignment.
return Arg;
+ AssumptionCache *AC =
+ IFI.GetAssumptionCache ? &(*IFI.GetAssumptionCache)(*Caller) : nullptr;
const DataLayout &DL = Caller->getParent()->getDataLayout();
// If the pointer is already known to be sufficiently aligned, or if we can
// round it up to a larger alignment, then we don't need a temporary.
- if (getOrEnforceKnownAlignment(Arg, ByValAlignment, DL, TheCall,
- &IFI.ACT->getAssumptionCache(*Caller)) >=
+ if (getOrEnforceKnownAlignment(Arg, ByValAlignment, DL, TheCall, AC) >=
ByValAlignment)
return Arg;
@@ -1304,7 +1348,7 @@ static bool allocaWouldBeStaticInEntry(const AllocaInst *AI ) {
/// Update inlined instructions' line numbers to
/// to encode location where these instructions are inlined.
static void fixupLineNumbers(Function *Fn, Function::iterator FI,
- Instruction *TheCall) {
+ Instruction *TheCall, bool CalleeHasDebugInfo) {
const DebugLoc &TheCallDL = TheCall->getDebugLoc();
if (!TheCallDL)
return;
@@ -1326,22 +1370,26 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,
for (; FI != Fn->end(); ++FI) {
for (BasicBlock::iterator BI = FI->begin(), BE = FI->end();
BI != BE; ++BI) {
- DebugLoc DL = BI->getDebugLoc();
- if (!DL) {
- // If the inlined instruction has no line number, make it look as if it
- // originates from the call location. This is important for
- // ((__always_inline__, __nodebug__)) functions which must use caller
- // location for all instructions in their function body.
-
- // Don't update static allocas, as they may get moved later.
- if (auto *AI = dyn_cast<AllocaInst>(BI))
- if (allocaWouldBeStaticInEntry(AI))
- continue;
-
- BI->setDebugLoc(TheCallDL);
- } else {
- BI->setDebugLoc(updateInlinedAtInfo(DL, InlinedAtNode, BI->getContext(), IANodes));
+ if (DebugLoc DL = BI->getDebugLoc()) {
+ BI->setDebugLoc(
+ updateInlinedAtInfo(DL, InlinedAtNode, BI->getContext(), IANodes));
+ continue;
}
+
+ if (CalleeHasDebugInfo)
+ continue;
+
+ // If the inlined instruction has no line number, make it look as if it
+ // originates from the call location. This is important for
+ // ((__always_inline__, __nodebug__)) functions which must use caller
+ // location for all instructions in their function body.
+
+ // Don't update static allocas, as they may get moved later.
+ if (auto *AI = dyn_cast<AllocaInst>(BI))
+ if (allocaWouldBeStaticInEntry(AI))
+ continue;
+
+ BI->setDebugLoc(TheCallDL);
}
}
}
@@ -1597,8 +1645,11 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
if (IFI.CG)
UpdateCallGraphAfterInlining(CS, FirstNewBlock, VMap, IFI);
- // Update inlined instructions' line number information.
- fixupLineNumbers(Caller, FirstNewBlock, TheCall);
+ // For 'nodebug' functions, the associated DISubprogram is always null.
+ // Conservatively avoid propagating the callsite debug location to
+ // instructions inlined from a function whose DISubprogram is not null.
+ fixupLineNumbers(Caller, FirstNewBlock, TheCall,
+ CalledFunc->getSubprogram() != nullptr);
// Clone existing noalias metadata if necessary.
CloneAliasScopeMetadata(CS, VMap);
@@ -1609,10 +1660,15 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
// Propagate llvm.mem.parallel_loop_access if necessary.
PropagateParallelLoopAccessMetadata(CS, VMap);
- // FIXME: We could register any cloned assumptions instead of clearing the
- // whole function's cache.
- if (IFI.ACT)
- IFI.ACT->getAssumptionCache(*Caller).clear();
+ // Register any cloned assumptions.
+ if (IFI.GetAssumptionCache)
+ for (BasicBlock &NewBlock :
+ make_range(FirstNewBlock->getIterator(), Caller->end()))
+ for (Instruction &I : NewBlock) {
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ (*IFI.GetAssumptionCache)(*Caller).registerAssumption(II);
+ }
}
// If there are any alloca instructions in the block that used to be the entry
@@ -1708,6 +1764,9 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
IRBuilder<> builder(&FirstNewBlock->front());
for (unsigned ai = 0, ae = IFI.StaticAllocas.size(); ai != ae; ++ai) {
AllocaInst *AI = IFI.StaticAllocas[ai];
+ // Don't mark swifterror allocas. They can't have bitcast uses.
+ if (AI->isSwiftError())
+ continue;
// If the alloca is already scoped to something smaller than the whole
// function then there's no need to add redundant, less accurate markers.
@@ -1949,6 +2008,20 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
std::swap(Returns, NormalReturns);
}
+ // Now that all of the transforms on the inlined code have taken place but
+ // before we splice the inlined code into the CFG and lose track of which
+ // blocks were actually inlined, collect the call sites. We only do this if
+ // call graph updates weren't requested, as those provide value handle based
+ // tracking of inlined call sites instead.
+ if (InlinedFunctionInfo.ContainsCalls && !IFI.CG) {
+ // Otherwise just collect the raw call sites that were inlined.
+ for (BasicBlock &NewBB :
+ make_range(FirstNewBlock->getIterator(), Caller->end()))
+ for (Instruction &I : NewBB)
+ if (auto CS = CallSite(&I))
+ IFI.InlinedCallSites.push_back(CS);
+ }
+
// If we cloned in _exactly one_ basic block, and if that block ends in a
// return instruction, we splice the body of the inlined callee directly into
// the calling basic block.
@@ -2130,9 +2203,10 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
// the entries are the same or undef). If so, remove the PHI so it doesn't
// block other optimizations.
if (PHI) {
+ AssumptionCache *AC =
+ IFI.GetAssumptionCache ? &(*IFI.GetAssumptionCache)(*Caller) : nullptr;
auto &DL = Caller->getParent()->getDataLayout();
- if (Value *V = SimplifyInstruction(PHI, DL, nullptr, nullptr,
- &IFI.ACT->getAssumptionCache(*Caller))) {
+ if (Value *V = SimplifyInstruction(PHI, DL, nullptr, nullptr, AC)) {
PHI->replaceAllUsesWith(V);
PHI->eraseFromParent();
}
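
The AddAlignmentAssumptions hunk above now threads an AssumptionCache through and registers the freshly created assumption. Below is a compact sketch of that pattern built from the calls visible in the hunk; the helper name is made up and the known-alignment guard is elided.

    #include "llvm/Analysis/AssumptionCache.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/Module.h"

    // Emit an @llvm.assume-based alignment fact for Arg at the given call site
    // and register it so later alignment queries in the caller can see it.
    static void assumeAlignedSketch(llvm::CallInst *AtCall, llvm::Value *Arg,
                                    unsigned Align, llvm::AssumptionCache &AC) {
      const llvm::DataLayout &DL = AtCall->getModule()->getDataLayout();
      llvm::CallInst *Assume =
          llvm::IRBuilder<>(AtCall).CreateAlignmentAssumption(DL, Arg, Align);
      AC.registerAssumption(Assume);
    }
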
diff --git a/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp
index 0d5a25b..68c6b74 100644
--- a/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp
@@ -51,10 +51,19 @@ using namespace llvm;
STATISTIC(NumLCSSA, "Number of live out of a loop variables");
+#ifdef EXPENSIVE_CHECKS
+static bool VerifyLoopLCSSA = true;
+#else
+static bool VerifyLoopLCSSA = false;
+#endif
+static cl::opt<bool,true>
+VerifyLoopLCSSAFlag("verify-loop-lcssa", cl::location(VerifyLoopLCSSA),
+ cl::desc("Verify loop lcssa form (time consuming)"));
+
/// Return true if the specified block is in the list.
static bool isExitBlock(BasicBlock *BB,
const SmallVectorImpl<BasicBlock *> &ExitBlocks) {
- return find(ExitBlocks, BB) != ExitBlocks.end();
+ return is_contained(ExitBlocks, BB);
}
/// For every instruction from the worklist, check to see if it has any uses
@@ -63,19 +72,25 @@ static bool isExitBlock(BasicBlock *BB,
bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
DominatorTree &DT, LoopInfo &LI) {
SmallVector<Use *, 16> UsesToRewrite;
- SmallVector<BasicBlock *, 8> ExitBlocks;
SmallSetVector<PHINode *, 16> PHIsToRemove;
PredIteratorCache PredCache;
bool Changed = false;
+ // Cache the Loop ExitBlocks across this loop. We expect to get a lot of
+ // instructions within the same loops, computing the exit blocks is
+ // expensive, and we're not mutating the loop structure.
+ SmallDenseMap<Loop*, SmallVector<BasicBlock *,1>> LoopExitBlocks;
+
while (!Worklist.empty()) {
UsesToRewrite.clear();
- ExitBlocks.clear();
Instruction *I = Worklist.pop_back_val();
BasicBlock *InstBB = I->getParent();
Loop *L = LI.getLoopFor(InstBB);
- L->getExitBlocks(ExitBlocks);
+ if (!LoopExitBlocks.count(L))
+ L->getExitBlocks(LoopExitBlocks[L]);
+ assert(LoopExitBlocks.count(L));
+ const SmallVectorImpl<BasicBlock *> &ExitBlocks = LoopExitBlocks[L];
if (ExitBlocks.empty())
continue;
@@ -186,14 +201,14 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
// Otherwise, do full PHI insertion.
SSAUpdate.RewriteUse(*UseToRewrite);
+ }
- // SSAUpdater might have inserted phi-nodes inside other loops. We'll need
- // to post-process them to keep LCSSA form.
- for (PHINode *InsertedPN : InsertedPHIs) {
- if (auto *OtherLoop = LI.getLoopFor(InsertedPN->getParent()))
- if (!L->contains(OtherLoop))
- PostProcessPHIs.push_back(InsertedPN);
- }
+ // SSAUpdater might have inserted phi-nodes inside other loops. We'll need
+ // to post-process them to keep LCSSA form.
+ for (PHINode *InsertedPN : InsertedPHIs) {
+ if (auto *OtherLoop = LI.getLoopFor(InsertedPN->getParent()))
+ if (!L->contains(OtherLoop))
+ PostProcessPHIs.push_back(InsertedPN);
}
// Post process PHI instructions that were inserted into another disjoint
@@ -229,7 +244,7 @@ blockDominatesAnExit(BasicBlock *BB,
DominatorTree &DT,
const SmallVectorImpl<BasicBlock *> &ExitBlocks) {
DomTreeNode *DomNode = DT.getNode(BB);
- return llvm::any_of(ExitBlocks, [&](BasicBlock * EB) {
+ return any_of(ExitBlocks, [&](BasicBlock *EB) {
return DT.dominates(DomNode, DT.getNode(EB));
});
}
@@ -315,6 +330,19 @@ struct LCSSAWrapperPass : public FunctionPass {
ScalarEvolution *SE;
bool runOnFunction(Function &F) override;
+ void verifyAnalysis() const override {
+ // This check is very expensive. On loop-intensive compiles it may cause up
+ // to a 10x slowdown. Currently it is disabled by default. LPPassManager
+ // always does a limited form of LCSSA verification. Similar reasoning was
+ // used for the LoopInfo verifier.
+ if (VerifyLoopLCSSA) {
+ assert(all_of(*LI,
+ [&](Loop *L) {
+ return L->isRecursivelyLCSSAForm(*DT, *LI);
+ }) &&
+ "LCSSA form is broken!");
+ }
+ };
/// This transformation requires natural loop information & requires that
/// loop preheaders be inserted into the CFG. It maintains both of these,
@@ -330,6 +358,10 @@ struct LCSSAWrapperPass : public FunctionPass {
AU.addPreserved<GlobalsAAWrapperPass>();
AU.addPreserved<ScalarEvolutionWrapperPass>();
AU.addPreserved<SCEVAAWrapperPass>();
+
+ // This is needed to perform LCSSA verification inside LPPassManager
+ AU.addRequired<LCSSAVerificationPass>();
+ AU.addPreserved<LCSSAVerificationPass>();
}
};
}
@@ -339,6 +371,7 @@ INITIALIZE_PASS_BEGIN(LCSSAWrapperPass, "lcssa", "Loop-Closed SSA Form Pass",
false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LCSSAVerificationPass)
INITIALIZE_PASS_END(LCSSAWrapperPass, "lcssa", "Loop-Closed SSA Form Pass",
false, false)
@@ -355,7 +388,7 @@ bool LCSSAWrapperPass::runOnFunction(Function &F) {
return formLCSSAOnAllLoops(LI, *DT, SE);
}
-PreservedAnalyses LCSSAPass::run(Function &F, AnalysisManager<Function> &AM) {
+PreservedAnalyses LCSSAPass::run(Function &F, FunctionAnalysisManager &AM) {
auto &LI = AM.getResult<LoopAnalysis>(F);
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto *SE = AM.getCachedResult<ScalarEvolutionAnalysis>(F);
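
The formLCSSAForInstructions change above caches getExitBlocks per loop because many worklist items live in the same loop and the loop structure is not mutated during the walk. A small sketch of that memoization follows, with illustrative names and the same look-up-again discipline as the patch.

    #include "llvm/ADT/DenseMap.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Analysis/LoopInfo.h"

    // Compute a loop's exit blocks at most once; callers should not hold the
    // returned reference across insertions into the cache.
    static const llvm::SmallVectorImpl<llvm::BasicBlock *> &
    exitBlocksFor(llvm::Loop *L,
                  llvm::SmallDenseMap<llvm::Loop *,
                                      llvm::SmallVector<llvm::BasicBlock *, 4>>
                      &Cache) {
      if (!Cache.count(L))
        L->getExitBlocks(Cache[L]);
      return Cache[L];
    }
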
diff --git a/contrib/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp b/contrib/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
new file mode 100644
index 0000000..d97cd75
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
@@ -0,0 +1,571 @@
+//===-- LibCallsShrinkWrap.cpp ----------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass shrink-wraps a call to a function if the result is not used.
+// The call can set errno but is otherwise side-effect free. For example:
+// sqrt(val);
+// is transformed to
+// if (val < 0)
+// sqrt(val);
+// Even if the result of a library call is not being used, the compiler cannot
+// safely delete the call because the function can set errno on error
+// conditions.
+// Note that in many functions the error condition solely depends on the
+// incoming parameter. In this optimization, we generate the condition that
+// would cause errno to be set and use it to shrink-wrap the call. Since the
+// chances of hitting the error condition are low, the runtime call is
+// effectively eliminated.
+//
+// These partially dead calls are usually the result of the C++ abstraction
+// penalty exposed by inlining.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LibCallsShrinkWrap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "libcalls-shrinkwrap"
+
+STATISTIC(NumWrappedOneCond, "Number of One-Condition Wrappers Inserted");
+STATISTIC(NumWrappedTwoCond, "Number of Two-Condition Wrappers Inserted");
+
+static cl::opt<bool> LibCallsShrinkWrapDoDomainError(
+ "libcalls-shrinkwrap-domain-error", cl::init(true), cl::Hidden,
+ cl::desc("Perform shrink-wrap on lib calls with domain errors"));
+static cl::opt<bool> LibCallsShrinkWrapDoRangeError(
+ "libcalls-shrinkwrap-range-error", cl::init(true), cl::Hidden,
+ cl::desc("Perform shrink-wrap on lib calls with range errors"));
+static cl::opt<bool> LibCallsShrinkWrapDoPoleError(
+ "libcalls-shrinkwrap-pole-error", cl::init(true), cl::Hidden,
+ cl::desc("Perform shrink-wrap on lib calls with pole errors"));
+
+namespace {
+class LibCallsShrinkWrapLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit LibCallsShrinkWrapLegacyPass() : FunctionPass(ID) {
+ initializeLibCallsShrinkWrapLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnFunction(Function &F) override;
+};
+}
+
+char LibCallsShrinkWrapLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LibCallsShrinkWrapLegacyPass, "libcalls-shrinkwrap",
+ "Conditionally eliminate dead library calls", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(LibCallsShrinkWrapLegacyPass, "libcalls-shrinkwrap",
+ "Conditionally eliminate dead library calls", false, false)
+
+namespace {
+class LibCallsShrinkWrap : public InstVisitor<LibCallsShrinkWrap> {
+public:
+  LibCallsShrinkWrap(const TargetLibraryInfo &TLI) : TLI(TLI), Changed(false) {}
+ bool isChanged() const { return Changed; }
+ void visitCallInst(CallInst &CI) { checkCandidate(CI); }
+ void perform() {
+ for (auto &CI : WorkList) {
+ DEBUG(dbgs() << "CDCE calls: " << CI->getCalledFunction()->getName()
+ << "\n");
+ if (perform(CI)) {
+ Changed = true;
+ DEBUG(dbgs() << "Transformed\n");
+ }
+ }
+ }
+
+private:
+ bool perform(CallInst *CI);
+ void checkCandidate(CallInst &CI);
+ void shrinkWrapCI(CallInst *CI, Value *Cond);
+ bool performCallDomainErrorOnly(CallInst *CI, const LibFunc::Func &Func);
+ bool performCallErrors(CallInst *CI, const LibFunc::Func &Func);
+ bool performCallRangeErrorOnly(CallInst *CI, const LibFunc::Func &Func);
+ Value *generateOneRangeCond(CallInst *CI, const LibFunc::Func &Func);
+ Value *generateTwoRangeCond(CallInst *CI, const LibFunc::Func &Func);
+ Value *generateCondForPow(CallInst *CI, const LibFunc::Func &Func);
+
+ // Create an OR of two conditions.
+ Value *createOrCond(CallInst *CI, CmpInst::Predicate Cmp, float Val,
+ CmpInst::Predicate Cmp2, float Val2) {
+ IRBuilder<> BBBuilder(CI);
+ Value *Arg = CI->getArgOperand(0);
+ auto Cond2 = createCond(BBBuilder, Arg, Cmp2, Val2);
+ auto Cond1 = createCond(BBBuilder, Arg, Cmp, Val);
+ return BBBuilder.CreateOr(Cond1, Cond2);
+ }
+
+ // Create a single condition using IRBuilder.
+ Value *createCond(IRBuilder<> &BBBuilder, Value *Arg, CmpInst::Predicate Cmp,
+ float Val) {
+ Constant *V = ConstantFP::get(BBBuilder.getContext(), APFloat(Val));
+ if (!Arg->getType()->isFloatTy())
+ V = ConstantExpr::getFPExtend(V, Arg->getType());
+ return BBBuilder.CreateFCmp(Cmp, Arg, V);
+ }
+
+ // Create a single condition.
+ Value *createCond(CallInst *CI, CmpInst::Predicate Cmp, float Val) {
+ IRBuilder<> BBBuilder(CI);
+ Value *Arg = CI->getArgOperand(0);
+ return createCond(BBBuilder, Arg, Cmp, Val);
+ }
+
+ const TargetLibraryInfo &TLI;
+ SmallVector<CallInst *, 16> WorkList;
+ bool Changed;
+};
+} // end anonymous namespace
+
+// Perform the transformation to calls with errno set by domain error.
+bool LibCallsShrinkWrap::performCallDomainErrorOnly(CallInst *CI,
+ const LibFunc::Func &Func) {
+ Value *Cond = nullptr;
+
+ switch (Func) {
+ case LibFunc::acos: // DomainError: (x < -1 || x > 1)
+ case LibFunc::acosf: // Same as acos
+ case LibFunc::acosl: // Same as acos
+ case LibFunc::asin: // DomainError: (x < -1 || x > 1)
+ case LibFunc::asinf: // Same as asin
+ case LibFunc::asinl: // Same as asin
+ {
+ ++NumWrappedTwoCond;
+ Cond = createOrCond(CI, CmpInst::FCMP_OLT, -1.0f, CmpInst::FCMP_OGT, 1.0f);
+ break;
+ }
+ case LibFunc::cos: // DomainError: (x == +inf || x == -inf)
+ case LibFunc::cosf: // Same as cos
+ case LibFunc::cosl: // Same as cos
+ case LibFunc::sin: // DomainError: (x == +inf || x == -inf)
+ case LibFunc::sinf: // Same as sin
+ case LibFunc::sinl: // Same as sin
+ {
+ ++NumWrappedTwoCond;
+ Cond = createOrCond(CI, CmpInst::FCMP_OEQ, INFINITY, CmpInst::FCMP_OEQ,
+ -INFINITY);
+ break;
+ }
+ case LibFunc::acosh: // DomainError: (x < 1)
+ case LibFunc::acoshf: // Same as acosh
+ case LibFunc::acoshl: // Same as acosh
+ {
+ ++NumWrappedOneCond;
+ Cond = createCond(CI, CmpInst::FCMP_OLT, 1.0f);
+ break;
+ }
+ case LibFunc::sqrt: // DomainError: (x < 0)
+ case LibFunc::sqrtf: // Same as sqrt
+ case LibFunc::sqrtl: // Same as sqrt
+ {
+ ++NumWrappedOneCond;
+ Cond = createCond(CI, CmpInst::FCMP_OLT, 0.0f);
+ break;
+ }
+ default:
+ return false;
+ }
+ shrinkWrapCI(CI, Cond);
+ return true;
+}
+
+// Perform the transformation to calls with errno set by range error.
+bool LibCallsShrinkWrap::performCallRangeErrorOnly(CallInst *CI,
+ const LibFunc::Func &Func) {
+ Value *Cond = nullptr;
+
+ switch (Func) {
+ case LibFunc::cosh:
+ case LibFunc::coshf:
+ case LibFunc::coshl:
+ case LibFunc::exp:
+ case LibFunc::expf:
+ case LibFunc::expl:
+ case LibFunc::exp10:
+ case LibFunc::exp10f:
+ case LibFunc::exp10l:
+ case LibFunc::exp2:
+ case LibFunc::exp2f:
+ case LibFunc::exp2l:
+ case LibFunc::sinh:
+ case LibFunc::sinhf:
+ case LibFunc::sinhl: {
+ Cond = generateTwoRangeCond(CI, Func);
+ break;
+ }
+ case LibFunc::expm1: // RangeError: (709, inf)
+ case LibFunc::expm1f: // RangeError: (88, inf)
+ case LibFunc::expm1l: // RangeError: (11356, inf)
+ {
+ Cond = generateOneRangeCond(CI, Func);
+ break;
+ }
+ default:
+ return false;
+ }
+ shrinkWrapCI(CI, Cond);
+ return true;
+}
+
+// Perform the transformation to calls with errno set by combination of errors.
+bool LibCallsShrinkWrap::performCallErrors(CallInst *CI,
+ const LibFunc::Func &Func) {
+ Value *Cond = nullptr;
+
+ switch (Func) {
+ case LibFunc::atanh: // DomainError: (x < -1 || x > 1)
+ // PoleError: (x == -1 || x == 1)
+ // Overall Cond: (x <= -1 || x >= 1)
+ case LibFunc::atanhf: // Same as atanh
+ case LibFunc::atanhl: // Same as atanh
+ {
+ if (!LibCallsShrinkWrapDoDomainError || !LibCallsShrinkWrapDoPoleError)
+ return false;
+ ++NumWrappedTwoCond;
+ Cond = createOrCond(CI, CmpInst::FCMP_OLE, -1.0f, CmpInst::FCMP_OGE, 1.0f);
+ break;
+ }
+ case LibFunc::log: // DomainError: (x < 0)
+ // PoleError: (x == 0)
+ // Overall Cond: (x <= 0)
+ case LibFunc::logf: // Same as log
+ case LibFunc::logl: // Same as log
+ case LibFunc::log10: // Same as log
+ case LibFunc::log10f: // Same as log
+ case LibFunc::log10l: // Same as log
+ case LibFunc::log2: // Same as log
+ case LibFunc::log2f: // Same as log
+ case LibFunc::log2l: // Same as log
+ case LibFunc::logb: // Same as log
+ case LibFunc::logbf: // Same as log
+ case LibFunc::logbl: // Same as log
+ {
+ if (!LibCallsShrinkWrapDoDomainError || !LibCallsShrinkWrapDoPoleError)
+ return false;
+ ++NumWrappedOneCond;
+ Cond = createCond(CI, CmpInst::FCMP_OLE, 0.0f);
+ break;
+ }
+ case LibFunc::log1p: // DomainError: (x < -1)
+ // PoleError: (x == -1)
+ // Overall Cond: (x <= -1)
+ case LibFunc::log1pf: // Same as log1p
+ case LibFunc::log1pl: // Same as log1p
+ {
+ if (!LibCallsShrinkWrapDoDomainError || !LibCallsShrinkWrapDoPoleError)
+ return false;
+ ++NumWrappedOneCond;
+ Cond = createCond(CI, CmpInst::FCMP_OLE, -1.0f);
+ break;
+ }
+ case LibFunc::pow: // DomainError: x < 0 and y is noninteger
+ // PoleError: x == 0 and y < 0
+ // RangeError: overflow or underflow
+ case LibFunc::powf:
+ case LibFunc::powl: {
+ if (!LibCallsShrinkWrapDoDomainError || !LibCallsShrinkWrapDoPoleError ||
+ !LibCallsShrinkWrapDoRangeError)
+ return false;
+ Cond = generateCondForPow(CI, Func);
+ if (Cond == nullptr)
+ return false;
+ break;
+ }
+ default:
+ return false;
+ }
+ assert(Cond && "performCallErrors should not see an empty condition");
+ shrinkWrapCI(CI, Cond);
+ return true;
+}
+
+// Checks if CI is a candidate for shrink-wrapping and, if so, puts it into
+// the work list.
+void LibCallsShrinkWrap::checkCandidate(CallInst &CI) {
+ if (CI.isNoBuiltin())
+ return;
+  // A possible improvement is to handle calls whose return value is used. If
+  // there is an API for a fast libcall implementation that does not set
+  // errno, we can use the same framework to direct/wrap the call to the fast
+  // API in the error-free path, and leave the original call in the slow path.
+ if (!CI.use_empty())
+ return;
+
+ LibFunc::Func Func;
+ Function *Callee = CI.getCalledFunction();
+ if (!Callee)
+ return;
+ if (!TLI.getLibFunc(*Callee, Func) || !TLI.has(Func))
+ return;
+
+ if (CI.getNumArgOperands() == 0)
+ return;
+ // TODO: Handle long double in other formats.
+ Type *ArgType = CI.getArgOperand(0)->getType();
+ if (!(ArgType->isFloatTy() || ArgType->isDoubleTy() ||
+ ArgType->isX86_FP80Ty()))
+ return;
+
+ WorkList.push_back(&CI);
+}
+
+// Generate the upper bound condition for RangeError.
+Value *LibCallsShrinkWrap::generateOneRangeCond(CallInst *CI,
+ const LibFunc::Func &Func) {
+ float UpperBound;
+ switch (Func) {
+ case LibFunc::expm1: // RangeError: (709, inf)
+ UpperBound = 709.0f;
+ break;
+ case LibFunc::expm1f: // RangeError: (88, inf)
+ UpperBound = 88.0f;
+ break;
+ case LibFunc::expm1l: // RangeError: (11356, inf)
+ UpperBound = 11356.0f;
+ break;
+ default:
+ llvm_unreachable("Should be reach here");
+ }
+
+ ++NumWrappedOneCond;
+ return createCond(CI, CmpInst::FCMP_OGT, UpperBound);
+}
+
+// Generate the lower and upper bound condition for RangeError.
+Value *LibCallsShrinkWrap::generateTwoRangeCond(CallInst *CI,
+ const LibFunc::Func &Func) {
+ float UpperBound, LowerBound;
+ switch (Func) {
+ case LibFunc::cosh: // RangeError: (x < -710 || x > 710)
+ case LibFunc::sinh: // Same as cosh
+ LowerBound = -710.0f;
+ UpperBound = 710.0f;
+ break;
+ case LibFunc::coshf: // RangeError: (x < -89 || x > 89)
+ case LibFunc::sinhf: // Same as coshf
+ LowerBound = -89.0f;
+ UpperBound = 89.0f;
+ break;
+ case LibFunc::coshl: // RangeError: (x < -11357 || x > 11357)
+ case LibFunc::sinhl: // Same as coshl
+ LowerBound = -11357.0f;
+ UpperBound = 11357.0f;
+ break;
+ case LibFunc::exp: // RangeError: (x < -745 || x > 709)
+ LowerBound = -745.0f;
+ UpperBound = 709.0f;
+ break;
+ case LibFunc::expf: // RangeError: (x < -103 || x > 88)
+ LowerBound = -103.0f;
+ UpperBound = 88.0f;
+ break;
+ case LibFunc::expl: // RangeError: (x < -11399 || x > 11356)
+ LowerBound = -11399.0f;
+ UpperBound = 11356.0f;
+ break;
+ case LibFunc::exp10: // RangeError: (x < -323 || x > 308)
+ LowerBound = -323.0f;
+ UpperBound = 308.0f;
+ break;
+ case LibFunc::exp10f: // RangeError: (x < -45 || x > 38)
+ LowerBound = -45.0f;
+ UpperBound = 38.0f;
+ break;
+ case LibFunc::exp10l: // RangeError: (x < -4950 || x > 4932)
+ LowerBound = -4950.0f;
+ UpperBound = 4932.0f;
+ break;
+ case LibFunc::exp2: // RangeError: (x < -1074 || x > 1023)
+ LowerBound = -1074.0f;
+ UpperBound = 1023.0f;
+ break;
+ case LibFunc::exp2f: // RangeError: (x < -149 || x > 127)
+ LowerBound = -149.0f;
+ UpperBound = 127.0f;
+ break;
+ case LibFunc::exp2l: // RangeError: (x < -16445 || x > 11383)
+ LowerBound = -16445.0f;
+ UpperBound = 11383.0f;
+ break;
+ default:
+ llvm_unreachable("Should be reach here");
+ }
+
+ ++NumWrappedTwoCond;
+ return createOrCond(CI, CmpInst::FCMP_OGT, UpperBound, CmpInst::FCMP_OLT,
+ LowerBound);
+}
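As a side note on the bounds tabulated above, this standalone C++ check (an editorial sketch, not part of the patch; it assumes the C library reports range errors via errno, i.e. math_errhandling includes MATH_ERRNO) shows why 709/710 bracket the overflow point of exp for IEEE-754 doubles:

#include <cerrno>
#include <cmath>
#include <cstdio>

int main() {
  errno = 0;
  double Ok = std::exp(709.0);  // about 8.2e307, still below DBL_MAX
  int E1 = errno;               // expected to stay 0
  errno = 0;
  double Ov = std::exp(710.0);  // overflows; returns HUGE_VAL
  int E2 = errno;               // expected to be ERANGE when MATH_ERRNO is set
  std::printf("exp(709)=%g errno=%d\nexp(710)=%g errno=%d\n", Ok, E1, Ov, E2);
  return 0;
}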
+
+// For pow(x,y), We only handle the following cases:
+// (1) x is a constant && (x >= 1) && (x < MaxUInt8)
+// Cond is: (y > 127)
+// (2) x is a value coming from an integer type.
+// (2.1) if x's bit_size == 8
+// Cond: (x <= 0 || y > 128)
+// (2.2) if x's bit_size is 16
+// Cond: (x <= 0 || y > 64)
+// (2.3) if x's bit_size is 32
+// Cond: (x <= 0 || y > 32)
+// Support for powl(x,y) and powf(x,y) is TBD.
+//
+// Note that the generated condition can be more conservative than the
+// actual condition (i.e., we might invoke calls that will not set errno).
+//
+Value *LibCallsShrinkWrap::generateCondForPow(CallInst *CI,
+ const LibFunc::Func &Func) {
+ // FIXME: LibFunc::powf and powl TBD.
+ if (Func != LibFunc::pow) {
+ DEBUG(dbgs() << "Not handled powf() and powl()\n");
+ return nullptr;
+ }
+
+ Value *Base = CI->getArgOperand(0);
+ Value *Exp = CI->getArgOperand(1);
+ IRBuilder<> BBBuilder(CI);
+
+ // Constant Base case.
+ if (ConstantFP *CF = dyn_cast<ConstantFP>(Base)) {
+ double D = CF->getValueAPF().convertToDouble();
+ if (D < 1.0f || D > APInt::getMaxValue(8).getZExtValue()) {
+ DEBUG(dbgs() << "Not handled pow(): constant base out of range\n");
+ return nullptr;
+ }
+
+ ++NumWrappedOneCond;
+ Constant *V = ConstantFP::get(CI->getContext(), APFloat(127.0f));
+ if (!Exp->getType()->isFloatTy())
+ V = ConstantExpr::getFPExtend(V, Exp->getType());
+ return BBBuilder.CreateFCmp(CmpInst::FCMP_OGT, Exp, V);
+ }
+
+  // If the Base value is coming from an integer type.
+ Instruction *I = dyn_cast<Instruction>(Base);
+ if (!I) {
+ DEBUG(dbgs() << "Not handled pow(): FP type base\n");
+ return nullptr;
+ }
+ unsigned Opcode = I->getOpcode();
+ if (Opcode == Instruction::UIToFP || Opcode == Instruction::SIToFP) {
+ unsigned BW = I->getOperand(0)->getType()->getPrimitiveSizeInBits();
+ float UpperV = 0.0f;
+ if (BW == 8)
+ UpperV = 128.0f;
+ else if (BW == 16)
+ UpperV = 64.0f;
+ else if (BW == 32)
+ UpperV = 32.0f;
+ else {
+ DEBUG(dbgs() << "Not handled pow(): type too wide\n");
+ return nullptr;
+ }
+
+ ++NumWrappedTwoCond;
+ Constant *V = ConstantFP::get(CI->getContext(), APFloat(UpperV));
+ Constant *V0 = ConstantFP::get(CI->getContext(), APFloat(0.0f));
+ if (!Exp->getType()->isFloatTy())
+ V = ConstantExpr::getFPExtend(V, Exp->getType());
+ if (!Base->getType()->isFloatTy())
+      V0 = ConstantExpr::getFPExtend(V0, Base->getType());
+
+ Value *Cond = BBBuilder.CreateFCmp(CmpInst::FCMP_OGT, Exp, V);
+ Value *Cond0 = BBBuilder.CreateFCmp(CmpInst::FCMP_OLE, Base, V0);
+ return BBBuilder.CreateOr(Cond0, Cond);
+ }
+ DEBUG(dbgs() << "Not handled pow(): base not from integer convert\n");
+ return nullptr;
+}
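To make case (2.1) above concrete, a small editorial C++ sketch (not part of the patch; the function name is hypothetical) of the source-level guard the pass conceptually creates when the base of pow originates from an 8-bit integer:

#include <cmath>
#include <cstdint>

void callPowDiscardingResult(std::uint8_t XInt, double Y) {
  double X = static_cast<double>(XInt); // Base comes from an integer type
  // The (otherwise dead) call is only evaluated when errno could be set.
  if (X <= 0.0 || Y > 128.0)
    std::pow(X, Y);
}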
+
+// Shrink-wrap the library call under the condition that can potentially set
+// errno.
+void LibCallsShrinkWrap::shrinkWrapCI(CallInst *CI, Value *Cond) {
+ assert(Cond != nullptr && "hrinkWrapCI is not expecting an empty call inst");
+ MDNode *BranchWeights =
+ MDBuilder(CI->getContext()).createBranchWeights(1, 2000);
+ TerminatorInst *NewInst =
+ SplitBlockAndInsertIfThen(Cond, CI, false, BranchWeights);
+ BasicBlock *CallBB = NewInst->getParent();
+ CallBB->setName("cdce.call");
+ CallBB->getSingleSuccessor()->setName("cdce.end");
+ CI->removeFromParent();
+ CallBB->getInstList().insert(CallBB->getFirstInsertionPt(), CI);
+ DEBUG(dbgs() << "== Basic Block After ==");
+ DEBUG(dbgs() << *CallBB->getSinglePredecessor() << *CallBB
+ << *CallBB->getSingleSuccessor() << "\n");
+}
+
+// Perform the transformation to a single candidate.
+bool LibCallsShrinkWrap::perform(CallInst *CI) {
+ LibFunc::Func Func;
+ Function *Callee = CI->getCalledFunction();
+ assert(Callee && "perform() should apply to a non-empty callee");
+ TLI.getLibFunc(*Callee, Func);
+ assert(Func && "perform() is not expecting an empty function");
+
+ if (LibCallsShrinkWrapDoDomainError && performCallDomainErrorOnly(CI, Func))
+ return true;
+
+ if (LibCallsShrinkWrapDoRangeError && performCallRangeErrorOnly(CI, Func))
+ return true;
+
+ return performCallErrors(CI, Func);
+}
+
+void LibCallsShrinkWrapLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+}
+
+static bool runImpl(Function &F, const TargetLibraryInfo &TLI) {
+ if (F.hasFnAttribute(Attribute::OptimizeForSize))
+ return false;
+ LibCallsShrinkWrap CCDCE(TLI);
+ CCDCE.visit(F);
+ CCDCE.perform();
+ return CCDCE.isChanged();
+}
+
+bool LibCallsShrinkWrapLegacyPass::runOnFunction(Function &F) {
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ return runImpl(F, TLI);
+}
+
+namespace llvm {
+char &LibCallsShrinkWrapPassID = LibCallsShrinkWrapLegacyPass::ID;
+
+// Public interface to LibCallsShrinkWrap pass.
+FunctionPass *createLibCallsShrinkWrapPass() {
+ return new LibCallsShrinkWrapLegacyPass();
+}
+
+PreservedAnalyses LibCallsShrinkWrapPass::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
+ bool Changed = runImpl(F, TLI);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ auto PA = PreservedAnalyses();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/Local.cpp b/contrib/llvm/lib/Transforms/Utils/Local.cpp
index f1838d8..6e4174a 100644
--- a/contrib/llvm/lib/Transforms/Utils/Local.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/Local.cpp
@@ -340,6 +340,10 @@ bool llvm::isInstructionTriviallyDead(Instruction *I,
if (Constant *C = dyn_cast<Constant>(CI->getArgOperand(0)))
return C->isNullValue() || isa<UndefValue>(C);
+ if (CallSite CS = CallSite(I))
+ if (isMathLibCallNoop(CS, TLI))
+ return true;
+
return false;
}
@@ -886,6 +890,17 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB) {
}
}
+ // If the unconditional branch we replaced contains llvm.loop metadata, we
+ // add the metadata to the branch instructions in the predecessors.
+ unsigned LoopMDKind = BB->getContext().getMDKindID("llvm.loop");
+ Instruction *TI = BB->getTerminator();
+ if (TI)
+ if (MDNode *LoopMD = TI->getMetadata(LoopMDKind))
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ BasicBlock *Pred = *PI;
+ Pred->getTerminator()->setMetadata(LoopMDKind, LoopMD);
+ }
+
// Everything that jumped to BB now goes to Succ.
BB->replaceAllUsesWith(Succ);
if (!Succ->hasName()) Succ->takeName(BB);
@@ -1001,10 +1016,6 @@ static unsigned enforceKnownAlignment(Value *V, unsigned Align,
return Align;
}
-/// getOrEnforceKnownAlignment - If the specified pointer has an alignment that
-/// we can determine, return it, otherwise return 0. If PrefAlign is specified,
-/// and it is more than the alignment of the ultimate object, see if we can
-/// increase the alignment of the ultimate object, making this check succeed.
unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign,
const DataLayout &DL,
const Instruction *CxtI,
@@ -1057,9 +1068,27 @@ static bool LdStHasDebugValue(DILocalVariable *DIVar, DIExpression *DIExpr,
return false;
}
+/// See if there is a dbg.value intrinsic for DIVar for the PHI node.
+static bool PhiHasDebugValue(DILocalVariable *DIVar,
+ DIExpression *DIExpr,
+ PHINode *APN) {
+  // Since we can't guarantee that the original dbg.declare intrinsic
+ // is removed by LowerDbgDeclare(), we need to make sure that we are
+ // not inserting the same dbg.value intrinsic over and over.
+ DbgValueList DbgValues;
+ FindAllocaDbgValues(DbgValues, APN);
+ for (auto DVI : DbgValues) {
+ assert (DVI->getValue() == APN);
+ assert (DVI->getOffset() == 0);
+ if ((DVI->getVariable() == DIVar) && (DVI->getExpression() == DIExpr))
+ return true;
+ }
+ return false;
+}
+
/// Inserts a llvm.dbg.value intrinsic before a store to an alloca'd value
/// that has an associated llvm.dbg.decl intrinsic.
-bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
+void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
StoreInst *SI, DIBuilder &Builder) {
auto *DIVar = DDI->getVariable();
auto *DIExpr = DDI->getExpression();
@@ -1073,26 +1102,27 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
ExtendedArg = dyn_cast<Argument>(SExt->getOperand(0));
if (ExtendedArg) {
- // We're now only describing a subset of the variable. The piece we're
+ // We're now only describing a subset of the variable. The fragment we're
// describing will always be smaller than the variable size, because
// VariableSize == Size of Alloca described by DDI. Since SI stores
// to the alloca described by DDI, if it's first operand is an extend,
// we're guaranteed that before extension, the value was narrower than
// the size of the alloca, hence the size of the described variable.
SmallVector<uint64_t, 3> Ops;
- unsigned PieceOffset = 0;
- // If this already is a bit piece, we drop the bit piece from the expression
- // and record the offset.
- if (DIExpr->isBitPiece()) {
+ unsigned FragmentOffset = 0;
+ // If this already is a bit fragment, we drop the bit fragment from the
+ // expression and record the offset.
+ auto Fragment = DIExpr->getFragmentInfo();
+ if (Fragment) {
Ops.append(DIExpr->elements_begin(), DIExpr->elements_end()-3);
- PieceOffset = DIExpr->getBitPieceOffset();
+ FragmentOffset = Fragment->OffsetInBits;
} else {
Ops.append(DIExpr->elements_begin(), DIExpr->elements_end());
}
- Ops.push_back(dwarf::DW_OP_bit_piece);
- Ops.push_back(PieceOffset); // Offset
+ Ops.push_back(dwarf::DW_OP_LLVM_fragment);
+ Ops.push_back(FragmentOffset);
const DataLayout &DL = DDI->getModule()->getDataLayout();
- Ops.push_back(DL.getTypeSizeInBits(ExtendedArg->getType())); // Size
+ Ops.push_back(DL.getTypeSizeInBits(ExtendedArg->getType()));
auto NewDIExpr = Builder.createExpression(Ops);
if (!LdStHasDebugValue(DIVar, NewDIExpr, SI))
Builder.insertDbgValueIntrinsic(ExtendedArg, 0, DIVar, NewDIExpr,
@@ -1100,19 +1130,18 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
} else if (!LdStHasDebugValue(DIVar, DIExpr, SI))
Builder.insertDbgValueIntrinsic(SI->getOperand(0), 0, DIVar, DIExpr,
DDI->getDebugLoc(), SI);
- return true;
}
/// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value
/// that has an associated llvm.dbg.decl intrinsic.
-bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
+void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
LoadInst *LI, DIBuilder &Builder) {
auto *DIVar = DDI->getVariable();
auto *DIExpr = DDI->getExpression();
assert(DIVar && "Missing variable");
if (LdStHasDebugValue(DIVar, DIExpr, LI))
- return true;
+ return;
// We are now tracking the loaded value instead of the address. In the
// future if multi-location support is added to the IR, it might be
@@ -1121,7 +1150,28 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
Instruction *DbgValue = Builder.insertDbgValueIntrinsic(
LI, 0, DIVar, DIExpr, DDI->getDebugLoc(), (Instruction *)nullptr);
DbgValue->insertAfter(LI);
- return true;
+}
+
+/// Inserts an llvm.dbg.value intrinsic after a phi node
+/// that has an associated llvm.dbg.declare intrinsic.
+void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
+ PHINode *APN, DIBuilder &Builder) {
+ auto *DIVar = DDI->getVariable();
+ auto *DIExpr = DDI->getExpression();
+ assert(DIVar && "Missing variable");
+
+ if (PhiHasDebugValue(DIVar, DIExpr, APN))
+ return;
+
+ BasicBlock *BB = APN->getParent();
+ auto InsertionPt = BB->getFirstInsertionPt();
+
+ // The block may be a catchswitch block, which does not have a valid
+ // insertion point.
+ // FIXME: Insert dbg.value markers in the successors when appropriate.
+ if (InsertionPt != BB->end())
+ Builder.insertDbgValueIntrinsic(APN, 0, DIVar, DIExpr, DDI->getDebugLoc(),
+ &*InsertionPt);
}
/// Determine whether this alloca is either a VLA or an array.
@@ -1191,6 +1241,16 @@ DbgDeclareInst *llvm::FindAllocaDbgDeclare(Value *V) {
return nullptr;
}
+/// FindAllocaDbgValues - Finds the llvm.dbg.value intrinsics describing the
+/// alloca 'V', if any.
+void llvm::FindAllocaDbgValues(DbgValueList &DbgValues, Value *V) {
+ if (auto *L = LocalAsMetadata::getIfExists(V))
+ if (auto *MDV = MetadataAsValue::getIfExists(V->getContext(), L))
+ for (User *U : MDV->users())
+ if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U))
+ DbgValues.push_back(DVI);
+}
+
static void DIExprAddDeref(SmallVectorImpl<uint64_t> &Expr) {
Expr.push_back(dwarf::DW_OP_deref);
}
@@ -1310,12 +1370,13 @@ unsigned llvm::removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB) {
return NumDeadInst;
}
-unsigned llvm::changeToUnreachable(Instruction *I, bool UseLLVMTrap) {
+unsigned llvm::changeToUnreachable(Instruction *I, bool UseLLVMTrap,
+ bool PreserveLCSSA) {
BasicBlock *BB = I->getParent();
// Loop over all of the successors, removing BB's entry from any PHI
// nodes.
for (BasicBlock *Successor : successors(BB))
- Successor->removePredecessor(BB);
+ Successor->removePredecessor(BB, PreserveLCSSA);
// Insert a call to llvm.trap right before this. This turns the undefined
// behavior into a hard fail instead of falling through into random code.
@@ -1360,6 +1421,43 @@ static void changeToCall(InvokeInst *II) {
II->eraseFromParent();
}
+BasicBlock *llvm::changeToInvokeAndSplitBasicBlock(CallInst *CI,
+ BasicBlock *UnwindEdge) {
+ BasicBlock *BB = CI->getParent();
+
+ // Convert this function call into an invoke instruction. First, split the
+ // basic block.
+ BasicBlock *Split =
+ BB->splitBasicBlock(CI->getIterator(), CI->getName() + ".noexc");
+
+ // Delete the unconditional branch inserted by splitBasicBlock
+ BB->getInstList().pop_back();
+
+ // Create the new invoke instruction.
+ SmallVector<Value *, 8> InvokeArgs(CI->arg_begin(), CI->arg_end());
+ SmallVector<OperandBundleDef, 1> OpBundles;
+
+ CI->getOperandBundlesAsDefs(OpBundles);
+
+ // Note: we're round tripping operand bundles through memory here, and that
+ // can potentially be avoided with a cleverer API design that we do not have
+ // as of this time.
+
+ InvokeInst *II = InvokeInst::Create(CI->getCalledValue(), Split, UnwindEdge,
+ InvokeArgs, OpBundles, CI->getName(), BB);
+ II->setDebugLoc(CI->getDebugLoc());
+ II->setCallingConv(CI->getCallingConv());
+ II->setAttributes(CI->getAttributes());
+
+ // Make sure that anything using the call now uses the invoke! This also
+ // updates the CallGraph if present, because it uses a WeakVH.
+ CI->replaceAllUsesWith(II);
+
+ // Delete the original call
+ Split->getInstList().pop_front();
+ return Split;
+}
+
static bool markAliveBlocks(Function &F,
SmallPtrSetImpl<BasicBlock*> &Reachable) {
@@ -1586,10 +1684,10 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J,
SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
K->dropUnknownNonDebugMetadata(KnownIDs);
K->getAllMetadataOtherThanDebugLoc(Metadata);
- for (unsigned i = 0, n = Metadata.size(); i < n; ++i) {
- unsigned Kind = Metadata[i].first;
+ for (const auto &MD : Metadata) {
+ unsigned Kind = MD.first;
MDNode *JMD = J->getMetadata(Kind);
- MDNode *KMD = Metadata[i].second;
+ MDNode *KMD = MD.second;
switch (Kind) {
default:
@@ -1646,6 +1744,17 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J,
K->setMetadata(LLVMContext::MD_invariant_group, JMD);
}
+void llvm::combineMetadataForCSE(Instruction *K, const Instruction *J) {
+ unsigned KnownIDs[] = {
+ LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias, LLVMContext::MD_range,
+ LLVMContext::MD_invariant_load, LLVMContext::MD_nonnull,
+ LLVMContext::MD_invariant_group, LLVMContext::MD_align,
+ LLVMContext::MD_dereferenceable,
+ LLVMContext::MD_dereferenceable_or_null};
+ combineMetadata(K, J, KnownIDs);
+}
+
unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
DominatorTree &DT,
const BasicBlockEdge &Root) {
@@ -1703,6 +1812,7 @@ bool llvm::callsGCLeafFunction(ImmutableCallSite CS) {
return false;
}
+namespace {
/// A potential constituent of a bitreverse or bswap expression. See
/// collectBitParts for a fuller explanation.
struct BitPart {
@@ -1718,6 +1828,7 @@ struct BitPart {
enum { Unset = -1 };
};
+} // end anonymous namespace
/// Analyze the specified subexpression and see if it is capable of providing
/// pieces of a bswap or bitreverse. The subexpression provides a potential
@@ -1954,23 +2065,12 @@ bool llvm::recognizeBSwapOrBitReverseIdiom(
// in ASan/MSan/TSan/DFSan, and thus make us miss some memory accesses,
// we mark affected calls as NoBuiltin, which will disable optimization
// in CodeGen.
-void llvm::maybeMarkSanitizerLibraryCallNoBuiltin(CallInst *CI,
- const TargetLibraryInfo *TLI) {
+void llvm::maybeMarkSanitizerLibraryCallNoBuiltin(
+ CallInst *CI, const TargetLibraryInfo *TLI) {
Function *F = CI->getCalledFunction();
LibFunc::Func Func;
- if (!F || F->hasLocalLinkage() || !F->hasName() ||
- !TLI->getLibFunc(F->getName(), Func))
- return;
- switch (Func) {
- default: break;
- case LibFunc::memcmp:
- case LibFunc::memchr:
- case LibFunc::strcpy:
- case LibFunc::stpcpy:
- case LibFunc::strcmp:
- case LibFunc::strlen:
- case LibFunc::strnlen:
- CI->addAttribute(AttributeSet::FunctionIndex, Attribute::NoBuiltin);
- break;
- }
+ if (F && !F->hasLocalLinkage() && F->hasName() &&
+ TLI->getLibFunc(F->getName(), Func) && TLI->hasOptimizedCodeGen(Func) &&
+ !F->doesNotAccessMemory())
+ CI->addAttribute(AttributeSet::FunctionIndex, Attribute::NoBuiltin);
}
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp
index 2846e8f..00cda2a 100644
--- a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp
@@ -361,25 +361,12 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader,
// Fix LCSSA form for L. Some values, which previously were only used inside
// L, can now be used in NewOuter loop. We need to insert phi-nodes for them
// in corresponding exit blocks.
+  // We don't need to form LCSSA recursively, because there cannot be uses
+  // inside the newly created loop of defs from inner loops: those would
+  // already be uses of LCSSA phi nodes.
+ formLCSSA(*L, *DT, LI, SE);
- // Go through all instructions in OuterLoopBlocks and check if they are
- // using operands from the inner loop. In this case we'll need to fix LCSSA
- // for these instructions.
- SmallSetVector<Instruction *, 8> WorklistSet;
- for (BasicBlock *OuterBB: OuterLoopBlocks) {
- for (Instruction &I : *OuterBB) {
- for (Value *Op : I.operands()) {
- Instruction *OpI = dyn_cast<Instruction>(Op);
- if (!OpI || !L->contains(OpI))
- continue;
- WorklistSet.insert(OpI);
- }
- }
- }
- SmallVector<Instruction *, 8> Worklist(WorklistSet.begin(),
- WorklistSet.end());
- formLCSSAForInstructions(Worklist, *DT, *LI);
- assert(NewOuter->isRecursivelyLCSSAForm(*DT) &&
+ assert(NewOuter->isRecursivelyLCSSAForm(*DT, *LI) &&
"LCSSA is broken after separating nested loops!");
}
@@ -483,13 +470,21 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,
}
// Now that all of the PHI nodes have been inserted and adjusted, modify the
- // backedge blocks to just to the BEBlock instead of the header.
+ // backedge blocks to jump to the BEBlock instead of the header.
+ // If one of the backedges has llvm.loop metadata attached, we remove
+ // it from the backedge and add it to BEBlock.
+ unsigned LoopMDKind = BEBlock->getContext().getMDKindID("llvm.loop");
+ MDNode *LoopMD = nullptr;
for (unsigned i = 0, e = BackedgeBlocks.size(); i != e; ++i) {
TerminatorInst *TI = BackedgeBlocks[i]->getTerminator();
+ if (!LoopMD)
+ LoopMD = TI->getMetadata(LoopMDKind);
+ TI->setMetadata(LoopMDKind, nullptr);
for (unsigned Op = 0, e = TI->getNumSuccessors(); Op != e; ++Op)
if (TI->getSuccessor(Op) == Header)
TI->setSuccessor(Op, BEBlock);
}
+ BEBlock->getTerminator()->setMetadata(LoopMDKind, LoopMD);
//===--- Update all analyses which we must preserve now -----------------===//
@@ -535,7 +530,7 @@ ReprocessLoop:
// Zap the dead pred's terminator and replace it with unreachable.
TerminatorInst *TI = P->getTerminator();
- changeToUnreachable(TI, /*UseLLVMTrap=*/false);
+ changeToUnreachable(TI, /*UseLLVMTrap=*/false, PreserveLCSSA);
Changed = true;
}
}
@@ -635,8 +630,10 @@ ReprocessLoop:
(PN = dyn_cast<PHINode>(I++)); )
if (Value *V = SimplifyInstruction(PN, DL, nullptr, DT, AC)) {
if (SE) SE->forgetValue(PN);
- PN->replaceAllUsesWith(V);
- PN->eraseFromParent();
+ if (!PreserveLCSSA || LI->replacementPreservesLCSSAForm(PN, V)) {
+ PN->replaceAllUsesWith(V);
+ PN->eraseFromParent();
+ }
}
// If this loop has multiple exits and the exits all go to the same
@@ -821,8 +818,8 @@ bool LoopSimplify::runOnFunction(Function &F) {
if (PreserveLCSSA) {
assert(DT && "DT not available.");
assert(LI && "LI not available.");
- bool InLCSSA =
- all_of(*LI, [&](Loop *L) { return L->isRecursivelyLCSSAForm(*DT); });
+ bool InLCSSA = all_of(
+ *LI, [&](Loop *L) { return L->isRecursivelyLCSSAForm(*DT, *LI); });
assert(InLCSSA && "Requested to preserve LCSSA, but it's already broken.");
}
#endif
@@ -833,8 +830,8 @@ bool LoopSimplify::runOnFunction(Function &F) {
#ifndef NDEBUG
if (PreserveLCSSA) {
- bool InLCSSA =
- all_of(*LI, [&](Loop *L) { return L->isRecursivelyLCSSAForm(*DT); });
+ bool InLCSSA = all_of(
+ *LI, [&](Loop *L) { return L->isRecursivelyLCSSAForm(*DT, *LI); });
assert(InLCSSA && "LCSSA is broken after loop-simplify.");
}
#endif
@@ -842,7 +839,7 @@ bool LoopSimplify::runOnFunction(Function &F) {
}
PreservedAnalyses LoopSimplifyPass::run(Function &F,
- AnalysisManager<Function> &AM) {
+ FunctionAnalysisManager &AM) {
bool Changed = false;
LoopInfo *LI = &AM.getResult<LoopAnalysis>(F);
DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
@@ -854,6 +851,10 @@ PreservedAnalyses LoopSimplifyPass::run(Function &F,
for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
Changed |= simplifyLoop(*I, DT, LI, SE, AC, true /* PreserveLCSSA */);
+ // FIXME: We need to invalidate this to avoid PR28400. Is there a better
+ // solution?
+ AM.invalidate<ScalarEvolutionAnalysis>(F);
+
if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA;
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 7f1f78f..e346ebd 100644
--- a/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -23,11 +23,12 @@
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -46,7 +47,7 @@ STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled");
STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)");
static cl::opt<bool>
-UnrollRuntimeEpilog("unroll-runtime-epilog", cl::init(true), cl::Hidden,
+UnrollRuntimeEpilog("unroll-runtime-epilog", cl::init(false), cl::Hidden,
cl::desc("Allow runtime unrolled loops to be unrolled "
"with epilog instead of prolog."));
@@ -171,20 +172,58 @@ static bool needToInsertPhisForLCSSA(Loop *L, std::vector<BasicBlock *> Blocks,
return false;
}
+/// Adds ClonedBB to LoopInfo, creates a new loop for ClonedBB if necessary
+/// and adds a mapping from the original loop to the new loop to NewLoops.
+/// Returns a pointer to the original loop that OriginalBB was part of if a
+/// new loop was created for ClonedBB, and nullptr otherwise.
+const Loop* llvm::addClonedBlockToLoopInfo(BasicBlock *OriginalBB,
+ BasicBlock *ClonedBB, LoopInfo *LI,
+ NewLoopsMap &NewLoops) {
+ // Figure out which loop New is in.
+ const Loop *OldLoop = LI->getLoopFor(OriginalBB);
+ assert(OldLoop && "Should (at least) be in the loop being unrolled!");
+
+ Loop *&NewLoop = NewLoops[OldLoop];
+ if (!NewLoop) {
+ // Found a new sub-loop.
+ assert(OriginalBB == OldLoop->getHeader() &&
+ "Header should be first in RPO");
+
+ NewLoop = new Loop();
+ Loop *NewLoopParent = NewLoops.lookup(OldLoop->getParentLoop());
+
+ if (NewLoopParent)
+ NewLoopParent->addChildLoop(NewLoop);
+ else
+ LI->addTopLevelLoop(NewLoop);
+
+ NewLoop->addBasicBlockToLoop(ClonedBB, *LI);
+ return OldLoop;
+ } else {
+ NewLoop->addBasicBlockToLoop(ClonedBB, *LI);
+ return nullptr;
+ }
+}
+
/// Unroll the given loop by Count. The loop must be in LCSSA form. Returns true
/// if unrolling was successful, or false if the loop was unmodified. Unrolling
/// can only fail when the loop's latch block is not terminated by a conditional
/// branch instruction. However, if the trip count (and multiple) are not known,
/// loop unrolling will mostly produce more code that is no faster.
///
-/// TripCount is generally defined as the number of times the loop header
-/// executes. UnrollLoop relaxes the definition to permit early exits: here
-/// TripCount is the iteration on which control exits LatchBlock if no early
-/// exits were taken. Note that UnrollLoop assumes that the loop counter test
-/// terminates LatchBlock in order to remove unnecesssary instances of the
-/// test. In other words, control may exit the loop prior to TripCount
-/// iterations via an early branch, but control may not exit the loop from the
-/// LatchBlock's terminator prior to TripCount iterations.
+/// TripCount is the upper bound of the iteration on which control exits
+/// LatchBlock. Control may exit the loop prior to TripCount iterations either
+/// via an early branch in another loop block or via the LatchBlock terminator.
+/// This is relaxed from the general definition of trip count, which is the
+/// number of times the loop header executes. Note that UnrollLoop assumes that
+/// the loop counter test is in LatchBlock in order to remove unnecessary
+/// instances of the test. If control can exit the loop from the LatchBlock's
+/// terminator prior to TripCount iterations, the PreserveCondBr flag needs to
+/// be set.
+///
+/// PreserveCondBr indicates whether the conditional branch of the LatchBlock
+/// needs to be preserved. It is needed when we use trip count upper bound to
+/// fully unroll the loop. If PreserveOnlyFirst is also set then only the first
+/// conditional branch needs to be preserved.
///
/// Similarly, TripMultiple divides the number of times that the LatchBlock may
/// execute without exiting the loop.
@@ -196,15 +235,21 @@ static bool needToInsertPhisForLCSSA(Loop *L, std::vector<BasicBlock *> Blocks,
/// runtime-unroll the loop if computing RuntimeTripCount will be expensive and
/// AllowExpensiveTripCount is false.
///
+/// If we want to perform PGO-based loop peeling, PeelCount is set to the
+/// number of iterations we want to peel off.
+///
/// The LoopInfo Analysis that is passed will be kept consistent.
///
/// This utility preserves LoopInfo. It will also preserve ScalarEvolution and
/// DominatorTree if they are non-null.
bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
bool AllowRuntime, bool AllowExpensiveTripCount,
- unsigned TripMultiple, LoopInfo *LI, ScalarEvolution *SE,
- DominatorTree *DT, AssumptionCache *AC,
+ bool PreserveCondBr, bool PreserveOnlyFirst,
+ unsigned TripMultiple, unsigned PeelCount, LoopInfo *LI,
+ ScalarEvolution *SE, DominatorTree *DT,
+ AssumptionCache *AC, OptimizationRemarkEmitter *ORE,
bool PreserveLCSSA) {
+
BasicBlock *Preheader = L->getLoopPreheader();
if (!Preheader) {
DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n");
@@ -250,9 +295,8 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
if (TripCount != 0 && Count > TripCount)
Count = TripCount;
- // Don't enter the unroll code if there is nothing to do. This way we don't
- // need to support "partial unrolling by 1".
- if (TripCount == 0 && Count < 2)
+ // Don't enter the unroll code if there is nothing to do.
+ if (TripCount == 0 && Count < 2 && PeelCount == 0)
return false;
assert(Count > 0);
@@ -272,14 +316,22 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
// now we just recompute LCSSA for the outer loop, but it should be possible
// to fix it in-place.
bool NeedToFixLCSSA = PreserveLCSSA && CompletelyUnroll &&
- std::any_of(ExitBlocks.begin(), ExitBlocks.end(),
- [&](BasicBlock *BB) { return isa<PHINode>(BB->begin()); });
+ any_of(ExitBlocks, [](const BasicBlock *BB) {
+ return isa<PHINode>(BB->begin());
+ });
// We assume a run-time trip count if the compiler cannot
// figure out the loop trip count and the unroll-runtime
// flag is specified.
bool RuntimeTripCount = (TripCount == 0 && Count > 0 && AllowRuntime);
+ assert((!RuntimeTripCount || !PeelCount) &&
+ "Did not expect runtime trip-count unrolling "
+ "and peeling for the same loop");
+
+ if (PeelCount)
+ peelLoop(L, PeelCount, LI, SE, DT, PreserveLCSSA);
+
// Loops containing convergent instructions must have a count that divides
// their TripMultiple.
DEBUG(
@@ -293,9 +345,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
"Unroll count must divide trip multiple if loop contains a "
"convergent operation.");
});
- // Don't output the runtime loop remainder if Count is a multiple of
- // TripMultiple. Such a remainder is never needed, and is unsafe if the loop
- // contains a convergent instruction.
+
if (RuntimeTripCount && TripMultiple % Count != 0 &&
!UnrollRuntimeLoopRemainder(L, Count, AllowExpensiveTripCount,
UnrollRuntimeEpilog, LI, SE, DT,
@@ -322,35 +372,40 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
(unsigned)GreatestCommonDivisor64(Count, TripMultiple);
}
+ using namespace ore;
// Report the unrolling decision.
- DebugLoc LoopLoc = L->getStartLoc();
- Function *F = Header->getParent();
- LLVMContext &Ctx = F->getContext();
-
if (CompletelyUnroll) {
DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName()
<< " with trip count " << TripCount << "!\n");
- emitOptimizationRemark(Ctx, DEBUG_TYPE, *F, LoopLoc,
- Twine("completely unrolled loop with ") +
- Twine(TripCount) + " iterations");
+ ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
+ L->getHeader())
+ << "completely unrolled loop with "
+ << NV("UnrollCount", TripCount) << " iterations");
+ } else if (PeelCount) {
+ DEBUG(dbgs() << "PEELING loop %" << Header->getName()
+ << " with iteration count " << PeelCount << "!\n");
+ ORE->emit(OptimizationRemark(DEBUG_TYPE, "Peeled", L->getStartLoc(),
+ L->getHeader())
+ << " peeled loop by " << NV("PeelCount", PeelCount)
+ << " iterations");
} else {
- auto EmitDiag = [&](const Twine &T) {
- emitOptimizationRemark(Ctx, DEBUG_TYPE, *F, LoopLoc,
- "unrolled loop by a factor of " + Twine(Count) +
- T);
- };
+ OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
+ L->getHeader());
+ Diag << "unrolled loop by a factor of " << NV("UnrollCount", Count);
DEBUG(dbgs() << "UNROLLING loop %" << Header->getName()
<< " by " << Count);
if (TripMultiple == 0 || BreakoutTrip != TripMultiple) {
DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip);
- EmitDiag(" with a breakout at trip " + Twine(BreakoutTrip));
+ ORE->emit(Diag << " with a breakout at trip "
+ << NV("BreakoutTrip", BreakoutTrip));
} else if (TripMultiple != 1) {
DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
- EmitDiag(" with " + Twine(TripMultiple) + " trips per branch");
+ ORE->emit(Diag << " with " << NV("TripMultiple", TripMultiple)
+ << " trips per branch");
} else if (RuntimeTripCount) {
DEBUG(dbgs() << " with run-time trip count");
- EmitDiag(" with run-time trip count");
+ ORE->emit(Diag << " with run-time trip count");
}
DEBUG(dbgs() << "!\n");
}
@@ -382,6 +437,15 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
std::vector<BasicBlock*> UnrolledLoopBlocks = L->getBlocks();
+
+ // Loop Unrolling might create new loops. While we do preserve LoopInfo, we
+ // might break loop-simplified form for these loops (as they, e.g., would
+ // share the same exit blocks). We'll keep track of loops for which we can
+ // break this so that later we can re-simplify them.
+ SmallSetVector<Loop *, 4> LoopsToSimplify;
+ for (Loop *SubLoop : *L)
+ LoopsToSimplify.insert(SubLoop);
+
for (unsigned It = 1; It != Count; ++It) {
std::vector<BasicBlock*> NewBlocks;
SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
@@ -397,27 +461,14 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
assert(LI->getLoopFor(*BB) == L && "Header should not be in a sub-loop");
L->addBasicBlockToLoop(New, *LI);
} else {
- // Figure out which loop New is in.
- const Loop *OldLoop = LI->getLoopFor(*BB);
- assert(OldLoop && "Should (at least) be in the loop being unrolled!");
-
- Loop *&NewLoop = NewLoops[OldLoop];
- if (!NewLoop) {
- // Found a new sub-loop.
- assert(*BB == OldLoop->getHeader() &&
- "Header should be first in RPO");
-
- Loop *NewLoopParent = NewLoops.lookup(OldLoop->getParentLoop());
- assert(NewLoopParent &&
- "Expected parent loop before sub-loop in RPO");
- NewLoop = new Loop;
- NewLoopParent->addChildLoop(NewLoop);
+ const Loop *OldLoop = addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);
+ if (OldLoop) {
+ LoopsToSimplify.insert(NewLoops[OldLoop]);
// Forget the old loop, since its inputs may have changed.
if (SE)
SE->forgetLoop(OldLoop);
}
- NewLoop->addBasicBlockToLoop(New, *LI);
}
if (*BB == Header)
@@ -480,9 +531,14 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
}
// Remap all instructions in the most recent iteration
- for (BasicBlock *NewBlock : NewBlocks)
- for (Instruction &I : *NewBlock)
+ for (BasicBlock *NewBlock : NewBlocks) {
+ for (Instruction &I : *NewBlock) {
::remapInstruction(&I, LastValueMap);
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
+ }
+ }
}
// Loop over the PHI nodes in the original block, setting incoming values.
@@ -524,12 +580,16 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
if (CompletelyUnroll) {
if (j == 0)
Dest = LoopExit;
- NeedConditional = false;
- }
-
- // If we know the trip count or a multiple of it, we can safely use an
- // unconditional branch for some iterations.
- if (j != BreakoutTrip && (TripMultiple == 0 || j % TripMultiple != 0)) {
+ // If using trip count upper bound to completely unroll, we need to keep
+ // the conditional branch except the last one because the loop may exit
+ // after any iteration.
+ assert(NeedConditional &&
+ "NeedCondition cannot be modified by both complete "
+ "unrolling and runtime unrolling");
+ NeedConditional = (PreserveCondBr && j && !(PreserveOnlyFirst && i != 0));
+ } else if (j != BreakoutTrip && (TripMultiple == 0 || j % TripMultiple != 0)) {
+ // If we know the trip count or a multiple of it, we can safely use an
+ // unconditional branch for some iterations.
NeedConditional = false;
}
@@ -595,10 +655,6 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
}
}
- // FIXME: We could register any cloned assumptions instead of clearing the
- // whole function's cache.
- AC->clear();
-
// FIXME: We only preserve DT info for complete unrolling now. Incrementally
// updating domtree after partial loop unrolling should also be easy.
if (DT && !CompletelyUnroll)
@@ -607,7 +663,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
DEBUG(DT->verifyDomTree());
// Simplify any new induction variables in the partially unrolled loop.
- if (SE && !CompletelyUnroll) {
+ if (SE && !CompletelyUnroll && Count > 1) {
SmallVector<WeakVH, 16> DeadInsts;
simplifyLoopIVs(L, SE, DT, LI, DeadInsts);
@@ -636,6 +692,11 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
}
}
+  // TODO: After peeling or unrolling, previously loop-variant conditions are
+  // likely to fold to constants; eagerly propagating them here would reduce
+  // the number of cleanup passes that need to be run. Alternatively, a
+  // LoopEarlyCSE might be appropriate.
+
NumCompletelyUnrolled += CompletelyUnroll;
++NumUnrolled;
@@ -663,6 +724,11 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
if (!OuterL && !CompletelyUnroll)
OuterL = L;
if (OuterL) {
+ // OuterL includes all loops for which we can break loop-simplify, so
+ // it's sufficient to simplify only it (it'll recursively simplify inner
+ // loops too).
+ // TODO: That potentially might be compile-time expensive. We should try
+ // to fix the loop-simplified form incrementally.
simplifyLoop(OuterL, DT, LI, SE, AC, PreserveLCSSA);
// LCSSA must be performed on the outermost affected loop. The unrolled
@@ -678,6 +744,10 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
else
assert(OuterL->isLCSSAForm(*DT) &&
"Loops should be in LCSSA form after loop-unroll.");
+ } else {
+ // Simplify loops for which we might've broken loop-simplify form.
+ for (Loop *SubLoop : LoopsToSimplify)
+ simplifyLoop(SubLoop, DT, LI, SE, AC, PreserveLCSSA);
}
}
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp
new file mode 100644
index 0000000..842cf31
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp
@@ -0,0 +1,414 @@
+//===-- LoopUnrollPeel.cpp - Loop peeling utilities -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements some loop unrolling utilities for peeling loops
+// with dynamically inferred (from PGO) trip counts. See LoopUnroll.cpp for
+// unrolling loops with compile-time constant trip counts.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <algorithm>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unroll"
+STATISTIC(NumPeeled, "Number of loops peeled");
+
+static cl::opt<unsigned> UnrollPeelMaxCount(
+ "unroll-peel-max-count", cl::init(7), cl::Hidden,
+ cl::desc("Max average trip count which will cause loop peeling."));
+
+static cl::opt<unsigned> UnrollForcePeelCount(
+ "unroll-force-peel-count", cl::init(0), cl::Hidden,
+ cl::desc("Force a peel count regardless of profiling information."));
+
+// Check whether we are capable of peeling this loop.
+static bool canPeel(Loop *L) {
+ // Make sure the loop is in simplified form
+ if (!L->isLoopSimplifyForm())
+ return false;
+
+ // Only peel loops that contain a single exit
+ if (!L->getExitingBlock() || !L->getUniqueExitBlock())
+ return false;
+
+ return true;
+}
+
+// Compute the number of iterations we want to peel off and store it in
+// UP.PeelCount.
+void llvm::computePeelCount(Loop *L, unsigned LoopSize,
+ TargetTransformInfo::UnrollingPreferences &UP) {
+ UP.PeelCount = 0;
+ if (!canPeel(L))
+ return;
+
+ // Only try to peel innermost loops.
+ if (!L->empty())
+ return;
+
+ // If the user provided a peel count, use that.
+ bool UserPeelCount = UnrollForcePeelCount.getNumOccurrences() > 0;
+ if (UserPeelCount) {
+ DEBUG(dbgs() << "Force-peeling first " << UnrollForcePeelCount
+ << " iterations.\n");
+ UP.PeelCount = UnrollForcePeelCount;
+ return;
+ }
+
+ // If we don't know the trip count, but have reason to believe the average
+ // trip count is low, peeling should be beneficial, since we will usually
+ // hit the peeled section.
+ // We only do this in the presence of profile information, since otherwise
+ // our estimates of the trip count are not reliable enough.
+ if (UP.AllowPeeling && L->getHeader()->getParent()->getEntryCount()) {
+ Optional<unsigned> PeelCount = getLoopEstimatedTripCount(L);
+ if (!PeelCount)
+ return;
+
+ DEBUG(dbgs() << "Profile-based estimated trip count is " << *PeelCount
+ << "\n");
+
+ if (*PeelCount) {
+ if ((*PeelCount <= UnrollPeelMaxCount) &&
+ (LoopSize * (*PeelCount + 1) <= UP.Threshold)) {
+ DEBUG(dbgs() << "Peeling first " << *PeelCount << " iterations.\n");
+ UP.PeelCount = *PeelCount;
+ return;
+ }
+ DEBUG(dbgs() << "Requested peel count: " << *PeelCount << "\n");
+ DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n");
+ DEBUG(dbgs() << "Peel cost: " << LoopSize * (*PeelCount + 1) << "\n");
+ DEBUG(dbgs() << "Max peel cost: " << UP.Threshold << "\n");
+ }
+ }
+
+ return;
+}
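As an editorial illustration of the acceptance test above (PeelCount <= UnrollPeelMaxCount and LoopSize * (PeelCount + 1) <= UP.Threshold), a standalone C++ sketch with made-up numbers; none of these values come from the patch except the default of -unroll-peel-max-count:

#include <cstdio>

int main() {
  unsigned LoopSize = 12;        // size estimate of one loop body copy
  unsigned Threshold = 150;      // hypothetical UP.Threshold
  unsigned MaxPeelCount = 7;     // default of -unroll-peel-max-count
  unsigned EstimatedTrips = 3;   // e.g. a profile-based trip-count estimate
  bool Peel = EstimatedTrips <= MaxPeelCount &&
              LoopSize * (EstimatedTrips + 1) <= Threshold; // 12 * 4 = 48
  std::printf("peel %u iterations? %s\n", EstimatedTrips,
              Peel ? "yes" : "no");
  return 0;
}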
+
+/// \brief Update the branch weights of the latch of a peeled-off loop
+/// iteration.
+/// This sets the branch weights for the latch of the recently peeled off loop
+/// iteration correctly.
+/// Our goal is to make sure that:
+/// a) The total weight of all the copies of the loop body is preserved.
+/// b) The total weight of the loop exit is preserved.
+/// c) The body weight is reasonably distributed between the peeled iterations.
+///
+/// \param Header The copy of the header block that belongs to next iteration.
+/// \param LatchBR The copy of the latch branch that belongs to this iteration.
+/// \param IterNumber The serial number of the iteration that was just
+/// peeled off.
+/// \param AvgIters The average number of iterations we expect the loop to have.
+/// \param[in,out] PeeledHeaderWeight The total number of dynamic loop
+/// iterations that are unaccounted for. As an input, it represents the number
+/// of times we expect to enter the header of the iteration currently being
+/// peeled off. The output is the number of times we expect to enter the
+/// header of the next iteration.
+static void updateBranchWeights(BasicBlock *Header, BranchInst *LatchBR,
+ unsigned IterNumber, unsigned AvgIters,
+ uint64_t &PeeledHeaderWeight) {
+
+ // FIXME: Pick a more realistic distribution.
+ // Currently the proportion of weight we assign to the fall-through
+ // side of the branch drops linearly with the iteration number, and we use
+ // a 0.9 fudge factor to make the drop-off less sharp...
+ if (PeeledHeaderWeight) {
+ uint64_t FallThruWeight =
+ PeeledHeaderWeight * ((float)(AvgIters - IterNumber) / AvgIters * 0.9);
+ uint64_t ExitWeight = PeeledHeaderWeight - FallThruWeight;
+ PeeledHeaderWeight -= ExitWeight;
+
+ unsigned HeaderIdx = (LatchBR->getSuccessor(0) == Header ? 0 : 1);
+ MDBuilder MDB(LatchBR->getContext());
+ MDNode *WeightNode =
+ HeaderIdx ? MDB.createBranchWeights(ExitWeight, FallThruWeight)
+ : MDB.createBranchWeights(FallThruWeight, ExitWeight);
+ LatchBR->setMetadata(LLVMContext::MD_prof, WeightNode);
+ }
+}
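The weight distribution above is easier to follow with numbers; this standalone C++ sketch (editorial only, not part of the patch) replays the same arithmetic for an estimated average trip count of 3 and an initial header weight of 3000:

#include <cstdint>
#include <cstdio>

int main() {
  std::uint64_t PeeledHeaderWeight = 3000; // expected entries into iteration 0
  unsigned AvgIters = 3;
  for (unsigned IterNumber = 0; IterNumber < 3 && PeeledHeaderWeight;
       ++IterNumber) {
    // Same formula as updateBranchWeights: the fall-through share drops
    // linearly with the iteration number, damped by the 0.9 fudge factor.
    std::uint64_t FallThruWeight =
        PeeledHeaderWeight * ((float)(AvgIters - IterNumber) / AvgIters * 0.9);
    std::uint64_t ExitWeight = PeeledHeaderWeight - FallThruWeight;
    std::printf("iter %u: fallthrough=%llu exit=%llu\n", IterNumber,
                (unsigned long long)FallThruWeight,
                (unsigned long long)ExitWeight);
    PeeledHeaderWeight -= ExitWeight; // expected entries into next iteration
  }
  return 0;
}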
+
+/// \brief Clones the body of the loop L, putting it between \p InsertTop and \p
+/// InsertBot.
+/// \param IterNumber The serial number of the iteration currently being
+/// peeled off.
+/// \param Exit The exit block of the original loop.
+/// \param[out] NewBlocks A list of the blocks in the newly created clone.
+/// \param[out] VMap The value map between the loop and the new clone.
+/// \param LoopBlocks A helper for DFS-traversal of the loop.
+/// \param LVMap A value-map that maps instructions from the original loop to
+/// instructions in the last peeled-off iteration.
+static void cloneLoopBlocks(Loop *L, unsigned IterNumber, BasicBlock *InsertTop,
+ BasicBlock *InsertBot, BasicBlock *Exit,
+ SmallVectorImpl<BasicBlock *> &NewBlocks,
+ LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap,
+ ValueToValueMapTy &LVMap, LoopInfo *LI) {
+
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *Latch = L->getLoopLatch();
+ BasicBlock *PreHeader = L->getLoopPreheader();
+
+ Function *F = Header->getParent();
+ LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO();
+ LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
+ Loop *ParentLoop = L->getParentLoop();
+
+ // For each block in the original loop, create a new copy,
+ // and update the value map with the newly created values.
+ for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
+ BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".peel", F);
+ NewBlocks.push_back(NewBB);
+
+ if (ParentLoop)
+ ParentLoop->addBasicBlockToLoop(NewBB, *LI);
+
+ VMap[*BB] = NewBB;
+ }
+
+ // Hook-up the control flow for the newly inserted blocks.
+ // The new header is hooked up directly to the "top", which is either
+ // the original loop preheader (for the first iteration) or the previous
+ // iteration's exiting block (for every other iteration)
+ InsertTop->getTerminator()->setSuccessor(0, cast<BasicBlock>(VMap[Header]));
+
+ // Similarly, for the latch:
+ // The original exiting edge is still hooked up to the loop exit.
+ // The backedge now goes to the "bottom", which is either the loop's real
+ // header (for the last peeled iteration) or the copied header of the next
+ // iteration (for every other iteration)
+ BranchInst *LatchBR =
+ cast<BranchInst>(cast<BasicBlock>(VMap[Latch])->getTerminator());
+ unsigned HeaderIdx = (LatchBR->getSuccessor(0) == Header ? 0 : 1);
+ LatchBR->setSuccessor(HeaderIdx, InsertBot);
+ LatchBR->setSuccessor(1 - HeaderIdx, Exit);
+
+ // The new copy of the loop body starts with a bunch of PHI nodes
+ // that pick an incoming value from either the preheader, or the previous
+ // loop iteration. Since this copy is no longer part of the loop, we
+ // resolve this statically:
+ // For the first iteration, we use the value from the preheader directly.
+ // For any other iteration, we replace the phi with the value generated by
+ // the immediately preceding clone of the loop body (which represents
+ // the previous iteration).
+ for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
+ PHINode *NewPHI = cast<PHINode>(VMap[&*I]);
+ if (IterNumber == 0) {
+ VMap[&*I] = NewPHI->getIncomingValueForBlock(PreHeader);
+ } else {
+ Value *LatchVal = NewPHI->getIncomingValueForBlock(Latch);
+ Instruction *LatchInst = dyn_cast<Instruction>(LatchVal);
+ if (LatchInst && L->contains(LatchInst))
+ VMap[&*I] = LVMap[LatchInst];
+ else
+ VMap[&*I] = LatchVal;
+ }
+ cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI);
+ }
+
+ // Fix up the outgoing values - we need to add a value for the iteration
+ // we've just created. Note that this must happen *after* the incoming
+ // values are adjusted, since the value going out of the latch may also be
+ // a value coming into the header.
+ for (BasicBlock::iterator I = Exit->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PHI = cast<PHINode>(I);
+ Value *LatchVal = PHI->getIncomingValueForBlock(Latch);
+ Instruction *LatchInst = dyn_cast<Instruction>(LatchVal);
+ if (LatchInst && L->contains(LatchInst))
+ LatchVal = VMap[LatchVal];
+ PHI->addIncoming(LatchVal, cast<BasicBlock>(VMap[Latch]));
+ }
+
+ // LastValueMap is updated with the values for the current loop
+ // which are used the next time this function is called.
+ for (const auto &KV : VMap)
+ LVMap[KV.first] = KV.second;
+}
+
+/// \brief Peel off the first \p PeelCount iterations of loop \p L.
+///
+/// Note that this does not peel them off as a single straight-line block.
+/// Rather, each iteration is peeled off separately, and needs to check the
+/// exit condition.
+/// For loops that dynamically execute \p PeelCount iterations or less
+/// this provides a benefit, since the peeled off iterations, which account
+/// for the bulk of dynamic execution, can be further simplified by scalar
+/// optimizations.
+bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
+ ScalarEvolution *SE, DominatorTree *DT,
+ bool PreserveLCSSA) {
+ if (!canPeel(L))
+ return false;
+
+ LoopBlocksDFS LoopBlocks(L);
+ LoopBlocks.perform(LI);
+
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *PreHeader = L->getLoopPreheader();
+ BasicBlock *Latch = L->getLoopLatch();
+ BasicBlock *Exit = L->getUniqueExitBlock();
+
+ Function *F = Header->getParent();
+
+ // Set up all the necessary basic blocks. It is convenient to split the
+ // preheader into 3 parts - two blocks to anchor the peeled copy of the loop
+ // body, and a new preheader for the "real" loop.
+
+ // Peeling the first iteration transforms
+ //
+ // PreHeader:
+ // ...
+ // Header:
+ // LoopBody
+ // If (cond) goto Header
+ // Exit:
+ //
+ // into
+ //
+ // InsertTop:
+ // LoopBody
+ // If (!cond) goto Exit
+ // InsertBot:
+ // NewPreHeader:
+ // ...
+ // Header:
+ // LoopBody
+ // If (cond) goto Header
+ // Exit:
+ //
+ // Each following iteration will split the current bottom anchor in two,
+ // and put the new copy of the loop body between these two blocks. That is,
+ // after peeling another iteration from the example above, we'll split
+ // InsertBot, and get:
+ //
+ // InsertTop:
+ // LoopBody
+ // If (!cond) goto Exit
+ // InsertBot:
+ // LoopBody
+ // If (!cond) goto Exit
+ // InsertBot.next:
+ // NewPreHeader:
+ // ...
+ // Header:
+ // LoopBody
+ // If (cond) goto Header
+ // Exit:
+
+ BasicBlock *InsertTop = SplitEdge(PreHeader, Header, DT, LI);
+ BasicBlock *InsertBot =
+ SplitBlock(InsertTop, InsertTop->getTerminator(), DT, LI);
+ BasicBlock *NewPreHeader =
+ SplitBlock(InsertBot, InsertBot->getTerminator(), DT, LI);
+
+ InsertTop->setName(Header->getName() + ".peel.begin");
+ InsertBot->setName(Header->getName() + ".peel.next");
+ NewPreHeader->setName(PreHeader->getName() + ".peel.newph");
+
+ ValueToValueMapTy LVMap;
+
+ // If we have branch weight information, we'll want to update it for the
+ // newly created branches.
+ BranchInst *LatchBR =
+ cast<BranchInst>(cast<BasicBlock>(Latch)->getTerminator());
+ unsigned HeaderIdx = (LatchBR->getSuccessor(0) == Header ? 0 : 1);
+
+ uint64_t TrueWeight, FalseWeight;
+ uint64_t ExitWeight = 0, CurHeaderWeight = 0;
+ if (LatchBR->extractProfMetadata(TrueWeight, FalseWeight)) {
+ ExitWeight = HeaderIdx ? TrueWeight : FalseWeight;
+ // The # of times the loop body executes is the sum of the exit block
+ // weight and the # of times the backedges are taken.
+ CurHeaderWeight = TrueWeight + FalseWeight;
+ }
+
+ // For each peeled-off iteration, make a copy of the loop.
+ for (unsigned Iter = 0; Iter < PeelCount; ++Iter) {
+ SmallVector<BasicBlock *, 8> NewBlocks;
+ ValueToValueMapTy VMap;
+
+ // Subtract the exit weight from the current header weight -- the exit
+ // weight is exactly the weight of the previous iteration's header.
+ // FIXME: due to the way the distribution is constructed, we need a
+ // guard here to make sure we don't end up with non-positive weights.
+ if (ExitWeight < CurHeaderWeight)
+ CurHeaderWeight -= ExitWeight;
+ else
+ CurHeaderWeight = 1;
+
+ cloneLoopBlocks(L, Iter, InsertTop, InsertBot, Exit,
+ NewBlocks, LoopBlocks, VMap, LVMap, LI);
+ updateBranchWeights(InsertBot, cast<BranchInst>(VMap[LatchBR]), Iter,
+ PeelCount, ExitWeight);
+
+ InsertTop = InsertBot;
+ InsertBot = SplitBlock(InsertBot, InsertBot->getTerminator(), DT, LI);
+ InsertBot->setName(Header->getName() + ".peel.next");
+
+ F->getBasicBlockList().splice(InsertTop->getIterator(),
+ F->getBasicBlockList(),
+ NewBlocks[0]->getIterator(), F->end());
+
+ // Remap to use values from the current iteration instead of the
+ // previous one.
+ remapInstructionsInBlocks(NewBlocks, VMap);
+ }
+
+ // Now adjust the phi nodes in the loop header to get their initial values
+ // from the last peeled-off iteration instead of the preheader.
+ for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PHI = cast<PHINode>(I);
+ Value *NewVal = PHI->getIncomingValueForBlock(Latch);
+ Instruction *LatchInst = dyn_cast<Instruction>(NewVal);
+ if (LatchInst && L->contains(LatchInst))
+ NewVal = LVMap[LatchInst];
+
+ PHI->setIncomingValue(PHI->getBasicBlockIndex(NewPreHeader), NewVal);
+ }
+
+ // Adjust the branch weights on the loop exit.
+ if (ExitWeight) {
+ // The backedge count is the difference of current header weight and
+ // current loop exit weight. If the current header weight is smaller than
+ // the current loop exit weight, we mark the loop backedge weight as 1.
+ uint64_t BackEdgeWeight = 0;
+ if (ExitWeight < CurHeaderWeight)
+ BackEdgeWeight = CurHeaderWeight - ExitWeight;
+ else
+ BackEdgeWeight = 1;
+ MDBuilder MDB(LatchBR->getContext());
+ MDNode *WeightNode =
+ HeaderIdx ? MDB.createBranchWeights(ExitWeight, BackEdgeWeight)
+ : MDB.createBranchWeights(BackEdgeWeight, ExitWeight);
+ LatchBR->setMetadata(LLVMContext::MD_prof, WeightNode);
+ }
+
+ // If the loop is nested, we changed the parent loop, update SE.
+ if (Loop *ParentLoop = L->getParentLoop())
+ SE->forgetLoop(ParentLoop);
+
+ NumPeeled++;
+
+ return true;
+}
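In source-level terms, the transformation sketched in the CFG comments above corresponds roughly to the rewrite below; body() and cond() are illustrative placeholders, and the pass of course operates on LLVM IR rather than C++.

// Hypothetical stand-ins for the loop body and the latch condition.
bool cond();
void body();

// The original bottom-tested loop from the diagram.
void original() {
  do { body(); } while (cond());
}

// Roughly what peeling two iterations produces: each peeled copy keeps its
// own exit check, and the residual loop (behind the new preheader) runs only
// if both checks stayed true. The peeled copies can then be simplified by
// later scalar passes.
void peeledTwice() {
  body();                            // peeled iteration 0
  if (cond()) {
    body();                          // peeled iteration 1
    if (cond()) {
      do { body(); } while (cond()); // the "real" loop
    }
  }
}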
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 861a50c..d3ea156 100644
--- a/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -112,6 +112,18 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
}
}
+ // Make sure that the created prolog loop is in simplified form
+ SmallVector<BasicBlock *, 4> PrologExitPreds;
+ Loop *PrologLoop = LI->getLoopFor(PrologLatch);
+ if (PrologLoop) {
+ for (BasicBlock *PredBB : predecessors(PrologExit))
+ if (PrologLoop->contains(PredBB))
+ PrologExitPreds.push_back(PredBB);
+
+ SplitBlockPredecessors(PrologExit, PrologExitPreds, ".unr-lcssa", DT, LI,
+ PreserveLCSSA);
+ }
+
// Create a branch around the original loop, which is taken if there are no
// iterations remaining to be executed after running the prologue.
Instruction *InsertPt = PrologExit->getTerminator();
@@ -289,16 +301,23 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter,
LI->addTopLevelLoop(NewLoop);
}
+ NewLoopsMap NewLoops;
+ if (NewLoop)
+ NewLoops[L] = NewLoop;
+ else if (ParentLoop)
+ NewLoops[L] = ParentLoop;
+
// For each block in the original loop, create a new copy,
// and update the value map with the newly created values.
for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, "." + suffix, F);
NewBlocks.push_back(NewBB);
-
- if (NewLoop)
- NewLoop->addBasicBlockToLoop(NewBB, *LI);
- else if (ParentLoop)
- ParentLoop->addBasicBlockToLoop(NewBB, *LI);
+
+ // If we're unrolling the outermost loop, there's no remainder loop,
+ // and this block isn't in a nested loop, then the new block is not
+ // in any loop. Otherwise, add it to loopinfo.
+ if (CreateRemainderLoop || LI->getLoopFor(*BB) != L || ParentLoop)
+ addClonedBlockToLoopInfo(*BB, NewBB, LI, NewLoops);
VMap[*BB] = NewBB;
if (Header == *BB) {
@@ -479,11 +498,6 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
if (Log2_32(Count) > BEWidth)
return false;
- // If this loop is nested, then the loop unroller changes the code in the
- // parent loop, so the Scalar Evolution pass needs to be run again.
- if (Loop *ParentLoop = L->getParentLoop())
- SE->forgetLoop(ParentLoop);
-
BasicBlock *Latch = L->getLoopLatch();
// Loop structure is the following:
@@ -673,6 +687,12 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
ConnectProlog(L, BECount, Count, PrologExit, PreHeader, NewPreHeader,
VMap, DT, LI, PreserveLCSSA);
}
+
+ // If this loop is nested, then the loop unroller changes the code in the
+ // parent loop, so the Scalar Evolution pass needs to be run again.
+ if (Loop *ParentLoop = L->getParentLoop())
+ SE->forgetLoop(ParentLoop);
+
NumRuntimeUnrolled++;
return true;
}
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 3902c67..c8efa9e 100644
--- a/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -11,14 +11,17 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
@@ -26,7 +29,6 @@
#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -305,7 +307,7 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind,
// The instruction used by an outside user must be the last instruction
// before we feed back to the reduction phi. Otherwise, we lose VF-1
// operations on the value.
- if (std::find(Phi->op_begin(), Phi->op_end(), Cur) == Phi->op_end())
+ if (!is_contained(Phi->operands(), Cur))
return false;
ExitInstruction = Cur;
@@ -654,8 +656,8 @@ Value *RecurrenceDescriptor::createMinMaxOp(IRBuilder<> &Builder,
}
InductionDescriptor::InductionDescriptor(Value *Start, InductionKind K,
- const SCEV *Step)
- : StartValue(Start), IK(K), Step(Step) {
+ const SCEV *Step, BinaryOperator *BOp)
+ : StartValue(Start), IK(K), Step(Step), InductionBinOp(BOp) {
assert(IK != IK_NoInduction && "Not an induction");
// Start value type should match the induction kind and the value
@@ -672,7 +674,15 @@ InductionDescriptor::InductionDescriptor(Value *Start, InductionKind K,
assert((IK != IK_PtrInduction || getConstIntStepValue()) &&
"Step value should be constant for pointer induction");
- assert(Step->getType()->isIntegerTy() && "StepValue is not an integer");
+ assert((IK == IK_FpInduction || Step->getType()->isIntegerTy()) &&
+ "StepValue is not an integer");
+
+ assert((IK != IK_FpInduction || Step->getType()->isFloatingPointTy()) &&
+ "StepValue is not FP for FpInduction");
+ assert((IK != IK_FpInduction || (InductionBinOp &&
+ (InductionBinOp->getOpcode() == Instruction::FAdd ||
+ InductionBinOp->getOpcode() == Instruction::FSub))) &&
+ "Binary opcode should be specified for FP induction");
}
int InductionDescriptor::getConsecutiveDirection() const {
@@ -693,6 +703,8 @@ Value *InductionDescriptor::transform(IRBuilder<> &B, Value *Index,
const DataLayout& DL) const {
SCEVExpander Exp(*SE, DL, "induction");
+ assert(Index->getType() == Step->getType() &&
+ "Index type does not match StepValue type");
switch (IK) {
case IK_IntInduction: {
assert(Index->getType() == StartValue->getType() &&
@@ -717,29 +729,113 @@ Value *InductionDescriptor::transform(IRBuilder<> &B, Value *Index,
return Exp.expandCodeFor(S, StartValue->getType(), &*B.GetInsertPoint());
}
case IK_PtrInduction: {
- assert(Index->getType() == Step->getType() &&
- "Index type does not match StepValue type");
assert(isa<SCEVConstant>(Step) &&
"Expected constant step for pointer induction");
const SCEV *S = SE->getMulExpr(SE->getSCEV(Index), Step);
Index = Exp.expandCodeFor(S, Index->getType(), &*B.GetInsertPoint());
return B.CreateGEP(nullptr, StartValue, Index);
}
+ case IK_FpInduction: {
+ assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
+ assert(InductionBinOp &&
+ (InductionBinOp->getOpcode() == Instruction::FAdd ||
+ InductionBinOp->getOpcode() == Instruction::FSub) &&
+ "Original bin op should be defined for FP induction");
+
+ Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
+
+ // Floating point operations had to be 'fast' to enable the induction.
+ FastMathFlags Flags;
+ Flags.setUnsafeAlgebra();
+
+ Value *MulExp = B.CreateFMul(StepValue, Index);
+ if (isa<Instruction>(MulExp))
+ // We have to check, the MulExp may be a constant.
+ cast<Instruction>(MulExp)->setFastMathFlags(Flags);
+
+ Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode() , StartValue,
+ MulExp, "induction");
+ if (isa<Instruction>(BOp))
+ cast<Instruction>(BOp)->setFastMathFlags(Flags);
+
+ return BOp;
+ }
case IK_NoInduction:
return nullptr;
}
llvm_unreachable("invalid enum");
}
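As a quick numeric reading of the IK_FpInduction case above, the value produced is StartValue (FAdd or FSub) StepValue * Index. A standalone sketch of that arithmetic with made-up start and step values:

#include <cassert>

// Mirrors the value InductionDescriptor::transform computes for an FP
// induction, without any SCEV machinery; fast math is assumed as in the
// original.
static float fpInductionAt(float Start, float Step, float Index, bool IsFSub) {
  float Mul = Step * Index;
  return IsFSub ? Start - Mul : Start + Mul;
}

int main() {
  // x = 1.0, x += 0.5 per iteration: value at iteration 4 is 3.0.
  assert(fpInductionAt(1.0f, 0.5f, 4.0f, /*IsFSub=*/false) == 3.0f);
  // x = 10.0, x -= 2.0 per iteration: value at iteration 3 is 4.0.
  assert(fpInductionAt(10.0f, 2.0f, 3.0f, /*IsFSub=*/true) == 4.0f);
  return 0;
}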
-bool InductionDescriptor::isInductionPHI(PHINode *Phi,
+bool InductionDescriptor::isFPInductionPHI(PHINode *Phi, const Loop *TheLoop,
+ ScalarEvolution *SE,
+ InductionDescriptor &D) {
+
+ // Here we only handle FP induction variables.
+ assert(Phi->getType()->isFloatingPointTy() && "Unexpected Phi type");
+
+ if (TheLoop->getHeader() != Phi->getParent())
+ return false;
+
+ // The loop may have multiple entrances or multiple exits; we can analyze
+ // this phi if it has a unique entry value and a unique backedge value.
+ if (Phi->getNumIncomingValues() != 2)
+ return false;
+ Value *BEValue = nullptr, *StartValue = nullptr;
+ if (TheLoop->contains(Phi->getIncomingBlock(0))) {
+ BEValue = Phi->getIncomingValue(0);
+ StartValue = Phi->getIncomingValue(1);
+ } else {
+ assert(TheLoop->contains(Phi->getIncomingBlock(1)) &&
+ "Unexpected Phi node in the loop");
+ BEValue = Phi->getIncomingValue(1);
+ StartValue = Phi->getIncomingValue(0);
+ }
+
+ BinaryOperator *BOp = dyn_cast<BinaryOperator>(BEValue);
+ if (!BOp)
+ return false;
+
+ Value *Addend = nullptr;
+ if (BOp->getOpcode() == Instruction::FAdd) {
+ if (BOp->getOperand(0) == Phi)
+ Addend = BOp->getOperand(1);
+ else if (BOp->getOperand(1) == Phi)
+ Addend = BOp->getOperand(0);
+ } else if (BOp->getOpcode() == Instruction::FSub)
+ if (BOp->getOperand(0) == Phi)
+ Addend = BOp->getOperand(1);
+
+ if (!Addend)
+ return false;
+
+ // The addend should be loop invariant
+ if (auto *I = dyn_cast<Instruction>(Addend))
+ if (TheLoop->contains(I))
+ return false;
+
+ // FP Step has unknown SCEV
+ const SCEV *Step = SE->getUnknown(Addend);
+ D = InductionDescriptor(StartValue, IK_FpInduction, Step, BOp);
+ return true;
+}
+
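For context, the PHIs this routine accepts correspond to source loops like the sketch below: the header PHI's backedge value is a single FAdd (or FSub) of the PHI and a loop-invariant addend. Names are illustrative, and the fast-math requirement mentioned in transform() is enforced by the callers rather than here.

// X is recognized as an FP induction: start value from the preheader,
// backedge value X + Step with a loop-invariant Step.
float sumOfInductionValues(int N, float Start, float Step) {
  float X = Start; // StartValue: the preheader incoming value
  float Acc = 0.0f;
  for (int I = 0; I < N; ++I) {
    Acc += X;
    X += Step;     // BEValue: FAdd of the PHI and a loop-invariant addend
  }
  return Acc;
}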
+bool InductionDescriptor::isInductionPHI(PHINode *Phi, const Loop *TheLoop,
PredicatedScalarEvolution &PSE,
InductionDescriptor &D,
bool Assume) {
Type *PhiTy = Phi->getType();
- // We only handle integer and pointer inductions variables.
- if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
+
+ // Handle integer and pointer induction variables.
+ // FP inductions are also handled now, but we do not try to build a
+ // recurrence expression from the PHI node in place.
+
+ if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy() &&
+ !PhiTy->isFloatTy() && !PhiTy->isDoubleTy() && !PhiTy->isHalfTy())
return false;
+ if (PhiTy->isFloatingPointTy())
+ return isFPInductionPHI(Phi, TheLoop, PSE.getSE(), D);
+
const SCEV *PhiScev = PSE.getSCEV(Phi);
const auto *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
@@ -752,10 +848,10 @@ bool InductionDescriptor::isInductionPHI(PHINode *Phi,
return false;
}
- return isInductionPHI(Phi, PSE.getSE(), D, AR);
+ return isInductionPHI(Phi, TheLoop, PSE.getSE(), D, AR);
}
-bool InductionDescriptor::isInductionPHI(PHINode *Phi,
+bool InductionDescriptor::isInductionPHI(PHINode *Phi, const Loop *TheLoop,
ScalarEvolution *SE,
InductionDescriptor &D,
const SCEV *Expr) {
@@ -773,15 +869,20 @@ bool InductionDescriptor::isInductionPHI(PHINode *Phi,
return false;
}
- assert(AR->getLoop()->getHeader() == Phi->getParent() &&
- "PHI is an AddRec for a different loop?!");
+ if (AR->getLoop() != TheLoop) {
+ // FIXME: We should treat this as a uniform. Unfortunately, we
+ // don't currently know how to handled uniform PHIs.
+ DEBUG(dbgs() << "LV: PHI is a recurrence with respect to an outer loop.\n");
+ return false;
+ }
+
Value *StartValue =
Phi->getIncomingValueForBlock(AR->getLoop()->getLoopPreheader());
const SCEV *Step = AR->getStepRecurrence(*SE);
// Calculate the pointer stride and check if it is consecutive.
// The stride may be a constant or a loop invariant integer value.
const SCEVConstant *ConstStep = dyn_cast<SCEVConstant>(Step);
- if (!ConstStep && !SE->isLoopInvariant(Step, AR->getLoop()))
+ if (!ConstStep && !SE->isLoopInvariant(Step, TheLoop))
return false;
if (PhiTy->isIntegerTy()) {
@@ -824,7 +925,7 @@ SmallVector<Instruction *, 8> llvm::findDefsUsedOutsideOfLoop(Loop *L) {
// be adapted into a pointer.
for (auto &Inst : *Block) {
auto Users = Inst.users();
- if (std::any_of(Users.begin(), Users.end(), [&](User *U) {
+ if (any_of(Users, [&](User *U) {
auto *Use = cast<Instruction>(U);
return !L->contains(Use->getParent());
}))
@@ -851,6 +952,10 @@ void llvm::getLoopAnalysisUsage(AnalysisUsage &AU) {
AU.addPreservedID(LoopSimplifyID);
AU.addRequiredID(LCSSAID);
AU.addPreservedID(LCSSAID);
+ // This is used in the LPPassManager to perform LCSSA verification on passes
+ // which preserve lcssa form
+ AU.addRequired<LCSSAVerificationPass>();
+ AU.addPreserved<LCSSAVerificationPass>();
// Loop passes are designed to run inside of a loop pass manager which means
// that any function analyses they require must be required by the first loop
@@ -967,3 +1072,36 @@ bool llvm::isGuaranteedToExecute(const Instruction &Inst,
// just a special case of this.)
return true;
}
+
+Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {
+ // Only support loops with a unique exiting block, and a latch.
+ if (!L->getExitingBlock())
+ return None;
+
+ // Get the branch weights for the loop's backedge.
+ BranchInst *LatchBR =
+ dyn_cast<BranchInst>(L->getLoopLatch()->getTerminator());
+ if (!LatchBR || LatchBR->getNumSuccessors() != 2)
+ return None;
+
+ assert((LatchBR->getSuccessor(0) == L->getHeader() ||
+ LatchBR->getSuccessor(1) == L->getHeader()) &&
+ "At least one edge out of the latch must go to the header");
+
+ // To estimate the number of times the loop body was executed, we want to
+ // know the number of times the backedge was taken, vs. the number of times
+ // we exited the loop.
+ uint64_t TrueVal, FalseVal;
+ if (!LatchBR->extractProfMetadata(TrueVal, FalseVal))
+ return None;
+
+ if (!TrueVal || !FalseVal)
+ return 0;
+
+ // Divide the count of the backedge by the count of the edge exiting the loop,
+ // rounding to nearest.
+ if (LatchBR->getSuccessor(0) == L->getHeader())
+ return (TrueVal + (FalseVal / 2)) / FalseVal;
+ else
+ return (FalseVal + (TrueVal / 2)) / TrueVal;
+}
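As a worked example of the round-to-nearest division above: weights of 990 on the backedge and 10 on the exit edge give an estimate of (990 + 5) / 10 = 99. A standalone mirror of the arithmetic, with the zero-weight guard left to the caller as in the original:

#include <cassert>
#include <cstdint>

// Same divide-and-round-to-nearest as getLoopEstimatedTripCount, with the
// backedge and exit weights passed in directly instead of being read from
// !prof metadata. ExitWeight must be non-zero.
static uint64_t estimateTripCount(uint64_t BackedgeWeight, uint64_t ExitWeight) {
  return (BackedgeWeight + ExitWeight / 2) / ExitWeight;
}

int main() {
  assert(estimateTripCount(990, 10) == 99); // hot loop body
  assert(estimateTripCount(7, 10) == 1);    // loop that usually runs once
  return 0;
}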
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp
index b3c6169..29756d9 100644
--- a/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp
@@ -36,7 +36,7 @@ LoopVersioning::LoopVersioning(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI,
: VersionedLoop(L), NonVersionedLoop(nullptr), LAI(LAI), LI(LI), DT(DT),
SE(SE) {
assert(L->getExitBlock() && "No single exit block");
- assert(L->getLoopPreheader() && "No preheader");
+ assert(L->isLoopSimplifyForm() && "Loop is not in loop-simplify form");
if (UseLAIChecks) {
setAliasChecks(LAI.getRuntimePointerChecking()->getChecks());
setSCEVChecks(LAI.getPSE().getUnionPredicate());
@@ -278,8 +278,8 @@ public:
bool Changed = false;
for (Loop *L : Worklist) {
const LoopAccessInfo &LAI = LAA->getInfo(L);
- if (LAI.getNumRuntimePointerChecks() ||
- !LAI.getPSE().getUnionPredicate().isAlwaysTrue()) {
+ if (L->isLoopSimplifyForm() && (LAI.getNumRuntimePointerChecks() ||
+ !LAI.getPSE().getUnionPredicate().isAlwaysTrue())) {
LoopVersioning LVer(LAI, L, LI, DT, SE);
LVer.versionLoop();
LVer.annotateLoopWithNoAlias();
diff --git a/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp b/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp
index 1b31c5a..ee84541 100644
--- a/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp
@@ -14,6 +14,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Utils/LowerInvoke.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/IR/Instructions.h"
@@ -28,36 +29,29 @@ using namespace llvm;
STATISTIC(NumInvokes, "Number of invokes replaced");
namespace {
- class LowerInvoke : public FunctionPass {
+ class LowerInvokeLegacyPass : public FunctionPass {
public:
static char ID; // Pass identification, replacement for typeid
- explicit LowerInvoke() : FunctionPass(ID) {
- initializeLowerInvokePass(*PassRegistry::getPassRegistry());
+ explicit LowerInvokeLegacyPass() : FunctionPass(ID) {
+ initializeLowerInvokeLegacyPassPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override;
};
}
-char LowerInvoke::ID = 0;
-INITIALIZE_PASS(LowerInvoke, "lowerinvoke",
+char LowerInvokeLegacyPass::ID = 0;
+INITIALIZE_PASS(LowerInvokeLegacyPass, "lowerinvoke",
"Lower invoke and unwind, for unwindless code generators",
false, false)
-char &llvm::LowerInvokePassID = LowerInvoke::ID;
-
-// Public Interface To the LowerInvoke pass.
-FunctionPass *llvm::createLowerInvokePass() {
- return new LowerInvoke();
-}
-
-bool LowerInvoke::runOnFunction(Function &F) {
+static bool runImpl(Function &F) {
bool Changed = false;
for (BasicBlock &BB : F)
if (InvokeInst *II = dyn_cast<InvokeInst>(BB.getTerminator())) {
- SmallVector<Value*,16> CallArgs(II->op_begin(), II->op_end() - 3);
+ SmallVector<Value *, 16> CallArgs(II->op_begin(), II->op_end() - 3);
// Insert a normal call instruction...
- CallInst *NewCall = CallInst::Create(II->getCalledValue(),
- CallArgs, "", II);
+ CallInst *NewCall =
+ CallInst::Create(II->getCalledValue(), CallArgs, "", II);
NewCall->takeName(II);
NewCall->setCallingConv(II->getCallingConv());
NewCall->setAttributes(II->getAttributes());
@@ -73,7 +67,28 @@ bool LowerInvoke::runOnFunction(Function &F) {
// Remove the invoke instruction now.
BB.getInstList().erase(II);
- ++NumInvokes; Changed = true;
+ ++NumInvokes;
+ Changed = true;
}
return Changed;
}
+
+bool LowerInvokeLegacyPass::runOnFunction(Function &F) {
+ return runImpl(F);
+}
+
+namespace llvm {
+char &LowerInvokePassID = LowerInvokeLegacyPass::ID;
+
+// Public Interface To the LowerInvoke pass.
+FunctionPass *createLowerInvokePass() { return new LowerInvokeLegacyPass(); }
+
+PreservedAnalyses LowerInvokePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ bool Changed = runImpl(F);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp b/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp
index 5c07469..75cd3bc 100644
--- a/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp
@@ -478,10 +478,10 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI,
// cases.
assert(MaxPop > 0 && PopSucc);
Default = PopSucc;
- Cases.erase(std::remove_if(
- Cases.begin(), Cases.end(),
- [PopSucc](const CaseRange &R) { return R.BB == PopSucc; }),
- Cases.end());
+ Cases.erase(
+ remove_if(Cases,
+ [PopSucc](const CaseRange &R) { return R.BB == PopSucc; }),
+ Cases.end());
// If there are no cases left, just branch.
if (Cases.empty()) {
diff --git a/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp b/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp
index 1419254..24b3b12 100644
--- a/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp
@@ -53,7 +53,7 @@ static bool promoteMemoryToRegister(Function &F, DominatorTree &DT,
return Changed;
}
-PreservedAnalyses PromotePass::run(Function &F, AnalysisManager<Function> &AM) {
+PreservedAnalyses PromotePass::run(Function &F, FunctionAnalysisManager &AM) {
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &AC = AM.getResult<AssumptionAnalysis>(F);
if (!promoteMemoryToRegister(F, DT, AC))
diff --git a/contrib/llvm/lib/Transforms/Utils/MemorySSA.cpp b/contrib/llvm/lib/Transforms/Utils/MemorySSA.cpp
index 8ba3cae..1ce4225 100644
--- a/contrib/llvm/lib/Transforms/Utils/MemorySSA.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/MemorySSA.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
@@ -60,6 +61,11 @@ INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_END(MemorySSAPrinterLegacyPass, "print-memoryssa",
"Memory SSA Printer", false, false)
+static cl::opt<unsigned> MaxCheckLimit(
+ "memssa-check-limit", cl::Hidden, cl::init(100),
+ cl::desc("The maximum number of stores/phis MemorySSA"
+ "will consider trying to walk past (default = 100)"));
+
static cl::opt<bool>
VerifyMemorySSA("verify-memoryssa", cl::init(false), cl::Hidden,
cl::desc("Verify MemorySSA in legacy printer pass."));
@@ -86,7 +92,963 @@ public:
OS << "; " << *MA << "\n";
}
};
+}
+
+namespace {
+/// Our current alias analysis API differentiates heavily between calls and
+/// non-calls, and functions called on one usually assert on the other.
+/// This class encapsulates the distinction to simplify other code that wants
+/// "Memory affecting instructions and related data" to use as a key.
+/// For example, this class is used as a densemap key in the use optimizer.
+class MemoryLocOrCall {
+public:
+ MemoryLocOrCall() : IsCall(false) {}
+ MemoryLocOrCall(MemoryUseOrDef *MUD)
+ : MemoryLocOrCall(MUD->getMemoryInst()) {}
+ MemoryLocOrCall(const MemoryUseOrDef *MUD)
+ : MemoryLocOrCall(MUD->getMemoryInst()) {}
+
+ MemoryLocOrCall(Instruction *Inst) {
+ if (ImmutableCallSite(Inst)) {
+ IsCall = true;
+ CS = ImmutableCallSite(Inst);
+ } else {
+ IsCall = false;
+ // There is no such thing as a MemoryLocation for a fence instruction; it is
+ // unique in that regard.
+ if (!isa<FenceInst>(Inst))
+ Loc = MemoryLocation::get(Inst);
+ }
+ }
+
+ explicit MemoryLocOrCall(const MemoryLocation &Loc)
+ : IsCall(false), Loc(Loc) {}
+
+ bool IsCall;
+ ImmutableCallSite getCS() const {
+ assert(IsCall);
+ return CS;
+ }
+ MemoryLocation getLoc() const {
+ assert(!IsCall);
+ return Loc;
+ }
+
+ bool operator==(const MemoryLocOrCall &Other) const {
+ if (IsCall != Other.IsCall)
+ return false;
+
+ if (IsCall)
+ return CS.getCalledValue() == Other.CS.getCalledValue();
+ return Loc == Other.Loc;
+ }
+
+private:
+ union {
+ ImmutableCallSite CS;
+ MemoryLocation Loc;
+ };
+};
+}
+
+namespace llvm {
+template <> struct DenseMapInfo<MemoryLocOrCall> {
+ static inline MemoryLocOrCall getEmptyKey() {
+ return MemoryLocOrCall(DenseMapInfo<MemoryLocation>::getEmptyKey());
+ }
+ static inline MemoryLocOrCall getTombstoneKey() {
+ return MemoryLocOrCall(DenseMapInfo<MemoryLocation>::getTombstoneKey());
+ }
+ static unsigned getHashValue(const MemoryLocOrCall &MLOC) {
+ if (MLOC.IsCall)
+ return hash_combine(MLOC.IsCall,
+ DenseMapInfo<const Value *>::getHashValue(
+ MLOC.getCS().getCalledValue()));
+ return hash_combine(
+ MLOC.IsCall, DenseMapInfo<MemoryLocation>::getHashValue(MLOC.getLoc()));
+ }
+ static bool isEqual(const MemoryLocOrCall &LHS, const MemoryLocOrCall &RHS) {
+ return LHS == RHS;
+ }
+};
+
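A minimal sketch, in the context of this file, of what the specialization buys: a single DenseMap can cache a result keyed on either a call site or a plain memory location. The map and helper names below are illustrative only.

// One cache for both kinds of accesses; calls hash on the called value,
// everything else on its MemoryLocation (see getHashValue above).
static DenseMap<MemoryLocOrCall, MemoryAccess *> LastKnownClobber;

static void rememberClobber(MemoryUseOrDef *MUD, MemoryAccess *Clobber) {
  LastKnownClobber[MemoryLocOrCall(MUD)] = Clobber;
}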
+enum class Reorderability { Always, IfNoAlias, Never };
+
+/// This does one-way checks to see if Use could theoretically be hoisted above
+/// MayClobber. This will not check the other way around.
+///
+/// This assumes that, for the purposes of MemorySSA, Use comes directly after
+/// MayClobber, with no potentially clobbering operations in between them.
+/// (Where potentially clobbering ops are memory barriers, aliased stores, etc.)
+static Reorderability getLoadReorderability(const LoadInst *Use,
+ const LoadInst *MayClobber) {
+ bool VolatileUse = Use->isVolatile();
+ bool VolatileClobber = MayClobber->isVolatile();
+ // Volatile operations may never be reordered with other volatile operations.
+ if (VolatileUse && VolatileClobber)
+ return Reorderability::Never;
+
+ // The lang ref allows reordering of volatile and non-volatile operations.
+ // Whether an aliasing nonvolatile load and volatile load can be reordered,
+ // though, is ambiguous. Because it may not be best to exploit this ambiguity,
+ // we only allow volatile/non-volatile reordering if the volatile and
+ // non-volatile operations don't alias.
+ Reorderability Result = VolatileUse || VolatileClobber
+ ? Reorderability::IfNoAlias
+ : Reorderability::Always;
+
+ // If a load is seq_cst, it cannot be moved above other loads. If its ordering
+ // is weaker, it can be moved above other loads. We just need to be sure that
+ // MayClobber isn't an acquire load, because loads can't be moved above
+ // acquire loads.
+ //
+ // Note that this explicitly *does* allow the free reordering of monotonic (or
+ // weaker) loads of the same address.
+ bool SeqCstUse = Use->getOrdering() == AtomicOrdering::SequentiallyConsistent;
+ bool MayClobberIsAcquire = isAtLeastOrStrongerThan(MayClobber->getOrdering(),
+ AtomicOrdering::Acquire);
+ if (SeqCstUse || MayClobberIsAcquire)
+ return Reorderability::Never;
+ return Result;
+}
+
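Summarizing the rules above for a Use load sitting after a MayClobber load: two volatile loads never reorder; exactly one volatile load reorders only when the two don't alias; and a seq_cst Use or an acquire-or-stronger MayClobber never reorders. A standalone mirror of that decision, with the relevant properties passed as plain booleans:

#include <cassert>

enum class Reorder { Always, IfNoAlias, Never };

// Mirrors getLoadReorderability's decision table without any LLVM types.
static Reorder loadReorder(bool UseVolatile, bool ClobberVolatile,
                           bool UseSeqCst, bool ClobberAtLeastAcquire) {
  if (UseVolatile && ClobberVolatile)
    return Reorder::Never;
  Reorder R = (UseVolatile || ClobberVolatile) ? Reorder::IfNoAlias
                                               : Reorder::Always;
  if (UseSeqCst || ClobberAtLeastAcquire)
    return Reorder::Never;
  return R;
}

int main() {
  assert(loadReorder(false, false, false, false) == Reorder::Always);
  assert(loadReorder(true, false, false, false) == Reorder::IfNoAlias);
  assert(loadReorder(true, true, false, false) == Reorder::Never);
  assert(loadReorder(false, false, true, false) == Reorder::Never);
  return 0;
}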
+static bool instructionClobbersQuery(MemoryDef *MD,
+ const MemoryLocation &UseLoc,
+ const Instruction *UseInst,
+ AliasAnalysis &AA) {
+ Instruction *DefInst = MD->getMemoryInst();
+ assert(DefInst && "Defining instruction not actually an instruction");
+
+ if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
+ // These intrinsics will show up as affecting memory, but they are just
+ // markers.
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ case Intrinsic::invariant_start:
+ case Intrinsic::invariant_end:
+ case Intrinsic::assume:
+ return false;
+ default:
+ break;
+ }
+ }
+
+ ImmutableCallSite UseCS(UseInst);
+ if (UseCS) {
+ ModRefInfo I = AA.getModRefInfo(DefInst, UseCS);
+ return I != MRI_NoModRef;
+ }
+
+ if (auto *DefLoad = dyn_cast<LoadInst>(DefInst)) {
+ if (auto *UseLoad = dyn_cast<LoadInst>(UseInst)) {
+ switch (getLoadReorderability(UseLoad, DefLoad)) {
+ case Reorderability::Always:
+ return false;
+ case Reorderability::Never:
+ return true;
+ case Reorderability::IfNoAlias:
+ return !AA.isNoAlias(UseLoc, MemoryLocation::get(DefLoad));
+ }
+ }
+ }
+
+ return AA.getModRefInfo(DefInst, UseLoc) & MRI_Mod;
+}
+
+static bool instructionClobbersQuery(MemoryDef *MD, const MemoryUseOrDef *MU,
+ const MemoryLocOrCall &UseMLOC,
+ AliasAnalysis &AA) {
+ // FIXME: This is a temporary hack to allow a single instructionClobbersQuery
+ // to exist while MemoryLocOrCall is pushed through places.
+ if (UseMLOC.IsCall)
+ return instructionClobbersQuery(MD, MemoryLocation(), MU->getMemoryInst(),
+ AA);
+ return instructionClobbersQuery(MD, UseMLOC.getLoc(), MU->getMemoryInst(),
+ AA);
+}
+
+// Return true when MD may alias MU, return false otherwise.
+bool defClobbersUseOrDef(MemoryDef *MD, const MemoryUseOrDef *MU,
+ AliasAnalysis &AA) {
+ return instructionClobbersQuery(MD, MU, MemoryLocOrCall(MU), AA);
+}
+}
+
+namespace {
+struct UpwardsMemoryQuery {
+ // True if our original query started off as a call
+ bool IsCall;
+ // The pointer location we started the query with. This will be empty if
+ // IsCall is true.
+ MemoryLocation StartingLoc;
+ // This is the instruction we were querying about.
+ const Instruction *Inst;
+ // The MemoryAccess we actually got called with, used to test local domination
+ const MemoryAccess *OriginalAccess;
+
+ UpwardsMemoryQuery()
+ : IsCall(false), Inst(nullptr), OriginalAccess(nullptr) {}
+
+ UpwardsMemoryQuery(const Instruction *Inst, const MemoryAccess *Access)
+ : IsCall(ImmutableCallSite(Inst)), Inst(Inst), OriginalAccess(Access) {
+ if (!IsCall)
+ StartingLoc = MemoryLocation::get(Inst);
+ }
+};
+
+static bool lifetimeEndsAt(MemoryDef *MD, const MemoryLocation &Loc,
+ AliasAnalysis &AA) {
+ Instruction *Inst = MD->getMemoryInst();
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ return AA.isMustAlias(MemoryLocation(II->getArgOperand(1)), Loc);
+ default:
+ return false;
+ }
+ }
+ return false;
+}
+
+static bool isUseTriviallyOptimizableToLiveOnEntry(AliasAnalysis &AA,
+ const Instruction *I) {
+ // If the memory can't be changed, then loads of the memory can't be
+ // clobbered.
+ //
+ // FIXME: We should handle invariant groups, as well. It's a bit harder,
+ // because we need to pay close attention to invariant group barriers.
+ return isa<LoadInst>(I) && (I->getMetadata(LLVMContext::MD_invariant_load) ||
+ AA.pointsToConstantMemory(I));
+}
+
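At source level, the loads this helper accepts are reads of memory the program can never modify: either loads tagged with !invariant.load metadata or, as in the illustrative sketch below, loads from constant memory.

// The load of Table can never be clobbered, so its defining access may as
// well be liveOnEntry.
static const int Table[4] = {1, 2, 3, 5};
int lookupEntry(unsigned I) { return Table[I & 3]; }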
+/// Cache for our caching MemorySSA walker.
+class WalkerCache {
+ DenseMap<ConstMemoryAccessPair, MemoryAccess *> Accesses;
+ DenseMap<const MemoryAccess *, MemoryAccess *> Calls;
+
+public:
+ MemoryAccess *lookup(const MemoryAccess *MA, const MemoryLocation &Loc,
+ bool IsCall) const {
+ ++NumClobberCacheLookups;
+ MemoryAccess *R = IsCall ? Calls.lookup(MA) : Accesses.lookup({MA, Loc});
+ if (R)
+ ++NumClobberCacheHits;
+ return R;
+ }
+
+ bool insert(const MemoryAccess *MA, MemoryAccess *To,
+ const MemoryLocation &Loc, bool IsCall) {
+ // This is fine for Phis, since there are times where we can't optimize
+ // them. Making a def its own clobber is never correct, though.
+ assert((MA != To || isa<MemoryPhi>(MA)) &&
+ "Something can't clobber itself!");
+
+ ++NumClobberCacheInserts;
+ bool Inserted;
+ if (IsCall)
+ Inserted = Calls.insert({MA, To}).second;
+ else
+ Inserted = Accesses.insert({{MA, Loc}, To}).second;
+
+ return Inserted;
+ }
+
+ bool remove(const MemoryAccess *MA, const MemoryLocation &Loc, bool IsCall) {
+ return IsCall ? Calls.erase(MA) : Accesses.erase({MA, Loc});
+ }
+
+ void clear() {
+ Accesses.clear();
+ Calls.clear();
+ }
+
+ bool contains(const MemoryAccess *MA) const {
+ for (auto &P : Accesses)
+ if (P.first.first == MA || P.second == MA)
+ return true;
+ for (auto &P : Calls)
+ if (P.first == MA || P.second == MA)
+ return true;
+ return false;
+ }
+};
+
+/// Walks the defining uses of MemoryDefs. Stops after we hit something that has
+/// no defining use (e.g. a MemoryPhi or liveOnEntry). Note that, when comparing
+/// against a null def_chain_iterator, this will compare equal only after
+/// walking said Phi/liveOnEntry.
+struct def_chain_iterator
+ : public iterator_facade_base<def_chain_iterator, std::forward_iterator_tag,
+ MemoryAccess *> {
+ def_chain_iterator() : MA(nullptr) {}
+ def_chain_iterator(MemoryAccess *MA) : MA(MA) {}
+
+ MemoryAccess *operator*() const { return MA; }
+
+ def_chain_iterator &operator++() {
+ // N.B. liveOnEntry has a null defining access.
+ if (auto *MUD = dyn_cast<MemoryUseOrDef>(MA))
+ MA = MUD->getDefiningAccess();
+ else
+ MA = nullptr;
+ return *this;
+ }
+
+ bool operator==(const def_chain_iterator &O) const { return MA == O.MA; }
+
+private:
+ MemoryAccess *MA;
+};
+
+static iterator_range<def_chain_iterator>
+def_chain(MemoryAccess *MA, MemoryAccess *UpTo = nullptr) {
+#ifdef EXPENSIVE_CHECKS
+ assert((!UpTo || find(def_chain(MA), UpTo) != def_chain_iterator()) &&
+ "UpTo isn't in the def chain!");
+#endif
+ return make_range(def_chain_iterator(MA), def_chain_iterator(UpTo));
+}
+
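A small sketch, in the context of this file, of how a caller can walk def_chain; the range ends after yielding the terminating MemoryPhi or liveOnEntry, whose defining access is null. The helper name is illustrative.

// Count the MemoryDefs on the defining chain starting at (and including)
// Start, ignoring liveOnEntry.
static unsigned defsOnChain(const MemorySSA &MSSA, MemoryAccess *Start) {
  unsigned N = 0;
  for (MemoryAccess *MA : def_chain(Start))
    if (isa<MemoryDef>(MA) && !MSSA.isLiveOnEntryDef(MA))
      ++N;
  return N;
}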
+/// Verifies that `Start` is clobbered by `ClobberAt`, and that nothing
+/// in between `Start` and `ClobberAt` can clobber `Start`.
+///
+/// This is meant to be as simple and self-contained as possible. Because it
+/// uses no cache, etc., it can be relatively expensive.
+///
+/// \param Start The MemoryAccess that we want to walk from.
+/// \param ClobberAt A clobber for Start.
+/// \param StartLoc The MemoryLocation for Start.
+/// \param MSSA The MemorySSA instance that Start and ClobberAt belong to.
+/// \param Query The UpwardsMemoryQuery we used for our search.
+/// \param AA The AliasAnalysis we used for our search.
+static void LLVM_ATTRIBUTE_UNUSED
+checkClobberSanity(MemoryAccess *Start, MemoryAccess *ClobberAt,
+ const MemoryLocation &StartLoc, const MemorySSA &MSSA,
+ const UpwardsMemoryQuery &Query, AliasAnalysis &AA) {
+ assert(MSSA.dominates(ClobberAt, Start) && "Clobber doesn't dominate start?");
+
+ if (MSSA.isLiveOnEntryDef(Start)) {
+ assert(MSSA.isLiveOnEntryDef(ClobberAt) &&
+ "liveOnEntry must clobber itself");
+ return;
+ }
+
+ bool FoundClobber = false;
+ DenseSet<MemoryAccessPair> VisitedPhis;
+ SmallVector<MemoryAccessPair, 8> Worklist;
+ Worklist.emplace_back(Start, StartLoc);
+ // Walk all paths from Start to ClobberAt, while looking for clobbers. If one
+ // is found, complain.
+ while (!Worklist.empty()) {
+ MemoryAccessPair MAP = Worklist.pop_back_val();
+ // All we care about is that nothing from Start to ClobberAt clobbers Start.
+ // We learn nothing from revisiting nodes.
+ if (!VisitedPhis.insert(MAP).second)
+ continue;
+
+ for (MemoryAccess *MA : def_chain(MAP.first)) {
+ if (MA == ClobberAt) {
+ if (auto *MD = dyn_cast<MemoryDef>(MA)) {
+ // instructionClobbersQuery isn't essentially free, so don't use `|=`,
+ // since it won't let us short-circuit.
+ //
+ // Also, note that this can't be hoisted out of the `Worklist` loop,
+ // since MD may only act as a clobber for 1 of N MemoryLocations.
+ FoundClobber =
+ FoundClobber || MSSA.isLiveOnEntryDef(MD) ||
+ instructionClobbersQuery(MD, MAP.second, Query.Inst, AA);
+ }
+ break;
+ }
+
+ // We should never hit liveOnEntry, unless it's the clobber.
+ assert(!MSSA.isLiveOnEntryDef(MA) && "Hit liveOnEntry before clobber?");
+
+ if (auto *MD = dyn_cast<MemoryDef>(MA)) {
+ (void)MD;
+ assert(!instructionClobbersQuery(MD, MAP.second, Query.Inst, AA) &&
+ "Found clobber before reaching ClobberAt!");
+ continue;
+ }
+
+ assert(isa<MemoryPhi>(MA));
+ Worklist.append(upward_defs_begin({MA, MAP.second}), upward_defs_end());
+ }
+ }
+
+ // If ClobberAt is a MemoryPhi, we can assume something above it acted as a
+ // clobber. Otherwise, `ClobberAt` should've acted as a clobber at some point.
+ assert((isa<MemoryPhi>(ClobberAt) || FoundClobber) &&
+ "ClobberAt never acted as a clobber");
+}
+
+/// Our algorithm for walking (and trying to optimize) clobbers, all wrapped up
+/// in one class.
+class ClobberWalker {
+ /// Save a few bytes by using unsigned instead of size_t.
+ using ListIndex = unsigned;
+
+ /// Represents a span of contiguous MemoryDefs, potentially ending in a
+ /// MemoryPhi.
+ struct DefPath {
+ MemoryLocation Loc;
+ // Note that, because we always walk in reverse, Last will always dominate
+ // First. Also note that First and Last are inclusive.
+ MemoryAccess *First;
+ MemoryAccess *Last;
+ Optional<ListIndex> Previous;
+
+ DefPath(const MemoryLocation &Loc, MemoryAccess *First, MemoryAccess *Last,
+ Optional<ListIndex> Previous)
+ : Loc(Loc), First(First), Last(Last), Previous(Previous) {}
+
+ DefPath(const MemoryLocation &Loc, MemoryAccess *Init,
+ Optional<ListIndex> Previous)
+ : DefPath(Loc, Init, Init, Previous) {}
+ };
+
+ const MemorySSA &MSSA;
+ AliasAnalysis &AA;
+ DominatorTree &DT;
+ WalkerCache &WC;
+ UpwardsMemoryQuery *Query;
+ bool UseCache;
+
+ // Phi optimization bookkeeping
+ SmallVector<DefPath, 32> Paths;
+ DenseSet<ConstMemoryAccessPair> VisitedPhis;
+ DenseMap<const BasicBlock *, MemoryAccess *> WalkTargetCache;
+
+ void setUseCache(bool Use) { UseCache = Use; }
+ bool shouldIgnoreCache() const {
+ // UseCache will only be false when we're debugging, or when expensive
+ // checks are enabled. In either case, we don't care deeply about speed.
+ return LLVM_UNLIKELY(!UseCache);
+ }
+
+ void addCacheEntry(const MemoryAccess *What, MemoryAccess *To,
+ const MemoryLocation &Loc) const {
+// EXPENSIVE_CHECKS because most of these queries are redundant.
+#ifdef EXPENSIVE_CHECKS
+ assert(MSSA.dominates(To, What));
+#endif
+ if (shouldIgnoreCache())
+ return;
+ WC.insert(What, To, Loc, Query->IsCall);
+ }
+
+ MemoryAccess *lookupCache(const MemoryAccess *MA, const MemoryLocation &Loc) {
+ return shouldIgnoreCache() ? nullptr : WC.lookup(MA, Loc, Query->IsCall);
+ }
+
+ void cacheDefPath(const DefPath &DN, MemoryAccess *Target) const {
+ if (shouldIgnoreCache())
+ return;
+
+ for (MemoryAccess *MA : def_chain(DN.First, DN.Last))
+ addCacheEntry(MA, Target, DN.Loc);
+
+ // DefPaths only express the path we walked. So, DN.Last could either be a
+ // thing we want to cache, or not.
+ if (DN.Last != Target)
+ addCacheEntry(DN.Last, Target, DN.Loc);
+ }
+
+ /// Find the nearest def or phi that `From` can legally be optimized to.
+ ///
+ /// FIXME: Deduplicate this with MSSA::findDominatingDef. Ideally, MSSA should
+ /// keep track of this information for us, and allow us O(1) lookups of this
+ /// info.
+ MemoryAccess *getWalkTarget(const MemoryPhi *From) {
+ assert(From->getNumOperands() && "Phi with no operands?");
+
+ BasicBlock *BB = From->getBlock();
+ auto At = WalkTargetCache.find(BB);
+ if (At != WalkTargetCache.end())
+ return At->second;
+
+ SmallVector<const BasicBlock *, 8> ToCache;
+ ToCache.push_back(BB);
+
+ MemoryAccess *Result = MSSA.getLiveOnEntryDef();
+ DomTreeNode *Node = DT.getNode(BB);
+ while ((Node = Node->getIDom())) {
+ auto At = WalkTargetCache.find(BB);
+ if (At != WalkTargetCache.end()) {
+ Result = At->second;
+ break;
+ }
+
+ auto *Accesses = MSSA.getBlockAccesses(Node->getBlock());
+ if (Accesses) {
+ auto Iter = find_if(reverse(*Accesses), [](const MemoryAccess &MA) {
+ return !isa<MemoryUse>(MA);
+ });
+ if (Iter != Accesses->rend()) {
+ Result = const_cast<MemoryAccess *>(&*Iter);
+ break;
+ }
+ }
+
+ ToCache.push_back(Node->getBlock());
+ }
+
+ for (const BasicBlock *BB : ToCache)
+ WalkTargetCache.insert({BB, Result});
+ return Result;
+ }
+
+ /// Result of calling walkToPhiOrClobber.
+ struct UpwardsWalkResult {
+ /// The "Result" of the walk. Either a clobber, the last thing we walked, or
+ /// both.
+ MemoryAccess *Result;
+ bool IsKnownClobber;
+ bool FromCache;
+ };
+
+ /// Walk to the next Phi or Clobber in the def chain starting at Desc.Last.
+ /// This will update Desc.Last as it walks. It will (optionally) also stop at
+ /// StopAt.
+ ///
+ /// This does not test for whether StopAt is a clobber
+ UpwardsWalkResult walkToPhiOrClobber(DefPath &Desc,
+ MemoryAccess *StopAt = nullptr) {
+ assert(!isa<MemoryUse>(Desc.Last) && "Uses don't exist in my world");
+
+ for (MemoryAccess *Current : def_chain(Desc.Last)) {
+ Desc.Last = Current;
+ if (Current == StopAt)
+ return {Current, false, false};
+
+ if (auto *MD = dyn_cast<MemoryDef>(Current))
+ if (MSSA.isLiveOnEntryDef(MD) ||
+ instructionClobbersQuery(MD, Desc.Loc, Query->Inst, AA))
+ return {MD, true, false};
+
+ // Cache checks must be done last, because if Current is a clobber, the
+ // cache will contain the clobber for Current.
+ if (MemoryAccess *MA = lookupCache(Current, Desc.Loc))
+ return {MA, true, true};
+ }
+
+ assert(isa<MemoryPhi>(Desc.Last) &&
+ "Ended at a non-clobber that's not a phi?");
+ return {Desc.Last, false, false};
+ }
+
+ void addSearches(MemoryPhi *Phi, SmallVectorImpl<ListIndex> &PausedSearches,
+ ListIndex PriorNode) {
+ auto UpwardDefs = make_range(upward_defs_begin({Phi, Paths[PriorNode].Loc}),
+ upward_defs_end());
+ for (const MemoryAccessPair &P : UpwardDefs) {
+ PausedSearches.push_back(Paths.size());
+ Paths.emplace_back(P.second, P.first, PriorNode);
+ }
+ }
+
+ /// Represents a search that terminated after finding a clobber. This clobber
+ /// may or may not be present in the path of defs from LastNode..SearchStart,
+ /// since it may have been retrieved from cache.
+ struct TerminatedPath {
+ MemoryAccess *Clobber;
+ ListIndex LastNode;
+ };
+
+ /// Get an access that keeps us from optimizing to the given phi.
+ ///
+ /// PausedSearches is an array of indices into the Paths array. Its incoming
+ /// value is the indices of searches that stopped at the last phi optimization
+ /// target. It's left in an unspecified state.
+ ///
+ /// If this returns None, NewPaused is a vector of searches that terminated
+ /// at StopWhere. Otherwise, NewPaused is left in an unspecified state.
+ Optional<TerminatedPath>
+ getBlockingAccess(MemoryAccess *StopWhere,
+ SmallVectorImpl<ListIndex> &PausedSearches,
+ SmallVectorImpl<ListIndex> &NewPaused,
+ SmallVectorImpl<TerminatedPath> &Terminated) {
+ assert(!PausedSearches.empty() && "No searches to continue?");
+
+ // BFS vs DFS really doesn't make a difference here, so just do a DFS with
+ // PausedSearches as our stack.
+ while (!PausedSearches.empty()) {
+ ListIndex PathIndex = PausedSearches.pop_back_val();
+ DefPath &Node = Paths[PathIndex];
+
+ // If we've already visited this path with this MemoryLocation, we don't
+ // need to do so again.
+ //
+ // NOTE: That we just drop these paths on the ground makes caching
+ // behavior sporadic. e.g. given a diamond:
+ // A
+ // B C
+ // D
+ //
+ // ...If we walk D, B, A, C, we'll only cache the result of phi
+ // optimization for A, B, and D; C will be skipped because it dies here.
+ // This arguably isn't the worst thing ever, since:
+ // - We generally query things in a top-down order, so if we got below D
+ // without needing cache entries for {C, MemLoc}, then chances are
+ // that those cache entries would end up ultimately unused.
+ // - We still cache things for A, so C only needs to walk up a bit.
+ // If this behavior becomes problematic, we can fix without a ton of extra
+ // work.
+ if (!VisitedPhis.insert({Node.Last, Node.Loc}).second)
+ continue;
+
+ UpwardsWalkResult Res = walkToPhiOrClobber(Node, /*StopAt=*/StopWhere);
+ if (Res.IsKnownClobber) {
+ assert(Res.Result != StopWhere || Res.FromCache);
+ // If this wasn't a cache hit, we hit a clobber when walking. That's a
+ // failure.
+ TerminatedPath Term{Res.Result, PathIndex};
+ if (!Res.FromCache || !MSSA.dominates(Res.Result, StopWhere))
+ return Term;
+
+ // Otherwise, it's a valid thing to potentially optimize to.
+ Terminated.push_back(Term);
+ continue;
+ }
+
+ if (Res.Result == StopWhere) {
+ // We've hit our target. Save this path off for if we want to continue
+ // walking.
+ NewPaused.push_back(PathIndex);
+ continue;
+ }
+
+ assert(!MSSA.isLiveOnEntryDef(Res.Result) && "liveOnEntry is a clobber");
+ addSearches(cast<MemoryPhi>(Res.Result), PausedSearches, PathIndex);
+ }
+
+ return None;
+ }
+
+ template <typename T, typename Walker>
+ struct generic_def_path_iterator
+ : public iterator_facade_base<generic_def_path_iterator<T, Walker>,
+ std::forward_iterator_tag, T *> {
+ generic_def_path_iterator() : W(nullptr), N(None) {}
+ generic_def_path_iterator(Walker *W, ListIndex N) : W(W), N(N) {}
+
+ T &operator*() const { return curNode(); }
+
+ generic_def_path_iterator &operator++() {
+ N = curNode().Previous;
+ return *this;
+ }
+
+ bool operator==(const generic_def_path_iterator &O) const {
+ if (N.hasValue() != O.N.hasValue())
+ return false;
+ return !N.hasValue() || *N == *O.N;
+ }
+
+ private:
+ T &curNode() const { return W->Paths[*N]; }
+
+ Walker *W;
+ Optional<ListIndex> N;
+ };
+
+ using def_path_iterator = generic_def_path_iterator<DefPath, ClobberWalker>;
+ using const_def_path_iterator =
+ generic_def_path_iterator<const DefPath, const ClobberWalker>;
+
+ iterator_range<def_path_iterator> def_path(ListIndex From) {
+ return make_range(def_path_iterator(this, From), def_path_iterator());
+ }
+
+ iterator_range<const_def_path_iterator> const_def_path(ListIndex From) const {
+ return make_range(const_def_path_iterator(this, From),
+ const_def_path_iterator());
+ }
+
+ struct OptznResult {
+ /// The path that contains our result.
+ TerminatedPath PrimaryClobber;
+ /// The paths that we can legally cache back from, but that aren't
+ /// necessarily the result of the Phi optimization.
+ SmallVector<TerminatedPath, 4> OtherClobbers;
+ };
+
+ ListIndex defPathIndex(const DefPath &N) const {
+ // The assert looks nicer if we don't need to do &N
+ const DefPath *NP = &N;
+ assert(!Paths.empty() && NP >= &Paths.front() && NP <= &Paths.back() &&
+ "Out of bounds DefPath!");
+ return NP - &Paths.front();
+ }
+
+ /// Try to optimize a phi as best as we can. Returns a SmallVector of Paths
+ /// that act as legal clobbers. Note that this won't return *all* clobbers.
+ ///
+ /// Phi optimization algorithm tl;dr:
+ /// - Find the earliest def/phi, A, we can optimize to
+ /// - Find if all paths from the starting memory access ultimately reach A
+ /// - If not, optimization isn't possible.
+ /// - Otherwise, walk from A to another clobber or phi, A'.
+ /// - If A' is a def, we're done.
+ /// - If A' is a phi, try to optimize it.
+ ///
+ /// A path is a series of {MemoryAccess, MemoryLocation} pairs. A path
+ /// terminates when a MemoryAccess that clobbers said MemoryLocation is found.
+ OptznResult tryOptimizePhi(MemoryPhi *Phi, MemoryAccess *Start,
+ const MemoryLocation &Loc) {
+ assert(Paths.empty() && VisitedPhis.empty() &&
+ "Reset the optimization state.");
+
+ Paths.emplace_back(Loc, Start, Phi, None);
+ // Stores how many "valid" optimization nodes we had prior to calling
+ // addSearches/getBlockingAccess. Necessary for caching if we had a blocker.
+ auto PriorPathsSize = Paths.size();
+
+ SmallVector<ListIndex, 16> PausedSearches;
+ SmallVector<ListIndex, 8> NewPaused;
+ SmallVector<TerminatedPath, 4> TerminatedPaths;
+
+ addSearches(Phi, PausedSearches, 0);
+
+ // Moves the TerminatedPath with the "most dominated" Clobber to the end of
+ // Paths.
+ auto MoveDominatedPathToEnd = [&](SmallVectorImpl<TerminatedPath> &Paths) {
+ assert(!Paths.empty() && "Need a path to move");
+ auto Dom = Paths.begin();
+ for (auto I = std::next(Dom), E = Paths.end(); I != E; ++I)
+ if (!MSSA.dominates(I->Clobber, Dom->Clobber))
+ Dom = I;
+ auto Last = Paths.end() - 1;
+ if (Last != Dom)
+ std::iter_swap(Last, Dom);
+ };
+
+ MemoryPhi *Current = Phi;
+ while (1) {
+ assert(!MSSA.isLiveOnEntryDef(Current) &&
+ "liveOnEntry wasn't treated as a clobber?");
+
+ MemoryAccess *Target = getWalkTarget(Current);
+ // If a TerminatedPath doesn't dominate Target, then it wasn't a legal
+ // optimization for the prior phi.
+ assert(all_of(TerminatedPaths, [&](const TerminatedPath &P) {
+ return MSSA.dominates(P.Clobber, Target);
+ }));
+
+ // FIXME: This is broken, because the Blocker may be reported to be
+ // liveOnEntry, and we'll happily wait for that to disappear (read: never)
+ // For the moment, this is fine, since we do nothing with blocker info.
+ if (Optional<TerminatedPath> Blocker = getBlockingAccess(
+ Target, PausedSearches, NewPaused, TerminatedPaths)) {
+ // Cache our work on the blocking node, since we know that's correct.
+ cacheDefPath(Paths[Blocker->LastNode], Blocker->Clobber);
+
+ // Find the node we started at. We can't search based on N->Last, since
+ // we may have gone around a loop with a different MemoryLocation.
+ auto Iter = find_if(def_path(Blocker->LastNode), [&](const DefPath &N) {
+ return defPathIndex(N) < PriorPathsSize;
+ });
+ assert(Iter != def_path_iterator());
+
+ DefPath &CurNode = *Iter;
+ assert(CurNode.Last == Current);
+
+ // Two things:
+ // A. We can't reliably cache all of NewPaused back. Consider a case
+ // where we have two paths in NewPaused; one of which can't optimize
+ // above this phi, whereas the other can. If we cache the second path
+ // back, we'll end up with suboptimal cache entries. We can handle
+ // cases like this a bit better when we either try to find all
+ // clobbers that block phi optimization, or when our cache starts
+ // supporting unfinished searches.
+ // B. We can't reliably cache TerminatedPaths back here without doing
+ // extra checks; consider a case like:
+ // T
+ // / \
+ // D C
+ // \ /
+ // S
+ // Where T is our target, C is a node with a clobber on it, D is a
+ // diamond (with a clobber *only* on the left or right node, N), and
+ // S is our start. Say we walk to D, through the node opposite N
+ // (read: ignoring the clobber), and see a cache entry in the top
+ // node of D. That cache entry gets put into TerminatedPaths. We then
+ // walk up to C (N is later in our worklist), find the clobber, and
+ // quit. If we append TerminatedPaths to OtherClobbers, we'll cache
+ // the bottom part of D to the cached clobber, ignoring the clobber
+ // in N. Again, this problem goes away if we start tracking all
+ // blockers for a given phi optimization.
+ TerminatedPath Result{CurNode.Last, defPathIndex(CurNode)};
+ return {Result, {}};
+ }
+
+ // If there's nothing left to search, then all paths led to valid clobbers
+ // that we got from our cache; pick the nearest to the start, and allow
+ // the rest to be cached back.
+ if (NewPaused.empty()) {
+ MoveDominatedPathToEnd(TerminatedPaths);
+ TerminatedPath Result = TerminatedPaths.pop_back_val();
+ return {Result, std::move(TerminatedPaths)};
+ }
+
+ MemoryAccess *DefChainEnd = nullptr;
+ SmallVector<TerminatedPath, 4> Clobbers;
+ for (ListIndex Paused : NewPaused) {
+ UpwardsWalkResult WR = walkToPhiOrClobber(Paths[Paused]);
+ if (WR.IsKnownClobber)
+ Clobbers.push_back({WR.Result, Paused});
+ else
+ // Micro-opt: If we hit the end of the chain, save it.
+ DefChainEnd = WR.Result;
+ }
+
+ if (!TerminatedPaths.empty()) {
+ // If we couldn't find the dominating phi/liveOnEntry in the above loop,
+ // do it now.
+ if (!DefChainEnd)
+ for (MemoryAccess *MA : def_chain(Target))
+ DefChainEnd = MA;
+
+ // If any of the terminated paths don't dominate the phi we'll try to
+ // optimize, we need to figure out what they are and quit.
+ const BasicBlock *ChainBB = DefChainEnd->getBlock();
+ for (const TerminatedPath &TP : TerminatedPaths) {
+ // Because we know that DefChainEnd is as "high" as we can go, we
+ // don't need local dominance checks; BB dominance is sufficient.
+ if (DT.dominates(ChainBB, TP.Clobber->getBlock()))
+ Clobbers.push_back(TP);
+ }
+ }
+
+ // If we have clobbers in the def chain, find the one closest to Current
+ // and quit.
+ if (!Clobbers.empty()) {
+ MoveDominatedPathToEnd(Clobbers);
+ TerminatedPath Result = Clobbers.pop_back_val();
+ return {Result, std::move(Clobbers)};
+ }
+
+ assert(all_of(NewPaused,
+ [&](ListIndex I) { return Paths[I].Last == DefChainEnd; }));
+
+ // Because liveOnEntry is a clobber, this must be a phi.
+ auto *DefChainPhi = cast<MemoryPhi>(DefChainEnd);
+
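+    // No clobber was found along this def chain, so resume the paused
+    // searches from the phi at its top and keep walking upwards.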
+ PriorPathsSize = Paths.size();
+ PausedSearches.clear();
+ for (ListIndex I : NewPaused)
+ addSearches(DefChainPhi, PausedSearches, I);
+ NewPaused.clear();
+
+ Current = DefChainPhi;
+ }
+ }
+
+ /// Caches everything in an OptznResult.
+ void cacheOptResult(const OptznResult &R) {
+ if (R.OtherClobbers.empty()) {
+ // If we're not going to be caching OtherClobbers, don't bother with
+ // marking visited/etc.
+ for (const DefPath &N : const_def_path(R.PrimaryClobber.LastNode))
+ cacheDefPath(N, R.PrimaryClobber.Clobber);
+ return;
+ }
+
+ // PrimaryClobber is our answer. If we can cache anything back, we need to
+ // stop caching when we visit PrimaryClobber.
+ SmallBitVector Visited(Paths.size());
+ for (const DefPath &N : const_def_path(R.PrimaryClobber.LastNode)) {
+ Visited[defPathIndex(N)] = true;
+ cacheDefPath(N, R.PrimaryClobber.Clobber);
+ }
+
+ for (const TerminatedPath &P : R.OtherClobbers) {
+ for (const DefPath &N : const_def_path(P.LastNode)) {
+ ListIndex NIndex = defPathIndex(N);
+ if (Visited[NIndex])
+ break;
+ Visited[NIndex] = true;
+ cacheDefPath(N, P.Clobber);
+ }
+ }
+ }
+
+ void verifyOptResult(const OptznResult &R) const {
+ assert(all_of(R.OtherClobbers, [&](const TerminatedPath &P) {
+ return MSSA.dominates(P.Clobber, R.PrimaryClobber.Clobber);
+ }));
+ }
+
+ void resetPhiOptznState() {
+ Paths.clear();
+ VisitedPhis.clear();
+ }
+
+public:
+ ClobberWalker(const MemorySSA &MSSA, AliasAnalysis &AA, DominatorTree &DT,
+ WalkerCache &WC)
+ : MSSA(MSSA), AA(AA), DT(DT), WC(WC), UseCache(true) {}
+
+ void reset() { WalkTargetCache.clear(); }
+
+ /// Finds the nearest clobber for the given query, optimizing phis if
+ /// possible.
+ MemoryAccess *findClobber(MemoryAccess *Start, UpwardsMemoryQuery &Q,
+ bool UseWalkerCache = true) {
+ setUseCache(UseWalkerCache);
+ Query = &Q;
+
+ MemoryAccess *Current = Start;
+ // This walker pretends uses don't exist. If we're handed one, silently grab
+ // its def. (This has the nice side-effect of ensuring we never cache uses)
+ if (auto *MU = dyn_cast<MemoryUse>(Start))
+ Current = MU->getDefiningAccess();
+
+ DefPath FirstDesc(Q.StartingLoc, Current, Current, None);
+ // Fast path for the overly-common case (no crazy phi optimization
+ // necessary)
+ UpwardsWalkResult WalkResult = walkToPhiOrClobber(FirstDesc);
+ MemoryAccess *Result;
+ if (WalkResult.IsKnownClobber) {
+ cacheDefPath(FirstDesc, WalkResult.Result);
+ Result = WalkResult.Result;
+ } else {
+ OptznResult OptRes = tryOptimizePhi(cast<MemoryPhi>(FirstDesc.Last),
+ Current, Q.StartingLoc);
+ verifyOptResult(OptRes);
+ cacheOptResult(OptRes);
+ resetPhiOptznState();
+ Result = OptRes.PrimaryClobber.Clobber;
+ }
+
+#ifdef EXPENSIVE_CHECKS
+ checkClobberSanity(Current, Result, Q.StartingLoc, MSSA, Q, AA);
+#endif
+ return Result;
+ }
+
+ void verify(const MemorySSA *MSSA) { assert(MSSA == &this->MSSA); }
+};
+
+struct RenamePassData {
+ DomTreeNode *DTN;
+ DomTreeNode::const_iterator ChildIt;
+ MemoryAccess *IncomingVal;
+
+ RenamePassData(DomTreeNode *D, DomTreeNode::const_iterator It,
+ MemoryAccess *M)
+ : DTN(D), ChildIt(It), IncomingVal(M) {}
+ void swap(RenamePassData &RHS) {
+ std::swap(DTN, RHS.DTN);
+ std::swap(ChildIt, RHS.ChildIt);
+ std::swap(IncomingVal, RHS.IncomingVal);
+ }
+};
+} // anonymous namespace
+namespace llvm {
/// \brief A MemorySSAWalker that does AA walks and caching of lookups to
/// disambiguate accesses.
///
@@ -121,59 +1083,39 @@ public:
/// ret i32 %r
/// }
class MemorySSA::CachingWalker final : public MemorySSAWalker {
+ WalkerCache Cache;
+ ClobberWalker Walker;
+ bool AutoResetWalker;
+
+ MemoryAccess *getClobberingMemoryAccess(MemoryAccess *, UpwardsMemoryQuery &);
+ void verifyRemoved(MemoryAccess *);
+
public:
CachingWalker(MemorySSA *, AliasAnalysis *, DominatorTree *);
~CachingWalker() override;
- MemoryAccess *getClobberingMemoryAccess(const Instruction *) override;
+ using MemorySSAWalker::getClobberingMemoryAccess;
+ MemoryAccess *getClobberingMemoryAccess(MemoryAccess *) override;
MemoryAccess *getClobberingMemoryAccess(MemoryAccess *,
- MemoryLocation &) override;
+ const MemoryLocation &) override;
void invalidateInfo(MemoryAccess *) override;
-protected:
- struct UpwardsMemoryQuery;
- MemoryAccess *doCacheLookup(const MemoryAccess *, const UpwardsMemoryQuery &,
- const MemoryLocation &);
-
- void doCacheInsert(const MemoryAccess *, MemoryAccess *,
- const UpwardsMemoryQuery &, const MemoryLocation &);
+ /// Whether we call resetClobberWalker() after each time we *actually* walk to
+ /// answer a clobber query.
+ void setAutoResetWalker(bool AutoReset) { AutoResetWalker = AutoReset; }
- void doCacheRemove(const MemoryAccess *, const UpwardsMemoryQuery &,
- const MemoryLocation &);
+ /// Drop the walker's persistent data structures. At the moment, this means
+ /// "drop the walker's cache of BasicBlocks ->
+ /// earliest-MemoryAccess-we-can-optimize-to". This is necessary if we're
+ /// going to have DT updates, if we remove MemoryAccesses, etc.
+ void resetClobberWalker() { Walker.reset(); }
-private:
- MemoryAccessPair UpwardsDFSWalk(MemoryAccess *, const MemoryLocation &,
- UpwardsMemoryQuery &, bool);
- MemoryAccess *getClobberingMemoryAccess(MemoryAccess *, UpwardsMemoryQuery &);
- bool instructionClobbersQuery(const MemoryDef *, UpwardsMemoryQuery &,
- const MemoryLocation &Loc) const;
- void verifyRemoved(MemoryAccess *);
- SmallDenseMap<ConstMemoryAccessPair, MemoryAccess *>
- CachedUpwardsClobberingAccess;
- DenseMap<const MemoryAccess *, MemoryAccess *> CachedUpwardsClobberingCall;
- AliasAnalysis *AA;
- DominatorTree *DT;
-};
-}
-
-namespace {
-struct RenamePassData {
- DomTreeNode *DTN;
- DomTreeNode::const_iterator ChildIt;
- MemoryAccess *IncomingVal;
-
- RenamePassData(DomTreeNode *D, DomTreeNode::const_iterator It,
- MemoryAccess *M)
- : DTN(D), ChildIt(It), IncomingVal(M) {}
- void swap(RenamePassData &RHS) {
- std::swap(DTN, RHS.DTN);
- std::swap(ChildIt, RHS.ChildIt);
- std::swap(IncomingVal, RHS.IncomingVal);
+ void verify(const MemorySSA *MSSA) override {
+ MemorySSAWalker::verify(MSSA);
+ Walker.verify(MSSA);
}
};
-}
-namespace llvm {
/// \brief Rename a single basic block into MemorySSA form.
/// Uses the standard SSA renaming algorithm.
/// \returns The new incoming value.
@@ -184,21 +1126,13 @@ MemoryAccess *MemorySSA::renameBlock(BasicBlock *BB,
if (It != PerBlockAccesses.end()) {
AccessList *Accesses = It->second.get();
for (MemoryAccess &L : *Accesses) {
- switch (L.getValueID()) {
- case Value::MemoryUseVal:
- cast<MemoryUse>(&L)->setDefiningAccess(IncomingVal);
- break;
- case Value::MemoryDefVal:
- // We can't legally optimize defs, because we only allow single
- // memory phis/uses on operations, and if we optimize these, we can
- // end up with multiple reaching defs. Uses do not have this
- // problem, since they do not produce a value
- cast<MemoryDef>(&L)->setDefiningAccess(IncomingVal);
+ if (MemoryUseOrDef *MUD = dyn_cast<MemoryUseOrDef>(&L)) {
+ if (MUD->getDefiningAccess() == nullptr)
+ MUD->setDefiningAccess(IncomingVal);
+ if (isa<MemoryDef>(&L))
+ IncomingVal = &L;
+ } else {
IncomingVal = &L;
- break;
- case Value::MemoryPhiVal:
- IncomingVal = &L;
- break;
}
}
}
@@ -295,21 +1229,10 @@ void MemorySSA::markUnreachableAsLiveOnEntry(BasicBlock *BB) {
MemorySSA::MemorySSA(Function &Func, AliasAnalysis *AA, DominatorTree *DT)
: AA(AA), DT(DT), F(Func), LiveOnEntryDef(nullptr), Walker(nullptr),
- NextID(0) {
+ NextID(INVALID_MEMORYACCESS_ID) {
buildMemorySSA();
}
-MemorySSA::MemorySSA(MemorySSA &&MSSA)
- : AA(MSSA.AA), DT(MSSA.DT), F(MSSA.F),
- ValueToMemoryAccess(std::move(MSSA.ValueToMemoryAccess)),
- PerBlockAccesses(std::move(MSSA.PerBlockAccesses)),
- LiveOnEntryDef(std::move(MSSA.LiveOnEntryDef)),
- Walker(std::move(MSSA.Walker)), NextID(MSSA.NextID) {
- // Update the Walker MSSA pointer so it doesn't point to the moved-from MSSA
- // object any more.
- Walker->MSSA = this;
-}
-
MemorySSA::~MemorySSA() {
// Drop all our references
for (const auto &Pair : PerBlockAccesses)
@@ -325,6 +1248,245 @@ MemorySSA::AccessList *MemorySSA::getOrCreateAccessList(const BasicBlock *BB) {
return Res.first->second.get();
}
+/// This class is a batch walker of all MemoryUse's in the program, and points
+/// their defining access at the thing that actually clobbers them. Because it
+/// is a batch walker that touches everything, it does not operate like the
+/// other walkers. This walker is basically performing a top-down SSA renaming
+/// pass, where the version stack is used as the cache. This enables it to be
+/// significantly more time and memory efficient than using the regular walker,
+/// which is walking bottom-up.
+class MemorySSA::OptimizeUses {
+public:
+ OptimizeUses(MemorySSA *MSSA, MemorySSAWalker *Walker, AliasAnalysis *AA,
+ DominatorTree *DT)
+ : MSSA(MSSA), Walker(Walker), AA(AA), DT(DT) {
+ Walker = MSSA->getWalker();
+ }
+
+ void optimizeUses();
+
+private:
+  /// This represents where a given MemoryLocation is in the stack.
+ struct MemlocStackInfo {
+ // This essentially is keeping track of versions of the stack. Whenever
+ // the stack changes due to pushes or pops, these versions increase.
+ unsigned long StackEpoch;
+ unsigned long PopEpoch;
+ // This is the lower bound of places on the stack to check. It is equal to
+ // the place the last stack walk ended.
+    // Note: Correctness depends on this being initialized to 0, which DenseMap
+    // does.
+ unsigned long LowerBound;
+ const BasicBlock *LowerBoundBlock;
+ // This is where the last walk for this memory location ended.
+ unsigned long LastKill;
+ bool LastKillValid;
+ };
+ void optimizeUsesInBlock(const BasicBlock *, unsigned long &, unsigned long &,
+ SmallVectorImpl<MemoryAccess *> &,
+ DenseMap<MemoryLocOrCall, MemlocStackInfo> &);
+ MemorySSA *MSSA;
+ MemorySSAWalker *Walker;
+ AliasAnalysis *AA;
+ DominatorTree *DT;
+};
+
+/// Optimize the uses in a given block. This is basically the SSA renaming
+/// algorithm, with one caveat: We are able to use a single stack for all
+/// MemoryUses. This is because the set of *possible* reaching MemoryDefs is
+/// the same for every MemoryUse. The *actual* clobbering MemoryDef is just
+/// going to be some position in that stack of possible ones.
+///
+/// We track, for each MemoryLocation, the stack positions it still needs to
+/// check and where its last walk ended, because we only want to check the
+/// things that changed since last time. The same MemoryLocation should
+/// get clobbered by the same store (getModRefInfo does not use invariantness or
+/// things like this; if it ever starts to, we can modify MemoryLocOrCall to
+/// include the relevant data).
+void MemorySSA::OptimizeUses::optimizeUsesInBlock(
+ const BasicBlock *BB, unsigned long &StackEpoch, unsigned long &PopEpoch,
+ SmallVectorImpl<MemoryAccess *> &VersionStack,
+ DenseMap<MemoryLocOrCall, MemlocStackInfo> &LocStackInfo) {
+
+  // If no accesses, nothing to do.
+ MemorySSA::AccessList *Accesses = MSSA->getWritableBlockAccesses(BB);
+ if (Accesses == nullptr)
+ return;
+
+  // Pop everything that doesn't dominate the current block off the stack, and
+  // increment the PopEpoch to account for this.
+ while (!VersionStack.empty()) {
+ BasicBlock *BackBlock = VersionStack.back()->getBlock();
+ if (DT->dominates(BackBlock, BB))
+ break;
+ while (VersionStack.back()->getBlock() == BackBlock)
+ VersionStack.pop_back();
+ ++PopEpoch;
+ }
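+  // Walk this block's accesses in order: defs and phis are pushed onto the
+  // version stack, and each use is optimized against it.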
+ for (MemoryAccess &MA : *Accesses) {
+ auto *MU = dyn_cast<MemoryUse>(&MA);
+ if (!MU) {
+ VersionStack.push_back(&MA);
+ ++StackEpoch;
+ continue;
+ }
+
+ if (isUseTriviallyOptimizableToLiveOnEntry(*AA, MU->getMemoryInst())) {
+ MU->setDefiningAccess(MSSA->getLiveOnEntryDef(), true);
+ continue;
+ }
+
+ MemoryLocOrCall UseMLOC(MU);
+ auto &LocInfo = LocStackInfo[UseMLOC];
+ // If the pop epoch changed, it means we've removed stuff from top of
+ // stack due to changing blocks. We may have to reset the lower bound or
+ // last kill info.
+ if (LocInfo.PopEpoch != PopEpoch) {
+ LocInfo.PopEpoch = PopEpoch;
+ LocInfo.StackEpoch = StackEpoch;
+ // If the lower bound was in something that no longer dominates us, we
+ // have to reset it.
+ // We can't simply track stack size, because the stack may have had
+ // pushes/pops in the meantime.
+      // XXX: This is non-optimal, but is only slower in cases with heavily
+      // branching dominator trees. To get the optimal number of queries, we
+      // would make LowerBound and LastKill a per-loc stack, and pop it until
+      // the top of that stack dominates us. This does not seem worth it ATM.
+ // A much cheaper optimization would be to always explore the deepest
+ // branch of the dominator tree first. This will guarantee this resets on
+ // the smallest set of blocks.
+ if (LocInfo.LowerBoundBlock && LocInfo.LowerBoundBlock != BB &&
+ !DT->dominates(LocInfo.LowerBoundBlock, BB)) {
+ // Reset the lower bound of things to check.
+ // TODO: Some day we should be able to reset to last kill, rather than
+ // 0.
+ LocInfo.LowerBound = 0;
+ LocInfo.LowerBoundBlock = VersionStack[0]->getBlock();
+ LocInfo.LastKillValid = false;
+ }
+ } else if (LocInfo.StackEpoch != StackEpoch) {
+ // If all that has changed is the StackEpoch, we only have to check the
+ // new things on the stack, because we've checked everything before. In
+ // this case, the lower bound of things to check remains the same.
+ LocInfo.PopEpoch = PopEpoch;
+ LocInfo.StackEpoch = StackEpoch;
+ }
+ if (!LocInfo.LastKillValid) {
+ LocInfo.LastKill = VersionStack.size() - 1;
+ LocInfo.LastKillValid = true;
+ }
+
+ // At this point, we should have corrected last kill and LowerBound to be
+ // in bounds.
+ assert(LocInfo.LowerBound < VersionStack.size() &&
+ "Lower bound out of range");
+ assert(LocInfo.LastKill < VersionStack.size() &&
+ "Last kill info out of range");
+ // In any case, the new upper bound is the top of the stack.
+ unsigned long UpperBound = VersionStack.size() - 1;
+
+ if (UpperBound - LocInfo.LowerBound > MaxCheckLimit) {
+ DEBUG(dbgs() << "MemorySSA skipping optimization of " << *MU << " ("
+ << *(MU->getMemoryInst()) << ")"
+ << " because there are " << UpperBound - LocInfo.LowerBound
+ << " stores to disambiguate\n");
+ // Because we did not walk, LastKill is no longer valid, as this may
+ // have been a kill.
+ LocInfo.LastKillValid = false;
+ continue;
+ }
+ bool FoundClobberResult = false;
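+    // Scan the version stack downwards from the top towards LowerBound,
+    // looking for the nearest access that clobbers this use.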
+ while (UpperBound > LocInfo.LowerBound) {
+ if (isa<MemoryPhi>(VersionStack[UpperBound])) {
+ // For phis, use the walker, see where we ended up, go there
+ Instruction *UseInst = MU->getMemoryInst();
+ MemoryAccess *Result = Walker->getClobberingMemoryAccess(UseInst);
+ // We are guaranteed to find it or something is wrong
+ while (VersionStack[UpperBound] != Result) {
+ assert(UpperBound != 0);
+ --UpperBound;
+ }
+ FoundClobberResult = true;
+ break;
+ }
+
+ MemoryDef *MD = cast<MemoryDef>(VersionStack[UpperBound]);
+ // If the lifetime of the pointer ends at this instruction, it's live on
+ // entry.
+ if (!UseMLOC.IsCall && lifetimeEndsAt(MD, UseMLOC.getLoc(), *AA)) {
+ // Reset UpperBound to liveOnEntryDef's place in the stack
+ UpperBound = 0;
+ FoundClobberResult = true;
+ break;
+ }
+ if (instructionClobbersQuery(MD, MU, UseMLOC, *AA)) {
+ FoundClobberResult = true;
+ break;
+ }
+ --UpperBound;
+ }
+    // At the end of this loop, UpperBound is either a clobber or the lower
+    // bound. PHI walking may cause it to be < LowerBound, and in fact, < LastKill.
+ if (FoundClobberResult || UpperBound < LocInfo.LastKill) {
+ MU->setDefiningAccess(VersionStack[UpperBound], true);
+      // Record where this walk ended as the new last kill.
+ LocInfo.LastKill = UpperBound;
+ } else {
+ // Otherwise, we checked all the new ones, and now we know we can get to
+ // LastKill.
+ MU->setDefiningAccess(VersionStack[LocInfo.LastKill], true);
+ }
+ LocInfo.LowerBound = VersionStack.size() - 1;
+ LocInfo.LowerBoundBlock = BB;
+ }
+}
+
+/// Optimize uses to point to their actual clobbering definitions.
+void MemorySSA::OptimizeUses::optimizeUses() {
+
+ // We perform a non-recursive top-down dominator tree walk
+ struct StackInfo {
+ const DomTreeNode *Node;
+ DomTreeNode::const_iterator Iter;
+ };
+
+ SmallVector<MemoryAccess *, 16> VersionStack;
+ SmallVector<StackInfo, 16> DomTreeWorklist;
+ DenseMap<MemoryLocOrCall, MemlocStackInfo> LocStackInfo;
+ VersionStack.push_back(MSSA->getLiveOnEntryDef());
+
+ unsigned long StackEpoch = 1;
+ unsigned long PopEpoch = 1;
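+  // Visit blocks in dominator tree pre-order so the version stack only ever
+  // contains accesses that dominate the block currently being processed.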
+ for (const auto *DomNode : depth_first(DT->getRootNode()))
+ optimizeUsesInBlock(DomNode->getBlock(), StackEpoch, PopEpoch, VersionStack,
+ LocStackInfo);
+}
+
+void MemorySSA::placePHINodes(
+ const SmallPtrSetImpl<BasicBlock *> &DefiningBlocks,
+ const DenseMap<const BasicBlock *, unsigned int> &BBNumbers) {
+  // Determine where our MemoryPhis should go.
+ ForwardIDFCalculator IDFs(*DT);
+ IDFs.setDefiningBlocks(DefiningBlocks);
+ SmallVector<BasicBlock *, 32> IDFBlocks;
+ IDFs.calculate(IDFBlocks);
+
+ std::sort(IDFBlocks.begin(), IDFBlocks.end(),
+ [&BBNumbers](const BasicBlock *A, const BasicBlock *B) {
+ return BBNumbers.lookup(A) < BBNumbers.lookup(B);
+ });
+
+ // Now place MemoryPhi nodes.
+ for (auto &BB : IDFBlocks) {
+ // Insert phi node
+ AccessList *Accesses = getOrCreateAccessList(BB);
+ MemoryPhi *Phi = new MemoryPhi(BB->getContext(), BB, NextID++);
+ ValueToMemoryAccess[BB] = Phi;
+    // Phis are always placed at the front of the block.
+ Accesses->push_front(Phi);
+ }
+}
+
void MemorySSA::buildMemorySSA() {
// We create an access to represent "live on entry", for things like
// arguments or users of globals, where the memory they use is defined before
@@ -335,6 +1497,8 @@ void MemorySSA::buildMemorySSA() {
BasicBlock &StartingPoint = F.getEntryBlock();
LiveOnEntryDef = make_unique<MemoryDef>(F.getContext(), nullptr, nullptr,
&StartingPoint, NextID++);
+ DenseMap<const BasicBlock *, unsigned int> BBNumbers;
+ unsigned NextBBNum = 0;
// We maintain lists of memory accesses per-block, trading memory for time. We
// could just look up the memory access for every possible instruction in the
@@ -344,6 +1508,7 @@ void MemorySSA::buildMemorySSA() {
// Go through each block, figure out where defs occur, and chain together all
// the accesses.
for (BasicBlock &B : F) {
+ BBNumbers[&B] = NextBBNum++;
bool InsertIntoDef = false;
AccessList *Accesses = nullptr;
for (Instruction &I : B) {
@@ -361,81 +1526,20 @@ void MemorySSA::buildMemorySSA() {
if (Accesses)
DefUseBlocks.insert(&B);
}
-
- // Compute live-in.
- // Live in is normally defined as "all the blocks on the path from each def to
- // each of it's uses".
- // MemoryDef's are implicit uses of previous state, so they are also uses.
- // This means we don't really have def-only instructions. The only
- // MemoryDef's that are not really uses are those that are of the LiveOnEntry
- // variable (because LiveOnEntry can reach anywhere, and every def is a
- // must-kill of LiveOnEntry).
- // In theory, you could precisely compute live-in by using alias-analysis to
- // disambiguate defs and uses to see which really pair up with which.
- // In practice, this would be really expensive and difficult. So we simply
- // assume all defs are also uses that need to be kept live.
- // Because of this, the end result of this live-in computation will be "the
- // entire set of basic blocks that reach any use".
-
- SmallPtrSet<BasicBlock *, 32> LiveInBlocks;
- SmallVector<BasicBlock *, 64> LiveInBlockWorklist(DefUseBlocks.begin(),
- DefUseBlocks.end());
- // Now that we have a set of blocks where a value is live-in, recursively add
- // predecessors until we find the full region the value is live.
- while (!LiveInBlockWorklist.empty()) {
- BasicBlock *BB = LiveInBlockWorklist.pop_back_val();
-
- // The block really is live in here, insert it into the set. If already in
- // the set, then it has already been processed.
- if (!LiveInBlocks.insert(BB).second)
- continue;
-
- // Since the value is live into BB, it is either defined in a predecessor or
- // live into it to.
- LiveInBlockWorklist.append(pred_begin(BB), pred_end(BB));
- }
-
- // Determine where our MemoryPhi's should go
- ForwardIDFCalculator IDFs(*DT);
- IDFs.setDefiningBlocks(DefiningBlocks);
- IDFs.setLiveInBlocks(LiveInBlocks);
- SmallVector<BasicBlock *, 32> IDFBlocks;
- IDFs.calculate(IDFBlocks);
-
- // Now place MemoryPhi nodes.
- for (auto &BB : IDFBlocks) {
- // Insert phi node
- AccessList *Accesses = getOrCreateAccessList(BB);
- MemoryPhi *Phi = new MemoryPhi(BB->getContext(), BB, NextID++);
- ValueToMemoryAccess.insert(std::make_pair(BB, Phi));
- // Phi's always are placed at the front of the block.
- Accesses->push_front(Phi);
- }
+ placePHINodes(DefiningBlocks, BBNumbers);
// Now do regular SSA renaming on the MemoryDef/MemoryUse. Visited will get
// filled in with all blocks.
SmallPtrSet<BasicBlock *, 16> Visited;
renamePass(DT->getRootNode(), LiveOnEntryDef.get(), Visited);
- MemorySSAWalker *Walker = getWalker();
+ CachingWalker *Walker = getWalkerImpl();
- // Now optimize the MemoryUse's defining access to point to the nearest
- // dominating clobbering def.
- // This ensures that MemoryUse's that are killed by the same store are
- // immediate users of that store, one of the invariants we guarantee.
- for (auto DomNode : depth_first(DT)) {
- BasicBlock *BB = DomNode->getBlock();
- auto AI = PerBlockAccesses.find(BB);
- if (AI == PerBlockAccesses.end())
- continue;
- AccessList *Accesses = AI->second.get();
- for (auto &MA : *Accesses) {
- if (auto *MU = dyn_cast<MemoryUse>(&MA)) {
- Instruction *Inst = MU->getMemoryInst();
- MU->setDefiningAccess(Walker->getClobberingMemoryAccess(Inst));
- }
- }
- }
+ // We're doing a batch of updates; don't drop useful caches between them.
+ Walker->setAutoResetWalker(false);
+ OptimizeUses(this, Walker, AA, DT).optimizeUses();
+ Walker->setAutoResetWalker(true);
+ Walker->resetClobberWalker();
// Mark the uses in unreachable blocks as live on entry, so that they go
// somewhere.
@@ -444,7 +1548,9 @@ void MemorySSA::buildMemorySSA() {
markUnreachableAsLiveOnEntry(&BB);
}
-MemorySSAWalker *MemorySSA::getWalker() {
+MemorySSAWalker *MemorySSA::getWalker() { return getWalkerImpl(); }
+
+MemorySSA::CachingWalker *MemorySSA::getWalkerImpl() {
if (Walker)
return Walker.get();
@@ -456,9 +1562,10 @@ MemoryPhi *MemorySSA::createMemoryPhi(BasicBlock *BB) {
assert(!getMemoryAccess(BB) && "MemoryPhi already exists for this BB");
AccessList *Accesses = getOrCreateAccessList(BB);
MemoryPhi *Phi = new MemoryPhi(BB->getContext(), BB, NextID++);
- ValueToMemoryAccess.insert(std::make_pair(BB, Phi));
+ ValueToMemoryAccess[BB] = Phi;
// Phi's always are placed at the front of the block.
Accesses->push_front(Phi);
+ BlockNumberingValid.erase(BB);
return Phi;
}
@@ -481,39 +1588,64 @@ MemoryAccess *MemorySSA::createMemoryAccessInBB(Instruction *I,
auto *Accesses = getOrCreateAccessList(BB);
if (Point == Beginning) {
// It goes after any phi nodes
- auto AI = std::find_if(
- Accesses->begin(), Accesses->end(),
- [](const MemoryAccess &MA) { return !isa<MemoryPhi>(MA); });
+ auto AI = find_if(
+ *Accesses, [](const MemoryAccess &MA) { return !isa<MemoryPhi>(MA); });
Accesses->insert(AI, NewAccess);
} else {
Accesses->push_back(NewAccess);
}
-
+ BlockNumberingValid.erase(BB);
return NewAccess;
}
-MemoryAccess *MemorySSA::createMemoryAccessBefore(Instruction *I,
- MemoryAccess *Definition,
- MemoryAccess *InsertPt) {
+
+MemoryUseOrDef *MemorySSA::createMemoryAccessBefore(Instruction *I,
+ MemoryAccess *Definition,
+ MemoryUseOrDef *InsertPt) {
assert(I->getParent() == InsertPt->getBlock() &&
"New and old access must be in the same block");
MemoryUseOrDef *NewAccess = createDefinedAccess(I, Definition);
auto *Accesses = getOrCreateAccessList(InsertPt->getBlock());
Accesses->insert(AccessList::iterator(InsertPt), NewAccess);
+ BlockNumberingValid.erase(InsertPt->getBlock());
return NewAccess;
}
-MemoryAccess *MemorySSA::createMemoryAccessAfter(Instruction *I,
- MemoryAccess *Definition,
- MemoryAccess *InsertPt) {
+MemoryUseOrDef *MemorySSA::createMemoryAccessAfter(Instruction *I,
+ MemoryAccess *Definition,
+ MemoryAccess *InsertPt) {
assert(I->getParent() == InsertPt->getBlock() &&
"New and old access must be in the same block");
MemoryUseOrDef *NewAccess = createDefinedAccess(I, Definition);
auto *Accesses = getOrCreateAccessList(InsertPt->getBlock());
Accesses->insertAfter(AccessList::iterator(InsertPt), NewAccess);
+ BlockNumberingValid.erase(InsertPt->getBlock());
return NewAccess;
}
+void MemorySSA::spliceMemoryAccessAbove(MemoryDef *Where,
+ MemoryUseOrDef *What) {
+ assert(What != getLiveOnEntryDef() &&
+ Where != getLiveOnEntryDef() && "Can't splice (above) LOE.");
+ assert(dominates(Where, What) && "Only upwards splices are permitted.");
+
+ if (Where == What)
+ return;
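+  // For defs, splice What into the def chain just above Where: users of What
+  // fall back to What's old defining access, and Where is now defined by What.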
+ if (isa<MemoryDef>(What)) {
+ // TODO: possibly use removeMemoryAccess' more efficient RAUW
+ What->replaceAllUsesWith(What->getDefiningAccess());
+ What->setDefiningAccess(Where->getDefiningAccess());
+ Where->setDefiningAccess(What);
+ }
+ AccessList *Src = getWritableBlockAccesses(What->getBlock());
+ AccessList *Dest = getWritableBlockAccesses(Where->getBlock());
+ Dest->splice(AccessList::iterator(Where), *Src, What);
+
+ BlockNumberingValid.erase(What->getBlock());
+ if (What->getBlock() != Where->getBlock())
+ BlockNumberingValid.erase(Where->getBlock());
+}
+
/// \brief Helper function to create new memory accesses
MemoryUseOrDef *MemorySSA::createNewAccess(Instruction *I) {
// The assume intrinsic has a control dependency which we model by claiming
@@ -542,7 +1674,7 @@ MemoryUseOrDef *MemorySSA::createNewAccess(Instruction *I) {
MUD = new MemoryDef(I->getContext(), nullptr, I, I->getParent(), NextID++);
else
MUD = new MemoryUse(I->getContext(), nullptr, I, I->getParent());
- ValueToMemoryAccess.insert(std::make_pair(I, MUD));
+ ValueToMemoryAccess[I] = MUD;
return MUD;
}
@@ -611,6 +1743,7 @@ static MemoryAccess *onlySingleValue(MemoryPhi *MP) {
void MemorySSA::removeFromLookups(MemoryAccess *MA) {
assert(MA->use_empty() &&
"Trying to remove memory access that still has uses");
+ BlockNumbering.erase(MA);
if (MemoryUseOrDef *MUD = dyn_cast<MemoryUseOrDef>(MA))
MUD->setDefiningAccess(nullptr);
// Invalidate our walker's cache if necessary
@@ -624,7 +1757,9 @@ void MemorySSA::removeFromLookups(MemoryAccess *MA) {
} else {
MemoryInst = MA->getBlock();
}
- ValueToMemoryAccess.erase(MemoryInst);
+ auto VMA = ValueToMemoryAccess.find(MemoryInst);
+ if (VMA->second == MA)
+ ValueToMemoryAccess.erase(VMA);
auto AccessIt = PerBlockAccesses.find(MA->getBlock());
std::unique_ptr<AccessList> &Accesses = AccessIt->second;
@@ -652,8 +1787,27 @@ void MemorySSA::removeMemoryAccess(MemoryAccess *MA) {
}
// Re-point the uses at our defining access
- if (!MA->use_empty())
- MA->replaceAllUsesWith(NewDefTarget);
+ if (!MA->use_empty()) {
+ // Reset optimized on users of this store, and reset the uses.
+ // A few notes:
+ // 1. This is a slightly modified version of RAUW to avoid walking the
+ // uses twice here.
+ // 2. If we wanted to be complete, we would have to reset the optimized
+ // flags on users of phi nodes if doing the below makes a phi node have all
+    // the same arguments. Instead, we expect callers to removeMemoryAccess
+    // those phi nodes themselves, because doing it here would be N^3.
+ if (MA->hasValueHandle())
+ ValueHandleBase::ValueIsRAUWd(MA, NewDefTarget);
+ // Note: We assume MemorySSA is not used in metadata since it's not really
+ // part of the IR.
+
+ while (!MA->use_empty()) {
+ Use &U = *MA->use_begin();
+ if (MemoryUse *MU = dyn_cast<MemoryUse>(U.getUser()))
+ MU->resetOptimized();
+ U.set(NewDefTarget);
+ }
+ }
// The call below to erase will destroy MA, so we can't change the order we
// are doing things here
@@ -674,6 +1828,7 @@ void MemorySSA::verifyMemorySSA() const {
verifyDefUses(F);
verifyDomination(F);
verifyOrdering(F);
+ Walker->verify(this);
}
/// \brief Verify that the order and existence of MemoryAccesses matches the
@@ -717,70 +1872,38 @@ void MemorySSA::verifyOrdering(Function &F) const {
/// \brief Verify the domination properties of MemorySSA by checking that each
/// definition dominates all of its uses.
void MemorySSA::verifyDomination(Function &F) const {
+#ifndef NDEBUG
for (BasicBlock &B : F) {
// Phi nodes are attached to basic blocks
- if (MemoryPhi *MP = getMemoryAccess(&B)) {
- for (User *U : MP->users()) {
- BasicBlock *UseBlock;
- // Phi operands are used on edges, we simulate the right domination by
- // acting as if the use occurred at the end of the predecessor block.
- if (MemoryPhi *P = dyn_cast<MemoryPhi>(U)) {
- for (const auto &Arg : P->operands()) {
- if (Arg == MP) {
- UseBlock = P->getIncomingBlock(Arg);
- break;
- }
- }
- } else {
- UseBlock = cast<MemoryAccess>(U)->getBlock();
- }
- (void)UseBlock;
- assert(DT->dominates(MP->getBlock(), UseBlock) &&
- "Memory PHI does not dominate it's uses");
- }
- }
+ if (MemoryPhi *MP = getMemoryAccess(&B))
+ for (const Use &U : MP->uses())
+        assert(dominates(MP, U) && "Memory PHI does not dominate its uses");
for (Instruction &I : B) {
MemoryAccess *MD = dyn_cast_or_null<MemoryDef>(getMemoryAccess(&I));
if (!MD)
continue;
- for (User *U : MD->users()) {
- BasicBlock *UseBlock;
- (void)UseBlock;
- // Things are allowed to flow to phi nodes over their predecessor edge.
- if (auto *P = dyn_cast<MemoryPhi>(U)) {
- for (const auto &Arg : P->operands()) {
- if (Arg == MD) {
- UseBlock = P->getIncomingBlock(Arg);
- break;
- }
- }
- } else {
- UseBlock = cast<MemoryAccess>(U)->getBlock();
- }
- assert(DT->dominates(MD->getBlock(), UseBlock) &&
- "Memory Def does not dominate it's uses");
- }
+ for (const Use &U : MD->uses())
+        assert(dominates(MD, U) && "Memory Def does not dominate its uses");
}
}
+#endif
}
/// \brief Verify the def-use lists in MemorySSA, by verifying that \p Use
/// appears in the use list of \p Def.
-///
-/// llvm_unreachable is used instead of asserts because this may be called in
-/// a build without asserts. In that case, we don't want this to turn into a
-/// nop.
+
void MemorySSA::verifyUseInDefs(MemoryAccess *Def, MemoryAccess *Use) const {
+#ifndef NDEBUG
// The live on entry use may cause us to get a NULL def here
- if (!Def) {
- if (!isLiveOnEntryDef(Use))
- llvm_unreachable("Null def but use not point to live on entry def");
- } else if (std::find(Def->user_begin(), Def->user_end(), Use) ==
- Def->user_end()) {
- llvm_unreachable("Did not find use in def's use list");
- }
+ if (!Def)
+ assert(isLiveOnEntryDef(Use) &&
+           "Null def but use does not point to live on entry def");
+ else
+ assert(is_contained(Def->users(), Use) &&
+ "Did not find use in def's use list");
+#endif
}
/// \brief Verify the immediate use information, by walking all the memory
@@ -798,21 +1921,35 @@ void MemorySSA::verifyDefUses(Function &F) const {
}
for (Instruction &I : B) {
- if (MemoryAccess *MA = getMemoryAccess(&I)) {
- assert(isa<MemoryUseOrDef>(MA) &&
- "Found a phi node not attached to a bb");
- verifyUseInDefs(cast<MemoryUseOrDef>(MA)->getDefiningAccess(), MA);
+ if (MemoryUseOrDef *MA = getMemoryAccess(&I)) {
+ verifyUseInDefs(MA->getDefiningAccess(), MA);
}
}
}
}
-MemoryAccess *MemorySSA::getMemoryAccess(const Value *I) const {
- return ValueToMemoryAccess.lookup(I);
+MemoryUseOrDef *MemorySSA::getMemoryAccess(const Instruction *I) const {
+ return cast_or_null<MemoryUseOrDef>(ValueToMemoryAccess.lookup(I));
}
MemoryPhi *MemorySSA::getMemoryAccess(const BasicBlock *BB) const {
- return cast_or_null<MemoryPhi>(getMemoryAccess((const Value *)BB));
+ return cast_or_null<MemoryPhi>(ValueToMemoryAccess.lookup(cast<Value>(BB)));
+}
+
+/// Perform a local numbering on blocks so that instruction ordering can be
+/// determined in constant time.
+/// TODO: We currently just number in order. If we numbered by N, we could
+/// allow at least N-1 sequences of insertBefore or insertAfter (and at least
+/// log2(N) sequences of mixed before and after) without needing to invalidate
+/// the numbering.
+void MemorySSA::renumberBlock(const BasicBlock *B) const {
+ // The pre-increment ensures the numbers really start at 1.
+ unsigned long CurrentNumber = 0;
+ const AccessList *AL = getBlockAccesses(B);
+ assert(AL != nullptr && "Asking to renumber an empty block");
+ for (const auto &I : *AL)
+ BlockNumbering[&I] = ++CurrentNumber;
+ BlockNumberingValid.insert(B);
}
/// \brief Determine, for two memory accesses in the same block,
@@ -821,9 +1958,10 @@ MemoryPhi *MemorySSA::getMemoryAccess(const BasicBlock *BB) const {
bool MemorySSA::locallyDominates(const MemoryAccess *Dominator,
const MemoryAccess *Dominatee) const {
- assert((Dominator->getBlock() == Dominatee->getBlock()) &&
- "Asking for local domination when accesses are in different blocks!");
+ const BasicBlock *DominatorBlock = Dominator->getBlock();
+ assert((DominatorBlock == Dominatee->getBlock()) &&
+ "Asking for local domination when accesses are in different blocks!");
// A node dominates itself.
if (Dominatee == Dominator)
return true;
@@ -838,14 +1976,42 @@ bool MemorySSA::locallyDominates(const MemoryAccess *Dominator,
if (isLiveOnEntryDef(Dominator))
return true;
- // Get the access list for the block
- const AccessList *AccessList = getBlockAccesses(Dominator->getBlock());
- AccessList::const_reverse_iterator It(Dominator->getIterator());
+ if (!BlockNumberingValid.count(DominatorBlock))
+ renumberBlock(DominatorBlock);
+
+ unsigned long DominatorNum = BlockNumbering.lookup(Dominator);
+ // All numbers start with 1
+ assert(DominatorNum != 0 && "Block was not numbered properly");
+ unsigned long DominateeNum = BlockNumbering.lookup(Dominatee);
+ assert(DominateeNum != 0 && "Block was not numbered properly");
+ return DominatorNum < DominateeNum;
+}
+
+bool MemorySSA::dominates(const MemoryAccess *Dominator,
+ const MemoryAccess *Dominatee) const {
+ if (Dominator == Dominatee)
+ return true;
+
+ if (isLiveOnEntryDef(Dominatee))
+ return false;
+
+ if (Dominator->getBlock() != Dominatee->getBlock())
+ return DT->dominates(Dominator->getBlock(), Dominatee->getBlock());
+ return locallyDominates(Dominator, Dominatee);
+}
- // If we hit the beginning of the access list before we hit dominatee, we must
- // dominate it
- return std::none_of(It, AccessList->rend(),
- [&](const MemoryAccess &MA) { return &MA == Dominatee; });
+bool MemorySSA::dominates(const MemoryAccess *Dominator,
+ const Use &Dominatee) const {
+ if (MemoryPhi *MP = dyn_cast<MemoryPhi>(Dominatee.getUser())) {
+ BasicBlock *UseBB = MP->getIncomingBlock(Dominatee);
+ // The def must dominate the incoming block of the phi.
+ if (UseBB != Dominator->getBlock())
+ return DT->dominates(Dominator->getBlock(), UseBB);
+ // If the UseBB and the DefBB are the same, compare locally.
+ return locallyDominates(Dominator, cast<MemoryAccess>(Dominatee));
+ }
+ // If it's not a PHI node use, the normal dominates can already handle it.
+ return dominates(Dominator, cast<MemoryAccess>(Dominatee.getUser()));
}
const static char LiveOnEntryStr[] = "liveOnEntry";
@@ -924,25 +2090,26 @@ bool MemorySSAPrinterLegacyPass::runOnFunction(Function &F) {
return false;
}
-char MemorySSAAnalysis::PassID;
+AnalysisKey MemorySSAAnalysis::Key;
-MemorySSA MemorySSAAnalysis::run(Function &F, AnalysisManager<Function> &AM) {
+MemorySSAAnalysis::Result MemorySSAAnalysis::run(Function &F,
+ FunctionAnalysisManager &AM) {
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &AA = AM.getResult<AAManager>(F);
- return MemorySSA(F, &AA, &DT);
+ return MemorySSAAnalysis::Result(make_unique<MemorySSA>(F, &AA, &DT));
}
PreservedAnalyses MemorySSAPrinterPass::run(Function &F,
FunctionAnalysisManager &AM) {
OS << "MemorySSA for function: " << F.getName() << "\n";
- AM.getResult<MemorySSAAnalysis>(F).print(OS);
+ AM.getResult<MemorySSAAnalysis>(F).getMSSA().print(OS);
return PreservedAnalyses::all();
}
PreservedAnalyses MemorySSAVerifierPass::run(Function &F,
FunctionAnalysisManager &AM) {
- AM.getResult<MemorySSAAnalysis>(F).verifyMemorySSA();
+ AM.getResult<MemorySSAAnalysis>(F).getMSSA().verifyMemorySSA();
return PreservedAnalyses::all();
}
@@ -978,41 +2145,11 @@ MemorySSAWalker::MemorySSAWalker(MemorySSA *M) : MSSA(M) {}
MemorySSA::CachingWalker::CachingWalker(MemorySSA *M, AliasAnalysis *A,
DominatorTree *D)
- : MemorySSAWalker(M), AA(A), DT(D) {}
+ : MemorySSAWalker(M), Walker(*M, *A, *D, Cache), AutoResetWalker(true) {}
MemorySSA::CachingWalker::~CachingWalker() {}
-struct MemorySSA::CachingWalker::UpwardsMemoryQuery {
- // True if we saw a phi whose predecessor was a backedge
- bool SawBackedgePhi;
- // True if our original query started off as a call
- bool IsCall;
- // The pointer location we started the query with. This will be empty if
- // IsCall is true.
- MemoryLocation StartingLoc;
- // This is the instruction we were querying about.
- const Instruction *Inst;
- // Set of visited Instructions for this query.
- DenseSet<MemoryAccessPair> Visited;
- // Vector of visited call accesses for this query. This is separated out
- // because you can always cache and lookup the result of call queries (IE when
- // IsCall == true) for every call in the chain. The calls have no AA location
- // associated with them with them, and thus, no context dependence.
- SmallVector<const MemoryAccess *, 32> VisitedCalls;
- // The MemoryAccess we actually got called with, used to test local domination
- const MemoryAccess *OriginalAccess;
-
- UpwardsMemoryQuery()
- : SawBackedgePhi(false), IsCall(false), Inst(nullptr),
- OriginalAccess(nullptr) {}
-
- UpwardsMemoryQuery(const Instruction *Inst, const MemoryAccess *Access)
- : SawBackedgePhi(false), IsCall(ImmutableCallSite(Inst)), Inst(Inst),
- OriginalAccess(Access) {}
-};
-
void MemorySSA::CachingWalker::invalidateInfo(MemoryAccess *MA) {
-
// TODO: We can do much better cache invalidation with differently stored
// caches. For now, for MemoryUses, we simply remove them
// from the cache, and kill the entire call/non-call cache for everything
@@ -1026,220 +2163,38 @@ void MemorySSA::CachingWalker::invalidateInfo(MemoryAccess *MA) {
// itself.
if (MemoryUse *MU = dyn_cast<MemoryUse>(MA)) {
- UpwardsMemoryQuery Q;
- Instruction *I = MU->getMemoryInst();
- Q.IsCall = bool(ImmutableCallSite(I));
- Q.Inst = I;
- if (!Q.IsCall)
- Q.StartingLoc = MemoryLocation::get(I);
- doCacheRemove(MA, Q, Q.StartingLoc);
+ UpwardsMemoryQuery Q(MU->getMemoryInst(), MU);
+ Cache.remove(MU, Q.StartingLoc, Q.IsCall);
+ MU->resetOptimized();
} else {
// If it is not a use, the best we can do right now is destroy the cache.
- CachedUpwardsClobberingCall.clear();
- CachedUpwardsClobberingAccess.clear();
+ Cache.clear();
}
#ifdef EXPENSIVE_CHECKS
- // Run this only when expensive checks are enabled.
verifyRemoved(MA);
#endif
}
-void MemorySSA::CachingWalker::doCacheRemove(const MemoryAccess *M,
- const UpwardsMemoryQuery &Q,
- const MemoryLocation &Loc) {
- if (Q.IsCall)
- CachedUpwardsClobberingCall.erase(M);
- else
- CachedUpwardsClobberingAccess.erase({M, Loc});
-}
-
-void MemorySSA::CachingWalker::doCacheInsert(const MemoryAccess *M,
- MemoryAccess *Result,
- const UpwardsMemoryQuery &Q,
- const MemoryLocation &Loc) {
- // This is fine for Phis, since there are times where we can't optimize them.
- // Making a def its own clobber is never correct, though.
- assert((Result != M || isa<MemoryPhi>(M)) &&
- "Something can't clobber itself!");
- ++NumClobberCacheInserts;
- if (Q.IsCall)
- CachedUpwardsClobberingCall[M] = Result;
- else
- CachedUpwardsClobberingAccess[{M, Loc}] = Result;
-}
-
-MemoryAccess *
-MemorySSA::CachingWalker::doCacheLookup(const MemoryAccess *M,
- const UpwardsMemoryQuery &Q,
- const MemoryLocation &Loc) {
- ++NumClobberCacheLookups;
- MemoryAccess *Result;
-
- if (Q.IsCall)
- Result = CachedUpwardsClobberingCall.lookup(M);
- else
- Result = CachedUpwardsClobberingAccess.lookup({M, Loc});
-
- if (Result)
- ++NumClobberCacheHits;
- return Result;
-}
-
-bool MemorySSA::CachingWalker::instructionClobbersQuery(
- const MemoryDef *MD, UpwardsMemoryQuery &Q,
- const MemoryLocation &Loc) const {
- Instruction *DefMemoryInst = MD->getMemoryInst();
- assert(DefMemoryInst && "Defining instruction not actually an instruction");
-
- if (!Q.IsCall)
- return AA->getModRefInfo(DefMemoryInst, Loc) & MRI_Mod;
-
- // If this is a call, mark it for caching
- if (ImmutableCallSite(DefMemoryInst))
- Q.VisitedCalls.push_back(MD);
- ModRefInfo I = AA->getModRefInfo(DefMemoryInst, ImmutableCallSite(Q.Inst));
- return I != MRI_NoModRef;
-}
-
-MemoryAccessPair MemorySSA::CachingWalker::UpwardsDFSWalk(
- MemoryAccess *StartingAccess, const MemoryLocation &Loc,
- UpwardsMemoryQuery &Q, bool FollowingBackedge) {
- MemoryAccess *ModifyingAccess = nullptr;
-
- auto DFI = df_begin(StartingAccess);
- for (auto DFE = df_end(StartingAccess); DFI != DFE;) {
- MemoryAccess *CurrAccess = *DFI;
- if (MSSA->isLiveOnEntryDef(CurrAccess))
- return {CurrAccess, Loc};
- // If this is a MemoryDef, check whether it clobbers our current query. This
- // needs to be done before consulting the cache, because the cache reports
- // the clobber for CurrAccess. If CurrAccess is a clobber for this query,
- // and we ask the cache for information first, then we might skip this
- // clobber, which is bad.
- if (auto *MD = dyn_cast<MemoryDef>(CurrAccess)) {
- // If we hit the top, stop following this path.
- // While we can do lookups, we can't sanely do inserts here unless we were
- // to track everything we saw along the way, since we don't know where we
- // will stop.
- if (instructionClobbersQuery(MD, Q, Loc)) {
- ModifyingAccess = CurrAccess;
- break;
- }
- }
- if (auto CacheResult = doCacheLookup(CurrAccess, Q, Loc))
- return {CacheResult, Loc};
-
- // We need to know whether it is a phi so we can track backedges.
- // Otherwise, walk all upward defs.
- if (!isa<MemoryPhi>(CurrAccess)) {
- ++DFI;
- continue;
- }
-
-#ifndef NDEBUG
- // The loop below visits the phi's children for us. Because phis are the
- // only things with multiple edges, skipping the children should always lead
- // us to the end of the loop.
- //
- // Use a copy of DFI because skipChildren would kill our search stack, which
- // would make caching anything on the way back impossible.
- auto DFICopy = DFI;
- assert(DFICopy.skipChildren() == DFE &&
- "Skipping phi's children doesn't end the DFS?");
-#endif
-
- const MemoryAccessPair PHIPair(CurrAccess, Loc);
-
- // Don't try to optimize this phi again if we've already tried to do so.
- if (!Q.Visited.insert(PHIPair).second) {
- ModifyingAccess = CurrAccess;
- break;
- }
-
- std::size_t InitialVisitedCallSize = Q.VisitedCalls.size();
-
- // Recurse on PHI nodes, since we need to change locations.
- // TODO: Allow graphtraits on pairs, which would turn this whole function
- // into a normal single depth first walk.
- MemoryAccess *FirstDef = nullptr;
- for (auto MPI = upward_defs_begin(PHIPair), MPE = upward_defs_end();
- MPI != MPE; ++MPI) {
- bool Backedge =
- !FollowingBackedge &&
- DT->dominates(CurrAccess->getBlock(), MPI.getPhiArgBlock());
-
- MemoryAccessPair CurrentPair =
- UpwardsDFSWalk(MPI->first, MPI->second, Q, Backedge);
- // All the phi arguments should reach the same point if we can bypass
- // this phi. The alternative is that they hit this phi node, which
- // means we can skip this argument.
- if (FirstDef && CurrentPair.first != PHIPair.first &&
- CurrentPair.first != FirstDef) {
- ModifyingAccess = CurrAccess;
- break;
- }
-
- if (!FirstDef)
- FirstDef = CurrentPair.first;
- }
-
- // If we exited the loop early, go with the result it gave us.
- if (!ModifyingAccess) {
- assert(FirstDef && "Found a Phi with no upward defs?");
- ModifyingAccess = FirstDef;
- } else {
- // If we can't optimize this Phi, then we can't safely cache any of the
- // calls we visited when trying to optimize it. Wipe them out now.
- Q.VisitedCalls.resize(InitialVisitedCallSize);
- }
- break;
- }
-
- if (!ModifyingAccess)
- return {MSSA->getLiveOnEntryDef(), Q.StartingLoc};
-
- const BasicBlock *OriginalBlock = StartingAccess->getBlock();
- assert(DFI.getPathLength() > 0 && "We dropped our path?");
- unsigned N = DFI.getPathLength();
- // If we found a clobbering def, the last element in the path will be our
- // clobber, so we don't want to cache that to itself. OTOH, if we optimized a
- // phi, we can add the last thing in the path to the cache, since that won't
- // be the result.
- if (DFI.getPath(N - 1) == ModifyingAccess)
- --N;
- for (; N > 1; --N) {
- MemoryAccess *CacheAccess = DFI.getPath(N - 1);
- BasicBlock *CurrBlock = CacheAccess->getBlock();
- if (!FollowingBackedge)
- doCacheInsert(CacheAccess, ModifyingAccess, Q, Loc);
- if (DT->dominates(CurrBlock, OriginalBlock) &&
- (CurrBlock != OriginalBlock || !FollowingBackedge ||
- MSSA->locallyDominates(CacheAccess, StartingAccess)))
- break;
- }
-
- // Cache everything else on the way back. The caller should cache
- // StartingAccess for us.
- for (; N > 1; --N) {
- MemoryAccess *CacheAccess = DFI.getPath(N - 1);
- doCacheInsert(CacheAccess, ModifyingAccess, Q, Loc);
- }
-
- return {ModifyingAccess, Loc};
-}
-
/// \brief Walk the use-def chains starting at \p MA and find
/// the MemoryAccess that actually clobbers Loc.
///
/// \returns our clobbering memory access
MemoryAccess *MemorySSA::CachingWalker::getClobberingMemoryAccess(
MemoryAccess *StartingAccess, UpwardsMemoryQuery &Q) {
- return UpwardsDFSWalk(StartingAccess, Q.StartingLoc, Q, false).first;
+ MemoryAccess *New = Walker.findClobber(StartingAccess, Q);
+#ifdef EXPENSIVE_CHECKS
+ MemoryAccess *NewNoCache =
+ Walker.findClobber(StartingAccess, Q, /*UseWalkerCache=*/false);
+ assert(NewNoCache == New && "Cache made us hand back a different result?");
+#endif
+ if (AutoResetWalker)
+ resetClobberWalker();
+ return New;
}
MemoryAccess *MemorySSA::CachingWalker::getClobberingMemoryAccess(
- MemoryAccess *StartingAccess, MemoryLocation &Loc) {
+ MemoryAccess *StartingAccess, const MemoryLocation &Loc) {
if (isa<MemoryPhi>(StartingAccess))
return StartingAccess;
@@ -1257,10 +2212,10 @@ MemoryAccess *MemorySSA::CachingWalker::getClobberingMemoryAccess(
UpwardsMemoryQuery Q;
Q.OriginalAccess = StartingUseOrDef;
Q.StartingLoc = Loc;
- Q.Inst = StartingUseOrDef->getMemoryInst();
+ Q.Inst = I;
Q.IsCall = false;
- if (auto CacheResult = doCacheLookup(StartingUseOrDef, Q, Q.StartingLoc))
+ if (auto *CacheResult = Cache.lookup(StartingUseOrDef, Loc, Q.IsCall))
return CacheResult;
// Unlike the other function, do not walk to the def of a def, because we are
@@ -1270,9 +2225,6 @@ MemoryAccess *MemorySSA::CachingWalker::getClobberingMemoryAccess(
: StartingUseOrDef;
MemoryAccess *Clobber = getClobberingMemoryAccess(DefiningAccess, Q);
- // Only cache this if it wouldn't make Clobber point to itself.
- if (Clobber != StartingAccess)
- doCacheInsert(Q.OriginalAccess, Clobber, Q, Q.StartingLoc);
DEBUG(dbgs() << "Starting Memory SSA clobber for " << *I << " is ");
DEBUG(dbgs() << *StartingUseOrDef << "\n");
DEBUG(dbgs() << "Final Memory SSA clobber for " << *I << " is ");
@@ -1281,28 +2233,38 @@ MemoryAccess *MemorySSA::CachingWalker::getClobberingMemoryAccess(
}
MemoryAccess *
-MemorySSA::CachingWalker::getClobberingMemoryAccess(const Instruction *I) {
- // There should be no way to lookup an instruction and get a phi as the
- // access, since we only map BB's to PHI's. So, this must be a use or def.
- auto *StartingAccess = cast<MemoryUseOrDef>(MSSA->getMemoryAccess(I));
-
- bool IsCall = bool(ImmutableCallSite(I));
-
+MemorySSA::CachingWalker::getClobberingMemoryAccess(MemoryAccess *MA) {
+ auto *StartingAccess = dyn_cast<MemoryUseOrDef>(MA);
+ // If this is a MemoryPhi, we can't do anything.
+ if (!StartingAccess)
+ return MA;
+
+ // If this is an already optimized use or def, return the optimized result.
+ // Note: Currently, we do not store the optimized def result because we'd need
+ // a separate field, since we can't use it as the defining access.
+ if (MemoryUse *MU = dyn_cast<MemoryUse>(StartingAccess))
+ if (MU->isOptimized())
+ return MU->getDefiningAccess();
+
+ const Instruction *I = StartingAccess->getMemoryInst();
+ UpwardsMemoryQuery Q(I, StartingAccess);
// We can't sanely do anything with a fences, they conservatively
// clobber all memory, and have no locations to get pointers from to
// try to disambiguate.
- if (!IsCall && I->isFenceLike())
+ if (!Q.IsCall && I->isFenceLike())
return StartingAccess;
- UpwardsMemoryQuery Q;
- Q.OriginalAccess = StartingAccess;
- Q.IsCall = IsCall;
- if (!Q.IsCall)
- Q.StartingLoc = MemoryLocation::get(I);
- Q.Inst = I;
- if (auto CacheResult = doCacheLookup(StartingAccess, Q, Q.StartingLoc))
+ if (auto *CacheResult = Cache.lookup(StartingAccess, Q.StartingLoc, Q.IsCall))
return CacheResult;
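+  // Accesses that are trivially clobbered only by liveOnEntry are resolved
+  // immediately: cache the result and point the use at liveOnEntry.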
+ if (isUseTriviallyOptimizableToLiveOnEntry(*MSSA->AA, I)) {
+ MemoryAccess *LiveOnEntry = MSSA->getLiveOnEntryDef();
+ Cache.insert(StartingAccess, LiveOnEntry, Q.StartingLoc, Q.IsCall);
+ if (MemoryUse *MU = dyn_cast<MemoryUse>(StartingAccess))
+ MU->setDefiningAccess(LiveOnEntry, true);
+ return LiveOnEntry;
+ }
+
// Start with the thing we already think clobbers this location
MemoryAccess *DefiningAccess = StartingAccess->getDefiningAccess();
@@ -1312,50 +2274,32 @@ MemorySSA::CachingWalker::getClobberingMemoryAccess(const Instruction *I) {
return DefiningAccess;
MemoryAccess *Result = getClobberingMemoryAccess(DefiningAccess, Q);
- // DFS won't cache a result for DefiningAccess. So, if DefiningAccess isn't
- // our clobber, be sure that it gets a cache entry, too.
- if (Result != DefiningAccess)
- doCacheInsert(DefiningAccess, Result, Q, Q.StartingLoc);
- doCacheInsert(Q.OriginalAccess, Result, Q, Q.StartingLoc);
- // TODO: When this implementation is more mature, we may want to figure out
- // what this additional caching buys us. It's most likely A Good Thing.
- if (Q.IsCall)
- for (const MemoryAccess *MA : Q.VisitedCalls)
- if (MA != Result)
- doCacheInsert(MA, Result, Q, Q.StartingLoc);
-
DEBUG(dbgs() << "Starting Memory SSA clobber for " << *I << " is ");
DEBUG(dbgs() << *DefiningAccess << "\n");
DEBUG(dbgs() << "Final Memory SSA clobber for " << *I << " is ");
DEBUG(dbgs() << *Result << "\n");
+ if (MemoryUse *MU = dyn_cast<MemoryUse>(StartingAccess))
+ MU->setDefiningAccess(Result, true);
return Result;
}
// Verify that MA doesn't exist in any of the caches.
void MemorySSA::CachingWalker::verifyRemoved(MemoryAccess *MA) {
-#ifndef NDEBUG
- for (auto &P : CachedUpwardsClobberingAccess)
- assert(P.first.first != MA && P.second != MA &&
- "Found removed MemoryAccess in cache.");
- for (auto &P : CachedUpwardsClobberingCall)
- assert(P.first != MA && P.second != MA &&
- "Found removed MemoryAccess in cache.");
-#endif // !NDEBUG
+ assert(!Cache.contains(MA) && "Found removed MemoryAccess in cache.");
}
MemoryAccess *
-DoNothingMemorySSAWalker::getClobberingMemoryAccess(const Instruction *I) {
- MemoryAccess *MA = MSSA->getMemoryAccess(I);
+DoNothingMemorySSAWalker::getClobberingMemoryAccess(MemoryAccess *MA) {
if (auto *Use = dyn_cast<MemoryUseOrDef>(MA))
return Use->getDefiningAccess();
return MA;
}
MemoryAccess *DoNothingMemorySSAWalker::getClobberingMemoryAccess(
- MemoryAccess *StartingAccess, MemoryLocation &) {
+ MemoryAccess *StartingAccess, const MemoryLocation &) {
if (auto *Use = dyn_cast<MemoryUseOrDef>(StartingAccess))
return Use->getDefiningAccess();
return StartingAccess;
}
-}
+} // namespace llvm
diff --git a/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp
index eb91885..0d623df 100644
--- a/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp
@@ -89,6 +89,44 @@ void llvm::appendToGlobalDtors(Module &M, Function *F, int Priority, Constant *D
appendToGlobalArray("llvm.global_dtors", M, F, Priority, Data);
}
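+// Append Values to the named appending-linkage array (llvm.used or
+// llvm.compiler.used), de-duplicating against any entries already present.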
+static void appendToUsedList(Module &M, StringRef Name, ArrayRef<GlobalValue *> Values) {
+ GlobalVariable *GV = M.getGlobalVariable(Name);
+ SmallPtrSet<Constant *, 16> InitAsSet;
+ SmallVector<Constant *, 16> Init;
+ if (GV) {
+ ConstantArray *CA = dyn_cast<ConstantArray>(GV->getInitializer());
+ for (auto &Op : CA->operands()) {
+ Constant *C = cast_or_null<Constant>(Op);
+ if (InitAsSet.insert(C).second)
+ Init.push_back(C);
+ }
+ GV->eraseFromParent();
+ }
+
+ Type *Int8PtrTy = llvm::Type::getInt8PtrTy(M.getContext());
+ for (auto *V : Values) {
+ Constant *C = ConstantExpr::getBitCast(V, Int8PtrTy);
+ if (InitAsSet.insert(C).second)
+ Init.push_back(C);
+ }
+
+ if (Init.empty())
+ return;
+
+ ArrayType *ATy = ArrayType::get(Int8PtrTy, Init.size());
+ GV = new llvm::GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
+ ConstantArray::get(ATy, Init), Name);
+ GV->setSection("llvm.metadata");
+}
+
+void llvm::appendToUsed(Module &M, ArrayRef<GlobalValue *> Values) {
+ appendToUsedList(M, "llvm.used", Values);
+}
+
+void llvm::appendToCompilerUsed(Module &M, ArrayRef<GlobalValue *> Values) {
+ appendToUsedList(M, "llvm.compiler.used", Values);
+}
+
Function *llvm::checkSanitizerInterfaceFunction(Constant *FuncOrBitcast) {
if (isa<Function>(FuncOrBitcast))
return cast<Function>(FuncOrBitcast);
@@ -104,7 +142,7 @@ std::pair<Function *, Function *> llvm::createSanitizerCtorAndInitFunctions(
ArrayRef<Type *> InitArgTypes, ArrayRef<Value *> InitArgs,
StringRef VersionCheckName) {
assert(!InitName.empty() && "Expected init function name");
- assert(InitArgTypes.size() == InitArgTypes.size() &&
+ assert(InitArgs.size() == InitArgTypes.size() &&
"Sanitizer's init function expects different number of arguments");
Function *Ctor = Function::Create(
FunctionType::get(Type::getVoidTy(M.getContext()), false),
@@ -126,3 +164,67 @@ std::pair<Function *, Function *> llvm::createSanitizerCtorAndInitFunctions(
}
return std::make_pair(Ctor, InitFunction);
}
+
+void llvm::filterDeadComdatFunctions(
+ Module &M, SmallVectorImpl<Function *> &DeadComdatFunctions) {
+ // Build a map from the comdat to the number of entries in that comdat we
+ // think are dead. If this fully covers the comdat group, then the entire
+ // group is dead. If we find another entry in the comdat group though, we'll
+ // have to preserve the whole group.
+ SmallDenseMap<Comdat *, int, 16> ComdatEntriesCovered;
+ for (Function *F : DeadComdatFunctions) {
+ Comdat *C = F->getComdat();
+ assert(C && "Expected all input GVs to be in a comdat!");
+ ComdatEntriesCovered[C] += 1;
+ }
+
+ auto CheckComdat = [&](Comdat &C) {
+ auto CI = ComdatEntriesCovered.find(&C);
+ if (CI == ComdatEntriesCovered.end())
+ return;
+
+ // If this could have been covered by a dead entry, just subtract one to
+ // account for it.
+ if (CI->second > 0) {
+ CI->second -= 1;
+ return;
+ }
+
+ // If we've already accounted for all the entries that were dead, the
+ // entire comdat is alive so remove it from the map.
+ ComdatEntriesCovered.erase(CI);
+ };
+
+ auto CheckAllComdats = [&] {
+ for (Function &F : M.functions())
+ if (Comdat *C = F.getComdat()) {
+ CheckComdat(*C);
+ if (ComdatEntriesCovered.empty())
+ return;
+ }
+ for (GlobalVariable &GV : M.globals())
+ if (Comdat *C = GV.getComdat()) {
+ CheckComdat(*C);
+ if (ComdatEntriesCovered.empty())
+ return;
+ }
+ for (GlobalAlias &GA : M.aliases())
+ if (Comdat *C = GA.getComdat()) {
+ CheckComdat(*C);
+ if (ComdatEntriesCovered.empty())
+ return;
+ }
+ };
+ CheckAllComdats();
+
+ if (ComdatEntriesCovered.empty()) {
+ DeadComdatFunctions.clear();
+ return;
+ }
+
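+  // At this point, any comdat still in the map contained only dead entries;
+  // every other comdat turned out to have at least one live member.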
+ // Remove the entries that were not covering.
+ erase_if(DeadComdatFunctions, [&](GlobalValue *GV) {
+ return ComdatEntriesCovered.find(GV->getComdat()) ==
+ ComdatEntriesCovered.end();
+ });
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/NameAnonFunctions.cpp b/contrib/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp
index c4f3839..34dc1cc 100644
--- a/contrib/llvm/lib/Transforms/Utils/NameAnonFunctions.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp
@@ -1,4 +1,4 @@
-//===- NameAnonFunctions.cpp - ThinLTO Summary-based Function Import ------===//
+//===- NameAnonGlobals.cpp - ThinLTO Support: Name Unnamed Globals --------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,11 +7,13 @@
//
//===----------------------------------------------------------------------===//
//
-// This file implements naming anonymous function to make sure they can be
-// refered to by ThinLTO.
+// This file implements naming anonymous globals to make sure they can be
+// referred to by ThinLTO.
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Utils/NameAnonGlobals.h"
+
#include "llvm/ADT/SmallString.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/MD5.h"
@@ -19,8 +21,9 @@
using namespace llvm;
+namespace {
// Compute a "unique" hash for the module based on the name of the public
-// functions.
+// globals.
class ModuleHasher {
Module &TheModule;
std::string TheHash;
@@ -57,46 +60,62 @@ public:
return TheHash;
}
};
+} // end anonymous namespace
-// Rename all the anon functions in the module
-bool llvm::nameUnamedFunctions(Module &M) {
+// Rename all the anon globals in the module
+bool llvm::nameUnamedGlobals(Module &M) {
bool Changed = false;
ModuleHasher ModuleHash(M);
int count = 0;
- for (auto &F : M) {
- if (F.hasName())
- continue;
- F.setName(Twine("anon.") + ModuleHash.get() + "." + Twine(count++));
+ auto RenameIfNeed = [&](GlobalValue &GV) {
+ if (GV.hasName())
+ return;
+ GV.setName(Twine("anon.") + ModuleHash.get() + "." + Twine(count++));
Changed = true;
- }
+ };
+ for (auto &GO : M.global_objects())
+ RenameIfNeed(GO);
+ for (auto &GA : M.aliases())
+ RenameIfNeed(GA);
+
return Changed;
}
namespace {
-// Simple pass that provides a name to every anon function.
-class NameAnonFunction : public ModulePass {
+// Legacy pass that provides a name to every anon global.
+class NameAnonGlobalLegacyPass : public ModulePass {
public:
/// Pass identification, replacement for typeid
static char ID;
/// Specify pass name for debug output
- const char *getPassName() const override { return "Name Anon Functions"; }
+ StringRef getPassName() const override { return "Name Anon Globals"; }
- explicit NameAnonFunction() : ModulePass(ID) {}
+ explicit NameAnonGlobalLegacyPass() : ModulePass(ID) {}
- bool runOnModule(Module &M) override { return nameUnamedFunctions(M); }
+ bool runOnModule(Module &M) override { return nameUnamedGlobals(M); }
};
-char NameAnonFunction::ID = 0;
+char NameAnonGlobalLegacyPass::ID = 0;
} // anonymous namespace
-INITIALIZE_PASS_BEGIN(NameAnonFunction, "name-anon-functions",
- "Provide a name to nameless functions", false, false)
-INITIALIZE_PASS_END(NameAnonFunction, "name-anon-functions",
- "Provide a name to nameless functions", false, false)
+PreservedAnalyses NameAnonGlobalPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ if (!nameUnamedGlobals(M))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+INITIALIZE_PASS_BEGIN(NameAnonGlobalLegacyPass, "name-anon-globals",
+ "Provide a name to nameless globals", false, false)
+INITIALIZE_PASS_END(NameAnonGlobalLegacyPass, "name-anon-globals",
+ "Provide a name to nameless globals", false, false)
namespace llvm {
-ModulePass *createNameAnonFunctionPass() { return new NameAnonFunction(); }
+ModulePass *createNameAnonGlobalPass() {
+ return new NameAnonGlobalLegacyPass();
+}
}
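
The renaming scheme can be sketched outside of LLVM as well: compute one module-wide hash, then hand out "anon.<hash>.<counter>" names to anything still unnamed. The code below is a hypothetical plain-C++ analogue (std::hash instead of the MD5 hash used above), not the pass itself.

    #include <functional>
    #include <sstream>
    #include <string>
    #include <vector>

    // Give every empty (anonymous) name a stable, module-unique replacement.
    void nameAnonymous(std::vector<std::string> &Names, const std::string &ModuleId) {
      std::size_t Hash = std::hash<std::string>{}(ModuleId); // toy module hash
      int Count = 0;
      for (std::string &N : Names) {
        if (!N.empty())
          continue; // already named, leave it alone
        std::ostringstream OS;
        OS << "anon." << std::hex << Hash << "." << std::dec << Count++;
        N = OS.str();
      }
    }
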
diff --git a/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index cbf385d..35faa6f 100644
--- a/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -907,6 +907,8 @@ NextIteration:
// The currently active variable for this block is now the PHI.
IncomingVals[AllocaNo] = APN;
+ if (DbgDeclareInst *DDI = AllocaDbgDeclares[AllocaNo])
+ ConvertDebugDeclareToDebugValue(DDI, APN, DIB);
// Get the next phi node.
++PNI;
diff --git a/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp
index 88b39dd..8e93ee7 100644
--- a/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp
@@ -482,5 +482,5 @@ bool
LoadAndStorePromoter::isInstInList(Instruction *I,
const SmallVectorImpl<Instruction*> &Insts)
const {
- return std::find(Insts.begin(), Insts.end(), I) != Insts.end();
+ return is_contained(Insts, I);
}
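
For readers without the ADT headers at hand, llvm::is_contained is, to a first approximation, just the std::find idiom it replaces; a hypothetical stand-alone equivalent:

    #include <algorithm>
    #include <iterator>
    #include <vector>

    // Roughly what is_contained does: true if Element occurs anywhere in R.
    template <typename Range, typename T>
    bool isContained(const Range &R, const T &Element) {
      return std::find(std::begin(R), std::end(R), Element) != std::end(R);
    }

    // Usage: std::vector<int> V{1, 2, 3}; isContained(V, 2) == true.
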
diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index c197317..7b0bddb 100644
--- a/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -11,27 +11,39 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
@@ -40,15 +52,29 @@
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
+#include <cassert>
+#include <climits>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
#include <map>
#include <set>
+#include <utility>
+#include <vector>
+
using namespace llvm;
using namespace PatternMatch;
@@ -110,6 +136,7 @@ STATISTIC(NumSinkCommons,
STATISTIC(NumSpeculations, "Number of speculative executed instructions");
namespace {
+
// The first field contains the value that the switch produces when a certain
// case group is selected, and the second field is a vector containing the
// cases composing the case group.
@@ -168,13 +195,17 @@ public:
SmallPtrSetImpl<BasicBlock *> *LoopHeaders)
: TTI(TTI), DL(DL), BonusInstThreshold(BonusInstThreshold), AC(AC),
LoopHeaders(LoopHeaders) {}
+
bool run(BasicBlock *BB);
};
-}
+
+} // end anonymous namespace
/// Return true if it is safe to merge these two
/// terminator instructions together.
-static bool SafeToMergeTerminators(TerminatorInst *SI1, TerminatorInst *SI2) {
+static bool
+SafeToMergeTerminators(TerminatorInst *SI1, TerminatorInst *SI2,
+ SmallSetVector<BasicBlock *, 4> *FailBlocks = nullptr) {
if (SI1 == SI2)
return false; // Can't merge with self!
@@ -183,18 +214,22 @@ static bool SafeToMergeTerminators(TerminatorInst *SI1, TerminatorInst *SI2) {
// conflicting incoming values from the two switch blocks.
BasicBlock *SI1BB = SI1->getParent();
BasicBlock *SI2BB = SI2->getParent();
- SmallPtrSet<BasicBlock *, 16> SI1Succs(succ_begin(SI1BB), succ_end(SI1BB));
+ SmallPtrSet<BasicBlock *, 16> SI1Succs(succ_begin(SI1BB), succ_end(SI1BB));
+ bool Fail = false;
for (BasicBlock *Succ : successors(SI2BB))
if (SI1Succs.count(Succ))
for (BasicBlock::iterator BBI = Succ->begin(); isa<PHINode>(BBI); ++BBI) {
PHINode *PN = cast<PHINode>(BBI);
if (PN->getIncomingValueForBlock(SI1BB) !=
- PN->getIncomingValueForBlock(SI2BB))
- return false;
+ PN->getIncomingValueForBlock(SI2BB)) {
+ if (FailBlocks)
+ FailBlocks->insert(Succ);
+ Fail = true;
+ }
}
- return true;
+ return !Fail;
}
/// Return true if it is safe and profitable to merge these two terminator
@@ -621,7 +656,8 @@ private:
}
}
};
-}
+
+} // end anonymous namespace
static void EraseTerminatorInstAndDCECond(TerminatorInst *TI) {
Instruction *Cond = nullptr;
@@ -706,7 +742,7 @@ static bool ValuesOverlap(std::vector<ValueEqualityComparisonCase> &C1,
if (V1->size() > V2->size())
std::swap(V1, V2);
- if (V1->size() == 0)
+ if (V1->empty())
return false;
if (V1->size() == 1) {
// Just scan V2.
@@ -874,6 +910,7 @@ bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(
}
namespace {
+
/// This class implements a stable ordering of constant
/// integers that does not depend on their address. This is important for
/// applications that sort ConstantInt's to ensure uniqueness.
@@ -882,7 +919,8 @@ struct ConstantIntOrdering {
return LHS->getValue().ult(RHS->getValue());
}
};
-}
+
+} // end anonymous namespace
static int ConstantIntSortPredicate(ConstantInt *const *P1,
ConstantInt *const *P2) {
@@ -954,7 +992,16 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
TerminatorInst *PTI = Pred->getTerminator();
Value *PCV = isValueEqualityComparison(PTI); // PredCondVal
- if (PCV == CV && SafeToMergeTerminators(TI, PTI)) {
+ if (PCV == CV && TI != PTI) {
+ SmallSetVector<BasicBlock*, 4> FailBlocks;
+ if (!SafeToMergeTerminators(TI, PTI, &FailBlocks)) {
+ for (auto *Succ : FailBlocks) {
+ std::vector<BasicBlock*> Blocks = { TI->getParent() };
+ if (!SplitBlockPredecessors(Succ, Blocks, ".fold.split"))
+ return false;
+ }
+ }
+
// Figure out which 'cases' to copy from SI to PSI.
std::vector<ValueEqualityComparisonCase> BBCases;
BasicBlock *BBDefault = GetValueEqualityComparisonCases(TI, BBCases);
@@ -1215,7 +1262,7 @@ static bool HoistThenElseCodeToIf(BranchInst *BI,
BIParent->getInstList().splice(BI->getIterator(), BB1->getInstList(), I1);
if (!I2->use_empty())
I2->replaceAllUsesWith(I1);
- I1->intersectOptionalDataWith(I2);
+ I1->andIRFlags(I2);
unsigned KnownIDs[] = {LLVMContext::MD_tbaa,
LLVMContext::MD_range,
LLVMContext::MD_fpmath,
@@ -1227,6 +1274,13 @@ static bool HoistThenElseCodeToIf(BranchInst *BI,
LLVMContext::MD_dereferenceable_or_null,
LLVMContext::MD_mem_parallel_loop_access};
combineMetadata(I1, I2, KnownIDs);
+
+ // I1 and I2 are being combined into a single instruction. Its debug
+ // location is the merged locations of the original instructions.
+ if (!isa<CallInst>(I1))
+ I1->setDebugLoc(
+ DILocation::getMergedLocation(I1->getDebugLoc(), I2->getDebugLoc()));
+
I2->eraseFromParent();
Changed = true;
@@ -1319,172 +1373,462 @@ HoistTerminator:
return true;
}
-/// Given an unconditional branch that goes to BBEnd,
-/// check whether BBEnd has only two predecessors and the other predecessor
-/// ends with an unconditional branch. If it is true, sink any common code
-/// in the two predecessors to BBEnd.
-static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
- assert(BI1->isUnconditional());
- BasicBlock *BB1 = BI1->getParent();
- BasicBlock *BBEnd = BI1->getSuccessor(0);
-
- // Check that BBEnd has two predecessors and the other predecessor ends with
- // an unconditional branch.
- pred_iterator PI = pred_begin(BBEnd), PE = pred_end(BBEnd);
- BasicBlock *Pred0 = *PI++;
- if (PI == PE) // Only one predecessor.
- return false;
- BasicBlock *Pred1 = *PI++;
- if (PI != PE) // More than two predecessors.
+// Is it legal to place a variable in operand \c OpIdx of \c I?
+// FIXME: This should be promoted to Instruction.
+static bool canReplaceOperandWithVariable(const Instruction *I,
+ unsigned OpIdx) {
+ // We can't have a PHI with a metadata type.
+ if (I->getOperand(OpIdx)->getType()->isMetadataTy())
return false;
- BasicBlock *BB2 = (Pred0 == BB1) ? Pred1 : Pred0;
- BranchInst *BI2 = dyn_cast<BranchInst>(BB2->getTerminator());
- if (!BI2 || !BI2->isUnconditional())
+
+ // Early exit.
+ if (!isa<Constant>(I->getOperand(OpIdx)))
+ return true;
+
+ switch (I->getOpcode()) {
+ default:
+ return true;
+ case Instruction::Call:
+ case Instruction::Invoke:
+ // FIXME: many arithmetic intrinsics have no issue taking a
+ // variable; however, it's hard to distinguish these from
+ // specials such as @llvm.frameaddress that require a constant.
+ if (isa<IntrinsicInst>(I))
+ return false;
+
+ // Constant bundle operands may need to retain their constant-ness for
+ // correctness.
+ if (ImmutableCallSite(I).isBundleOperand(OpIdx))
+ return false;
+
+ return true;
+
+ case Instruction::ShuffleVector:
+ // Shufflevector masks are constant.
+ return OpIdx != 2;
+ case Instruction::ExtractValue:
+ case Instruction::InsertValue:
+ // All operands apart from the first are constant.
+ return OpIdx == 0;
+ case Instruction::Alloca:
return false;
+ case Instruction::GetElementPtr:
+ if (OpIdx == 0)
+ return true;
+ gep_type_iterator It = std::next(gep_type_begin(I), OpIdx - 1);
+ return It.isSequential();
+ }
+}
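
A source-level way to picture the GEP rule above (illustrative C++ only, not the transform itself): array subscripts may be runtime values, but struct field selections are fixed at compile time, so only the former could ever be replaced by a PHI.

    struct S {
      int A;
      int B;
    };

    // The array index can vary at run time, so a sunk GEP could take it from a PHI.
    int arrayIndex(int *P, int I) { return P[I]; }

    // There is no way to select a struct field with a runtime value; the field
    // index must stay constant, which is what the It.isSequential() check enforces.
    int fieldIndex(S *P, bool WantB) { return WantB ? P->B : P->A; }
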
- // Gather the PHI nodes in BBEnd.
- SmallDenseMap<std::pair<Value *, Value *>, PHINode *> JointValueMap;
- Instruction *FirstNonPhiInBBEnd = nullptr;
- for (BasicBlock::iterator I = BBEnd->begin(), E = BBEnd->end(); I != E; ++I) {
- if (PHINode *PN = dyn_cast<PHINode>(I)) {
- Value *BB1V = PN->getIncomingValueForBlock(BB1);
- Value *BB2V = PN->getIncomingValueForBlock(BB2);
- JointValueMap[std::make_pair(BB1V, BB2V)] = PN;
- } else {
- FirstNonPhiInBBEnd = &*I;
- break;
- }
+// All instructions in Insts belong to different blocks that all unconditionally
+// branch to a common successor. Analyze each instruction and return true if it
+// would be possible to sink them into their successor, creating one common
+// instruction instead. For every value that would be required to be provided by
+// PHI node (because an operand varies in each input block), add to PHIOperands.
+static bool canSinkInstructions(
+ ArrayRef<Instruction *> Insts,
+ DenseMap<Instruction *, SmallVector<Value *, 4>> &PHIOperands) {
+ // Prune out obviously bad instructions to move. Any non-store instruction
+ // must have exactly one use, and we check later that use is by a single,
+ // common PHI instruction in the successor.
+ for (auto *I : Insts) {
+ // These instructions may change or break semantics if moved.
+ if (isa<PHINode>(I) || I->isEHPad() || isa<AllocaInst>(I) ||
+ I->getType()->isTokenTy())
+ return false;
+
+ // Conservatively return false if I is an inline-asm instruction. Sinking
+ // and merging inline-asm instructions can potentially create arguments
+ // that cannot satisfy the inline-asm constraints.
+ if (const auto *C = dyn_cast<CallInst>(I))
+ if (C->isInlineAsm())
+ return false;
+
+ // Everything must have only one use too, apart from stores which
+ // have no uses.
+ if (!isa<StoreInst>(I) && !I->hasOneUse())
+ return false;
}
- if (!FirstNonPhiInBBEnd)
- return false;
- // This does very trivial matching, with limited scanning, to find identical
- // instructions in the two blocks. We scan backward for obviously identical
- // instructions in an identical order.
- BasicBlock::InstListType::reverse_iterator RI1 = BB1->getInstList().rbegin(),
- RE1 = BB1->getInstList().rend(),
- RI2 = BB2->getInstList().rbegin(),
- RE2 = BB2->getInstList().rend();
- // Skip debug info.
- while (RI1 != RE1 && isa<DbgInfoIntrinsic>(&*RI1))
- ++RI1;
- if (RI1 == RE1)
- return false;
- while (RI2 != RE2 && isa<DbgInfoIntrinsic>(&*RI2))
- ++RI2;
- if (RI2 == RE2)
- return false;
- // Skip the unconditional branches.
- ++RI1;
- ++RI2;
+ const Instruction *I0 = Insts.front();
+ for (auto *I : Insts)
+ if (!I->isSameOperationAs(I0))
+ return false;
- bool Changed = false;
- while (RI1 != RE1 && RI2 != RE2) {
- // Skip debug info.
- while (RI1 != RE1 && isa<DbgInfoIntrinsic>(&*RI1))
- ++RI1;
- if (RI1 == RE1)
- return Changed;
- while (RI2 != RE2 && isa<DbgInfoIntrinsic>(&*RI2))
- ++RI2;
- if (RI2 == RE2)
- return Changed;
+ // All instructions in Insts are known to be the same opcode. If they aren't
+ // stores, check the only user of each is a PHI or in the same block as the
+ // instruction, because if a user is in the same block as an instruction
+ // we're contemplating sinking, it must already be determined to be sinkable.
+ if (!isa<StoreInst>(I0)) {
+ auto *PNUse = dyn_cast<PHINode>(*I0->user_begin());
+ auto *Succ = I0->getParent()->getTerminator()->getSuccessor(0);
+ if (!all_of(Insts, [&PNUse,&Succ](const Instruction *I) -> bool {
+ auto *U = cast<Instruction>(*I->user_begin());
+ return (PNUse &&
+ PNUse->getParent() == Succ &&
+ PNUse->getIncomingValueForBlock(I->getParent()) == I) ||
+ U->getParent() == I->getParent();
+ }))
+ return false;
+ }
- Instruction *I1 = &*RI1, *I2 = &*RI2;
- auto InstPair = std::make_pair(I1, I2);
- // I1 and I2 should have a single use in the same PHI node, and they
- // perform the same operation.
- // Cannot move control-flow-involving, volatile loads, vaarg, etc.
- if (isa<PHINode>(I1) || isa<PHINode>(I2) || isa<TerminatorInst>(I1) ||
- isa<TerminatorInst>(I2) || I1->isEHPad() || I2->isEHPad() ||
- isa<AllocaInst>(I1) || isa<AllocaInst>(I2) ||
- I1->mayHaveSideEffects() || I2->mayHaveSideEffects() ||
- I1->mayReadOrWriteMemory() || I2->mayReadOrWriteMemory() ||
- !I1->hasOneUse() || !I2->hasOneUse() || !JointValueMap.count(InstPair))
- return Changed;
+ for (unsigned OI = 0, OE = I0->getNumOperands(); OI != OE; ++OI) {
+ if (I0->getOperand(OI)->getType()->isTokenTy())
+ // Don't touch any operand of token type.
+ return false;
+
+ // Because SROA can't handle speculating stores of selects, try not
+ // to sink loads or stores of allocas when we'd have to create a PHI for
+ // the address operand. Also, because it is likely that loads or stores
+ // of allocas will disappear when Mem2Reg/SROA is run, don't sink them.
+ // This can cause code churn which can have unintended consequences down
+ // the line - see https://llvm.org/bugs/show_bug.cgi?id=30244.
+ // FIXME: This is a workaround for a deficiency in SROA - see
+ // https://llvm.org/bugs/show_bug.cgi?id=30188
+ if (OI == 1 && isa<StoreInst>(I0) &&
+ any_of(Insts, [](const Instruction *I) {
+ return isa<AllocaInst>(I->getOperand(1));
+ }))
+ return false;
+ if (OI == 0 && isa<LoadInst>(I0) && any_of(Insts, [](const Instruction *I) {
+ return isa<AllocaInst>(I->getOperand(0));
+ }))
+ return false;
- // Check whether we should swap the operands of ICmpInst.
- // TODO: Add support of communativity.
- ICmpInst *ICmp1 = dyn_cast<ICmpInst>(I1), *ICmp2 = dyn_cast<ICmpInst>(I2);
- bool SwapOpnds = false;
- if (ICmp1 && ICmp2 && ICmp1->getOperand(0) != ICmp2->getOperand(0) &&
- ICmp1->getOperand(1) != ICmp2->getOperand(1) &&
- (ICmp1->getOperand(0) == ICmp2->getOperand(1) ||
- ICmp1->getOperand(1) == ICmp2->getOperand(0))) {
- ICmp2->swapOperands();
- SwapOpnds = true;
+ auto SameAsI0 = [&I0, OI](const Instruction *I) {
+ assert(I->getNumOperands() == I0->getNumOperands());
+ return I->getOperand(OI) == I0->getOperand(OI);
+ };
+ if (!all_of(Insts, SameAsI0)) {
+ if (!canReplaceOperandWithVariable(I0, OI))
+ // We can't create a PHI from this GEP.
+ return false;
+ // Don't create indirect calls! The called value is the final operand.
+ if ((isa<CallInst>(I0) || isa<InvokeInst>(I0)) && OI == OE - 1) {
+ // FIXME: if the call was *already* indirect, we should do this.
+ return false;
+ }
+ for (auto *I : Insts)
+ PHIOperands[I].push_back(I->getOperand(OI));
}
- if (!I1->isSameOperationAs(I2)) {
- if (SwapOpnds)
- ICmp2->swapOperands();
- return Changed;
+ }
+ return true;
+}
+
+// Assuming canSinkInstructions(Blocks) has returned true, sink the last
+// instruction of every block in Blocks to their common successor, commoning
+// into one instruction.
+static bool sinkLastInstruction(ArrayRef<BasicBlock*> Blocks) {
+ auto *BBEnd = Blocks[0]->getTerminator()->getSuccessor(0);
+
+ // canSinkInstructions returning true guarantees that every block has at
+ // least one non-terminator instruction.
+ SmallVector<Instruction*,4> Insts;
+ for (auto *BB : Blocks) {
+ Instruction *I = BB->getTerminator();
+ do {
+ I = I->getPrevNode();
+ } while (isa<DbgInfoIntrinsic>(I) && I != &BB->front());
+ if (!isa<DbgInfoIntrinsic>(I))
+ Insts.push_back(I);
+ }
+
+ // The only checking we need to do now is that all users of all instructions
+ // are the same PHI node. canSinkInstructions should have checked this but
+ // it is slightly over-aggressive - it gets confused by commutative instructions
+ // so double-check it here.
+ Instruction *I0 = Insts.front();
+ if (!isa<StoreInst>(I0)) {
+ auto *PNUse = dyn_cast<PHINode>(*I0->user_begin());
+ if (!all_of(Insts, [&PNUse](const Instruction *I) -> bool {
+ auto *U = cast<Instruction>(*I->user_begin());
+ return U == PNUse;
+ }))
+ return false;
+ }
+
+ // We don't need to do any more checking here; canSinkInstructions should
+ // have done it all for us.
+ SmallVector<Value*, 4> NewOperands;
+ for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) {
+ // This check is different to that in canSinkLastInstruction. There, we
+ // cared about the global view once simplifycfg (and instcombine) have
+ // completed - it takes into account PHIs that become trivially
+ // simplifiable. However here we need a more local view; if an operand
+ // differs we create a PHI and rely on instcombine to clean up the very
+ // small mess we may make.
+ bool NeedPHI = any_of(Insts, [&I0, O](const Instruction *I) {
+ return I->getOperand(O) != I0->getOperand(O);
+ });
+ if (!NeedPHI) {
+ NewOperands.push_back(I0->getOperand(O));
+ continue;
}
- // The operands should be either the same or they need to be generated
- // with a PHI node after sinking. We only handle the case where there is
- // a single pair of different operands.
- Value *DifferentOp1 = nullptr, *DifferentOp2 = nullptr;
- unsigned Op1Idx = ~0U;
- for (unsigned I = 0, E = I1->getNumOperands(); I != E; ++I) {
- if (I1->getOperand(I) == I2->getOperand(I))
- continue;
- // Early exit if we have more-than one pair of different operands or if
- // we need a PHI node to replace a constant.
- if (Op1Idx != ~0U || isa<Constant>(I1->getOperand(I)) ||
- isa<Constant>(I2->getOperand(I))) {
- // If we can't sink the instructions, undo the swapping.
- if (SwapOpnds)
- ICmp2->swapOperands();
- return Changed;
+ // Create a new PHI in the successor block and populate it.
+ auto *Op = I0->getOperand(O);
+ assert(!Op->getType()->isTokenTy() && "Can't PHI tokens!");
+ auto *PN = PHINode::Create(Op->getType(), Insts.size(),
+ Op->getName() + ".sink", &BBEnd->front());
+ for (auto *I : Insts)
+ PN->addIncoming(I->getOperand(O), I->getParent());
+ NewOperands.push_back(PN);
+ }
+
+ // Arbitrarily use I0 as the new "common" instruction; remap its operands
+ // and move it to the start of the successor block.
+ for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O)
+ I0->getOperandUse(O).set(NewOperands[O]);
+ I0->moveBefore(&*BBEnd->getFirstInsertionPt());
+
+ // The debug location for the "common" instruction is the merged locations of
+ // all the commoned instructions. We start with the original location of the
+ // "common" instruction and iteratively merge each location in the loop below.
+ const DILocation *Loc = I0->getDebugLoc();
+
+ // Update metadata and IR flags, and merge debug locations.
+ for (auto *I : Insts)
+ if (I != I0) {
+ Loc = DILocation::getMergedLocation(Loc, I->getDebugLoc());
+ combineMetadataForCSE(I0, I);
+ I0->andIRFlags(I);
+ }
+ if (!isa<CallInst>(I0))
+ I0->setDebugLoc(Loc);
+
+ if (!isa<StoreInst>(I0)) {
+ // canSinkInstructions checked that all instructions were used by
+ // one and only one PHI node. Find that now, RAUW it to our common
+ // instruction and nuke it.
+ assert(I0->hasOneUse());
+ auto *PN = cast<PHINode>(*I0->user_begin());
+ PN->replaceAllUsesWith(I0);
+ PN->eraseFromParent();
+ }
+
+ // Finally nuke all instructions apart from the common instruction.
+ for (auto *I : Insts)
+ if (I != I0)
+ I->eraseFromParent();
+
+ return true;
+}
+
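
At the source level, the combined effect of canSinkInstructions and sinkLastInstruction is roughly the rewrite below (hypothetical C, written out by hand; the ternary stands in for the PHI node that gets created for the differing operand).

    // Before: both arms end in the same operation applied to different operands.
    int before(bool C, int A, int B) {
      int R;
      if (C)
        R = A * 7; // block 1 ends with a mul
      else
        R = B * 7; // block 2 ends with a mul
      return R;
    }

    // After sinking into the common successor: a single mul whose varying
    // operand is fed by a PHI (modelled here by the ternary).
    int after(bool C, int A, int B) {
      int Operand = C ? A : B;
      return Operand * 7;
    }
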
+namespace {
+
+ // LockstepReverseIterator - Iterates through instructions
+ // in a set of blocks in reverse order from the first non-terminator.
+ // For example (assume all blocks have size n):
+ // LockstepReverseIterator I([B1, B2, B3]);
+ // *I-- = [B1[n], B2[n], B3[n]];
+ // *I-- = [B1[n-1], B2[n-1], B3[n-1]];
+ // *I-- = [B1[n-2], B2[n-2], B3[n-2]];
+ // ...
+ class LockstepReverseIterator {
+ ArrayRef<BasicBlock*> Blocks;
+ SmallVector<Instruction*,4> Insts;
+ bool Fail;
+ public:
+ LockstepReverseIterator(ArrayRef<BasicBlock*> Blocks) :
+ Blocks(Blocks) {
+ reset();
+ }
+
+ void reset() {
+ Fail = false;
+ Insts.clear();
+ for (auto *BB : Blocks) {
+ Instruction *Inst = BB->getTerminator();
+ for (Inst = Inst->getPrevNode(); Inst && isa<DbgInfoIntrinsic>(Inst);)
+ Inst = Inst->getPrevNode();
+ if (!Inst) {
+ // Block wasn't big enough.
+ Fail = true;
+ return;
+ }
+ Insts.push_back(Inst);
}
- DifferentOp1 = I1->getOperand(I);
- Op1Idx = I;
- DifferentOp2 = I2->getOperand(I);
}
- DEBUG(dbgs() << "SINK common instructions " << *I1 << "\n");
- DEBUG(dbgs() << " " << *I2 << "\n");
-
- // We insert the pair of different operands to JointValueMap and
- // remove (I1, I2) from JointValueMap.
- if (Op1Idx != ~0U) {
- auto &NewPN = JointValueMap[std::make_pair(DifferentOp1, DifferentOp2)];
- if (!NewPN) {
- NewPN =
- PHINode::Create(DifferentOp1->getType(), 2,
- DifferentOp1->getName() + ".sink", &BBEnd->front());
- NewPN->addIncoming(DifferentOp1, BB1);
- NewPN->addIncoming(DifferentOp2, BB2);
- DEBUG(dbgs() << "Create PHI node " << *NewPN << "\n";);
+ bool isValid() const {
+ return !Fail;
+ }
+
+ void operator -- () {
+ if (Fail)
+ return;
+ for (auto *&Inst : Insts) {
+ for (Inst = Inst->getPrevNode(); Inst && isa<DbgInfoIntrinsic>(Inst);)
+ Inst = Inst->getPrevNode();
+ // Already at beginning of block.
+ if (!Inst) {
+ Fail = true;
+ return;
+ }
}
- // I1 should use NewPN instead of DifferentOp1.
- I1->setOperand(Op1Idx, NewPN);
}
- PHINode *OldPN = JointValueMap[InstPair];
- JointValueMap.erase(InstPair);
-
- // We need to update RE1 and RE2 if we are going to sink the first
- // instruction in the basic block down.
- bool UpdateRE1 = (I1 == &BB1->front()), UpdateRE2 = (I2 == &BB2->front());
- // Sink the instruction.
- BBEnd->getInstList().splice(FirstNonPhiInBBEnd->getIterator(),
- BB1->getInstList(), I1);
- if (!OldPN->use_empty())
- OldPN->replaceAllUsesWith(I1);
- OldPN->eraseFromParent();
- if (!I2->use_empty())
- I2->replaceAllUsesWith(I1);
- I1->intersectOptionalDataWith(I2);
- // TODO: Use combineMetadata here to preserve what metadata we can
- // (analogous to the hoisting case above).
- I2->eraseFromParent();
+ ArrayRef<Instruction*> operator * () const {
+ return Insts;
+ }
+ };
+
+} // end anonymous namespace
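
The iterator's contract is easier to see on plain containers; a hypothetical sketch (not the class above) that yields the k-th-from-the-end element of every sequence and fails as soon as any sequence is too short:

    #include <cstddef>
    #include <vector>

    // Fill Out with the K-th element from the end of every sequence, or return
    // false if some sequence has fewer than K + 1 elements.
    template <typename T>
    bool lockstepFromEnd(const std::vector<std::vector<T>> &Seqs, std::size_t K,
                         std::vector<T> &Out) {
      Out.clear();
      for (const auto &S : Seqs) {
        if (S.size() <= K)
          return false; // mirrors the iterator's Fail flag
        Out.push_back(S[S.size() - 1 - K]);
      }
      return true;
    }

    // K = 0 is the last element of every block, K = 1 the one before it, and so
    // on -- the same order operator--() walks above (debug intrinsics aside).
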
- if (UpdateRE1)
- RE1 = BB1->getInstList().rend();
- if (UpdateRE2)
- RE2 = BB2->getInstList().rend();
- FirstNonPhiInBBEnd = &*I1;
+/// Given an unconditional branch that goes to BBEnd,
+/// check whether BBEnd has only two predecessors and the other predecessor
+/// ends with an unconditional branch. If it is true, sink any common code
+/// in the two predecessors to BBEnd.
+static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
+ assert(BI1->isUnconditional());
+ BasicBlock *BBEnd = BI1->getSuccessor(0);
+
+ // We support two situations:
+ // (1) all incoming arcs are unconditional
+ // (2) one incoming arc is conditional
+ //
+ // (2) is very common in switch defaults and
+ // else-if patterns;
+ //
+ // if (a) f(1);
+ // else if (b) f(2);
+ //
+ // produces:
+ //
+ // [if]
+ // / \
+ // [f(1)] [if]
+ // | | \
+ // | | \
+ // | [f(2)]|
+ // \ | /
+ // [ end ]
+ //
+ // [end] has two unconditional predecessor arcs and one conditional. The
+ // conditional refers to the implicit empty 'else' arc. This conditional
+ // arc can also be caused by an empty default block in a switch.
+ //
+ // In this case, we attempt to sink code from all *unconditional* arcs.
+ // If we can sink instructions from these arcs (determined during the scan
+ // phase below) we insert a common successor for all unconditional arcs and
+ // connect that to [end], to enable sinking:
+ //
+ // [if]
+ // / \
+ // [x(1)] [if]
+ // | | \
+ // | | \
+ // | [x(2)] |
+ // \ / |
+ // [sink.split] |
+ // \ /
+ // [ end ]
+ //
+ SmallVector<BasicBlock*,4> UnconditionalPreds;
+ Instruction *Cond = nullptr;
+ for (auto *B : predecessors(BBEnd)) {
+ auto *T = B->getTerminator();
+ if (isa<BranchInst>(T) && cast<BranchInst>(T)->isUnconditional())
+ UnconditionalPreds.push_back(B);
+ else if ((isa<BranchInst>(T) || isa<SwitchInst>(T)) && !Cond)
+ Cond = T;
+ else
+ return false;
+ }
+ if (UnconditionalPreds.size() < 2)
+ return false;
+
+ bool Changed = false;
+ // We take a two-step approach to tail sinking. First we scan from the end of
+ // each block upwards in lockstep. If the n'th instruction from the end of each
+ // block can be sunk, those instructions are added to ValuesToSink and we
+ // carry on. If we can sink an instruction but need to PHI-merge some operands
+ // (because they're not identical in each instruction) we add these to
+ // PHIOperands.
+ unsigned ScanIdx = 0;
+ SmallPtrSet<Value*,4> InstructionsToSink;
+ DenseMap<Instruction*, SmallVector<Value*,4>> PHIOperands;
+ LockstepReverseIterator LRI(UnconditionalPreds);
+ while (LRI.isValid() &&
+ canSinkInstructions(*LRI, PHIOperands)) {
+ DEBUG(dbgs() << "SINK: instruction can be sunk: " << *(*LRI)[0] << "\n");
+ InstructionsToSink.insert((*LRI).begin(), (*LRI).end());
+ ++ScanIdx;
+ --LRI;
+ }
+
+ auto ProfitableToSinkInstruction = [&](LockstepReverseIterator &LRI) {
+ unsigned NumPHIdValues = 0;
+ for (auto *I : *LRI)
+ for (auto *V : PHIOperands[I])
+ if (InstructionsToSink.count(V) == 0)
+ ++NumPHIdValues;
+ DEBUG(dbgs() << "SINK: #phid values: " << NumPHIdValues << "\n");
+ unsigned NumPHIInsts = NumPHIdValues / UnconditionalPreds.size();
+ if ((NumPHIdValues % UnconditionalPreds.size()) != 0)
+ NumPHIInsts++;
+
+ return NumPHIInsts <= 1;
+ };
+
+ if (ScanIdx > 0 && Cond) {
+ // Check if we would actually sink anything first! This mutates the CFG and
+ // adds an extra block. The goal in doing this is to allow instructions that
+ // couldn't be sunk before to be sunk - obviously, speculatable instructions
+ // (such as trunc, add) can be sunk and predicated already. So we check that
+ // we're going to sink at least one non-speculatable instruction.
+ LRI.reset();
+ unsigned Idx = 0;
+ bool Profitable = false;
+ while (ProfitableToSinkInstruction(LRI) && Idx < ScanIdx) {
+ if (!isSafeToSpeculativelyExecute((*LRI)[0])) {
+ Profitable = true;
+ break;
+ }
+ --LRI;
+ ++Idx;
+ }
+ if (!Profitable)
+ return false;
+
+ DEBUG(dbgs() << "SINK: Splitting edge\n");
+ // We have a conditional edge and we're going to sink some instructions.
+ // Insert a new block postdominating all blocks we're going to sink from.
+ if (!SplitBlockPredecessors(BI1->getSuccessor(0), UnconditionalPreds,
+ ".sink.split"))
+ // Edges couldn't be split.
+ return false;
+ Changed = true;
+ }
+
+ // Now that we've analyzed all potential sinking candidates, perform the
+ // actual sink. We iteratively sink the last non-terminator of the source
+ // blocks into their common successor unless doing so would require too
+ // many PHI instructions to be generated (currently only one PHI is allowed
+ // per sunk instruction).
+ //
+ // We can use InstructionsToSink to discount values needing PHI-merging that will
+ // actually be sunk in a later iteration. This allows us to be more
+ // aggressive in what we sink. This does allow a false positive where we
+ // sink presuming a later value will also be sunk, but stop half way through
+ // and never actually sink it which means we produce more PHIs than intended.
+ // This is unlikely in practice though.
+ for (unsigned SinkIdx = 0; SinkIdx != ScanIdx; ++SinkIdx) {
+ DEBUG(dbgs() << "SINK: Sink: "
+ << *UnconditionalPreds[0]->getTerminator()->getPrevNode()
+ << "\n");
+
+ // Because we've sunk every instruction in turn, the current instruction to
+ // sink is always at index 0.
+ LRI.reset();
+ if (!ProfitableToSinkInstruction(LRI)) {
+ // Too many PHIs would be created.
+ DEBUG(dbgs() << "SINK: stopping here, too many PHIs would be created!\n");
+ break;
+ }
+
+ if (!sinkLastInstruction(UnconditionalPreds))
+ return Changed;
NumSinkCommons++;
Changed = true;
}
@@ -1539,7 +1883,7 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB,
continue;
--MaxNumInstToLookAt;
- // Could be calling an instruction that effects memory like free().
+ // Could be calling an instruction that affects memory like free().
if (CurI.mayHaveSideEffects() && !isa<StoreInst>(CurI))
return nullptr;
@@ -1822,7 +2166,7 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL) {
return false;
// Can't fold blocks that contain noduplicate or convergent calls.
- if (llvm::any_of(*BB, [](const Instruction &I) {
+ if (any_of(*BB, [](const Instruction &I) {
const CallInst *CI = dyn_cast<CallInst>(&I);
return CI && (CI->cannotDuplicate() || CI->isConvergent());
}))
@@ -2464,6 +2808,11 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) {
PBI = New_PBI;
}
+ // If BI was a loop latch, it may have had associated loop metadata.
+ // We need to copy it to the new latch, that is, PBI.
+ if (MDNode *LoopMD = BI->getMetadata(LLVMContext::MD_loop))
+ PBI->setMetadata(LLVMContext::MD_loop, LoopMD);
+
// TODO: If BB is reachable from all paths through PredBlock, then we
// could replace PBI's branch probabilities with BI's.
@@ -4150,18 +4499,28 @@ static bool ForwardSwitchConditionToPHI(SwitchInst *SI) {
/// Return true if the backend will be able to handle
/// initializing an array of constants like C.
-static bool ValidLookupTableConstant(Constant *C) {
+static bool ValidLookupTableConstant(Constant *C, const TargetTransformInfo &TTI) {
if (C->isThreadDependent())
return false;
if (C->isDLLImportDependent())
return false;
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
- return CE->isGEPWithNoNotionalOverIndexing();
+ if (!isa<ConstantFP>(C) && !isa<ConstantInt>(C) &&
+ !isa<ConstantPointerNull>(C) && !isa<GlobalValue>(C) &&
+ !isa<UndefValue>(C) && !isa<ConstantExpr>(C))
+ return false;
+
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
+ if (!CE->isGEPWithNoNotionalOverIndexing())
+ return false;
+ if (!ValidLookupTableConstant(CE->getOperand(0), TTI))
+ return false;
+ }
+
+ if (!TTI.shouldBuildLookupTablesForConstant(C))
+ return false;
- return isa<ConstantFP>(C) || isa<ConstantInt>(C) ||
- isa<ConstantPointerNull>(C) || isa<GlobalValue>(C) ||
- isa<UndefValue>(C);
+ return true;
}
/// If V is a Constant, return it. Otherwise, try to look up
@@ -4216,7 +4575,7 @@ static bool
GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest,
BasicBlock **CommonDest,
SmallVectorImpl<std::pair<PHINode *, Constant *>> &Res,
- const DataLayout &DL) {
+ const DataLayout &DL, const TargetTransformInfo &TTI) {
// The block from which we enter the common destination.
BasicBlock *Pred = SI->getParent();
@@ -4228,7 +4587,7 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest,
++I) {
if (TerminatorInst *T = dyn_cast<TerminatorInst>(I)) {
// If the terminator is a simple branch, continue to the next block.
- if (T->getNumSuccessors() != 1)
+ if (T->getNumSuccessors() != 1 || T->isExceptional())
return false;
Pred = CaseDest;
CaseDest = T->getSuccessor(0);
@@ -4278,7 +4637,7 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest,
return false;
// Be conservative about which kinds of constants we support.
- if (!ValidLookupTableConstant(ConstVal))
+ if (!ValidLookupTableConstant(ConstVal, TTI))
return false;
Res.push_back(std::make_pair(PHI, ConstVal));
@@ -4310,14 +4669,15 @@ static bool InitializeUniqueCases(SwitchInst *SI, PHINode *&PHI,
BasicBlock *&CommonDest,
SwitchCaseResultVectorTy &UniqueResults,
Constant *&DefaultResult,
- const DataLayout &DL) {
+ const DataLayout &DL,
+ const TargetTransformInfo &TTI) {
for (auto &I : SI->cases()) {
ConstantInt *CaseVal = I.getCaseValue();
// Resulting value at phi nodes for this case value.
SwitchCaseResultsTy Results;
if (!GetCaseResults(SI, CaseVal, I.getCaseSuccessor(), &CommonDest, Results,
- DL))
+ DL, TTI))
return false;
// Only one value per case is permitted
@@ -4335,7 +4695,7 @@ static bool InitializeUniqueCases(SwitchInst *SI, PHINode *&PHI,
SmallVector<std::pair<PHINode *, Constant *>, 1> DefaultResults;
BasicBlock *DefaultDest = SI->getDefaultDest();
GetCaseResults(SI, nullptr, SI->getDefaultDest(), &CommonDest, DefaultResults,
- DL);
+ DL, TTI);
// If the default value is not found abort unless the default destination
// is unreachable.
DefaultResult =
@@ -4414,7 +4774,8 @@ static void RemoveSwitchAfterSelectConversion(SwitchInst *SI, PHINode *PHI,
/// phi nodes in a common successor block with only two different
/// constant values, replace the switch with select.
static bool SwitchToSelect(SwitchInst *SI, IRBuilder<> &Builder,
- AssumptionCache *AC, const DataLayout &DL) {
+ AssumptionCache *AC, const DataLayout &DL,
+ const TargetTransformInfo &TTI) {
Value *const Cond = SI->getCondition();
PHINode *PHI = nullptr;
BasicBlock *CommonDest = nullptr;
@@ -4422,7 +4783,7 @@ static bool SwitchToSelect(SwitchInst *SI, IRBuilder<> &Builder,
SwitchCaseResultVectorTy UniqueResults;
// Collect all the cases that will deliver the same value from the switch.
if (!InitializeUniqueCases(SI, PHI, CommonDest, UniqueResults, DefaultResult,
- DL))
+ DL, TTI))
return false;
// Selects choose between maximum two values.
if (UniqueResults.size() != 2)
@@ -4441,6 +4802,7 @@ static bool SwitchToSelect(SwitchInst *SI, IRBuilder<> &Builder,
}
namespace {
+
/// This class represents a lookup table that can be used to replace a switch.
class SwitchLookupTable {
public:
@@ -4497,7 +4859,8 @@ private:
// For ArrayKind, this is the array.
GlobalVariable *Array;
};
-}
+
+} // end anonymous namespace
SwitchLookupTable::SwitchLookupTable(
Module &M, uint64_t TableSize, ConstantInt *Offset,
@@ -4860,7 +5223,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
typedef SmallVector<std::pair<PHINode *, Constant *>, 4> ResultsTy;
ResultsTy Results;
if (!GetCaseResults(SI, CaseVal, CI.getCaseSuccessor(), &CommonDest,
- Results, DL))
+ Results, DL, TTI))
return false;
// Append the result from this case to the list for each phi.
@@ -4886,8 +5249,9 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
// If the table has holes, we need a constant result for the default case
// or a bitmask that fits in a register.
SmallVector<std::pair<PHINode *, Constant *>, 4> DefaultResultsList;
- bool HasDefaultResults = GetCaseResults(SI, nullptr, SI->getDefaultDest(),
- &CommonDest, DefaultResultsList, DL);
+ bool HasDefaultResults =
+ GetCaseResults(SI, nullptr, SI->getDefaultDest(), &CommonDest,
+ DefaultResultsList, DL, TTI);
bool NeedMask = (TableHasHoles && !HasDefaultResults);
if (NeedMask) {
@@ -5044,6 +5408,111 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
return true;
}
+static bool isSwitchDense(ArrayRef<int64_t> Values) {
+ // See also SelectionDAGBuilder::isDense(), which this function was based on.
+ uint64_t Diff = (uint64_t)Values.back() - (uint64_t)Values.front();
+ uint64_t Range = Diff + 1;
+ uint64_t NumCases = Values.size();
+ // 40% is the default density for building a jump table in optsize/minsize mode.
+ uint64_t MinDensity = 40;
+
+ return NumCases * 100 >= Range * MinDensity;
+}
+
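
Worked numbers may help: for cases {5, 9, 13, 17} the range is 13 and 4 * 100 = 400 < 13 * 40 = 520, so the switch is sparse; after the range reduction below the cases become {0, 1, 2, 3} with range 4, and 400 >= 4 * 40 = 160, so it is dense. A self-contained check mirroring the rule above (plain C++, hypothetical helper name):

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Same rule as isSwitchDense: at least 40 cases per 100 slots of value range.
    static bool isDense(const std::vector<int64_t> &SortedValues) {
      uint64_t Range =
          (uint64_t)SortedValues.back() - (uint64_t)SortedValues.front() + 1;
      return SortedValues.size() * 100 >= Range * 40;
    }

    int main() {
      std::cout << isDense({5, 9, 13, 17}) << "\n"; // 0: 400 < 520, sparse
      std::cout << isDense({0, 1, 2, 3}) << "\n";   // 1: 400 >= 160, dense
    }
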
+// Try and transform a switch that has "holes" in it to a contiguous sequence
+// of cases.
+//
+// A switch such as: switch(i) {case 5: case 9: case 13: case 17:} can be
+// range-reduced to: switch ((i-5) / 4) {case 0: case 1: case 2: case 3:}.
+//
+// This converts a sparse switch into a dense switch which allows better
+// lowering and could also allow transforming into a lookup table.
+static bool ReduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder,
+ const DataLayout &DL,
+ const TargetTransformInfo &TTI) {
+ auto *CondTy = cast<IntegerType>(SI->getCondition()->getType());
+ if (CondTy->getIntegerBitWidth() > 64 ||
+ !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth()))
+ return false;
+ // Only bother with this optimization if there are more than 3 switch cases;
+ // SDAG will only bother creating jump tables for 4 or more cases.
+ if (SI->getNumCases() < 4)
+ return false;
+
+ // This transform is agnostic to the signedness of the input or case values. We
+ // can treat the case values as signed or unsigned. We can optimize more common
+ // cases such as a sequence crossing zero {-4,0,4,8} if we interpret case values
+ // as signed.
+ SmallVector<int64_t,4> Values;
+ for (auto &C : SI->cases())
+ Values.push_back(C.getCaseValue()->getValue().getSExtValue());
+ std::sort(Values.begin(), Values.end());
+
+ // If the switch is already dense, there's nothing useful to do here.
+ if (isSwitchDense(Values))
+ return false;
+
+ // First, transform the values such that they start at zero and ascend.
+ int64_t Base = Values[0];
+ for (auto &V : Values)
+ V -= Base;
+
+ // Now we have signed numbers that have been shifted so that, given enough
+ // precision, there are no negative values. Since the rest of the transform
+ // is bitwise only, we switch now to an unsigned representation.
+ uint64_t GCD = 0;
+ for (auto &V : Values)
+ GCD = GreatestCommonDivisor64(GCD, (uint64_t)V);
+
+ // This transform can be done speculatively because it is so cheap - it results
+ // in a single rotate operation being inserted. This can only happen if the
+ // factor extracted is a power of 2.
+ // FIXME: If the GCD is an odd number we can multiply by the multiplicative
+ // inverse of GCD and then perform this transform.
+ // FIXME: It's possible that optimizing a switch on powers of two might also
+ // be beneficial - flag values are often powers of two and we could use a CLZ
+ // as the key function.
+ if (GCD <= 1 || !isPowerOf2_64(GCD))
+ // No common divisor found or too expensive to compute key function.
+ return false;
+
+ unsigned Shift = Log2_64(GCD);
+ for (auto &V : Values)
+ V = (int64_t)((uint64_t)V >> Shift);
+
+ if (!isSwitchDense(Values))
+ // Transform didn't create a dense switch.
+ return false;
+
+ // The obvious transform is to shift the switch condition right and emit a
+ // check that the condition actually cleanly divided by GCD, i.e.
+ // C & ((1 << Shift) - 1) == 0
+ // inserting a new CFG edge to handle the case where it didn't divide cleanly.
+ //
+ // A cheaper way of doing this is a simple ROTR(C, Shift). This performs the
+ // shift and puts the shifted-off bits in the uppermost bits. If any of these
+ // are nonzero then the switch condition will be very large and will hit the
+ // default case.
+
+ auto *Ty = cast<IntegerType>(SI->getCondition()->getType());
+ Builder.SetInsertPoint(SI);
+ auto *ShiftC = ConstantInt::get(Ty, Shift);
+ auto *Sub = Builder.CreateSub(SI->getCondition(), ConstantInt::get(Ty, Base));
+ auto *LShr = Builder.CreateLShr(Sub, ShiftC);
+ auto *Shl = Builder.CreateShl(Sub, Ty->getBitWidth() - Shift);
+ auto *Rot = Builder.CreateOr(LShr, Shl);
+ SI->replaceUsesOfWith(SI->getCondition(), Rot);
+
+ for (SwitchInst::CaseIt C = SI->case_begin(), E = SI->case_end(); C != E;
+ ++C) {
+ auto *Orig = C.getCaseValue();
+ auto Sub = Orig->getValue() - APInt(Ty->getBitWidth(), Base);
+ C.setValue(
+ cast<ConstantInt>(ConstantInt::get(Ty, Sub.lshr(ShiftC->getValue()))));
+ }
+ return true;
+}
+
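
The rotate trick can be checked by hand. With cases {5, 9, 13, 17} we get Base = 5, GCD = 4, Shift = 2, and the emitted condition is ROTR(i - 5, 2); any value that is not Base plus a multiple of 4 has its low bits rotated into the top of the word and falls through to the default case. A hypothetical stand-alone demonstration, assuming a 32-bit condition:

    #include <cstdint>
    #include <cstdio>

    // The LShr | Shl pair emitted above is just a rotate right by Shift bits.
    static uint32_t rotr32(uint32_t V, unsigned Shift) {
      return (V >> Shift) | (V << (32 - Shift)); // Shift is 1..31 here
    }

    int main() {
      // switch (i) { case 5: case 9: case 13: case 17: ... }
      // becomes    switch (rotr32(i - 5, 2)) { case 0 ... case 3: ... }
      for (uint32_t I : {5u, 9u, 13u, 17u, 6u}) {
        uint32_t Key = rotr32(I - 5, 2);
        std::printf("i=%2u -> key=%u%s\n", I, Key,
                    Key <= 3 ? "" : " (hits the default case)");
      }
    }
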
bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
BasicBlock *BB = SI->getParent();
@@ -5078,7 +5547,7 @@ bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
if (EliminateDeadSwitchCases(SI, AC, DL))
return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
- if (SwitchToSelect(SI, Builder, AC, DL))
+ if (SwitchToSelect(SI, Builder, AC, DL, TTI))
return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
if (ForwardSwitchConditionToPHI(SI))
@@ -5087,6 +5556,9 @@ bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
if (SwitchToLookupTable(SI, Builder, DL, TTI))
return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
+ if (ReduceSwitchRange(SI, Builder, DL, TTI))
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
+
return false;
}
@@ -5397,7 +5869,10 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I) {
// Now make sure that there are no instructions in between that can alter
// control flow (eg. calls)
- for (BasicBlock::iterator i = ++BasicBlock::iterator(I); &*i != Use; ++i)
+ for (BasicBlock::iterator
+ i = ++BasicBlock::iterator(I),
+ UI = BasicBlock::iterator(dyn_cast<Instruction>(Use));
+ i != UI; ++i)
if (i == I->getParent()->end() || i->mayHaveSideEffects())
return false;
diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp
index df29906..1220490 100644
--- a/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp
@@ -34,14 +34,14 @@ using namespace llvm;
STATISTIC(NumSimplified, "Number of redundant instructions removed");
-static bool runImpl(Function &F, const DominatorTree *DT, const TargetLibraryInfo *TLI,
- AssumptionCache *AC) {
+static bool runImpl(Function &F, const DominatorTree *DT,
+ const TargetLibraryInfo *TLI, AssumptionCache *AC) {
const DataLayout &DL = F.getParent()->getDataLayout();
- SmallPtrSet<const Instruction*, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
+ SmallPtrSet<const Instruction *, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
bool Changed = false;
do {
- for (BasicBlock *BB : depth_first(&F.getEntryBlock()))
+ for (BasicBlock *BB : depth_first(&F.getEntryBlock())) {
// Here be subtlety: the iterator must be incremented before the loop
// body (not sure why), so a range-for loop won't work here.
for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
@@ -51,8 +51,9 @@ static bool runImpl(Function &F, const DominatorTree *DT, const TargetLibraryInf
// empty and we only bother simplifying instructions that are in it.
if (!ToSimplify->empty() && !ToSimplify->count(I))
continue;
+
// Don't waste time simplifying unused instructions.
- if (!I->use_empty())
+ if (!I->use_empty()) {
if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AC)) {
// Mark all uses for resimplification next time round the loop.
for (User *U : I->users())
@@ -61,16 +62,17 @@ static bool runImpl(Function &F, const DominatorTree *DT, const TargetLibraryInf
++NumSimplified;
Changed = true;
}
- bool res = RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
- if (res) {
- // RecursivelyDeleteTriviallyDeadInstruction can remove
- // more than one instruction, so simply incrementing the
- // iterator does not work. When instructions get deleted
- // re-iterate instead.
- BI = BB->begin(); BE = BB->end();
- Changed |= res;
+ }
+ if (RecursivelyDeleteTriviallyDeadInstructions(I, TLI)) {
+ // RecursivelyDeleteTriviallyDeadInstruction can remove more than one
+ // instruction, so simply incrementing the iterator does not work.
+ // When instructions get deleted re-iterate instead.
+ BI = BB->begin();
+ BE = BB->end();
+ Changed = true;
}
}
+ }
// Place the list of instructions to simplify on the next loop iteration
// into ToSimplify.
@@ -90,6 +92,7 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
}
@@ -99,9 +102,8 @@ namespace {
if (skipFunction(F))
return false;
- const DominatorTreeWrapperPass *DTWP =
- getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ const DominatorTree *DT =
+ &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
const TargetLibraryInfo *TLI =
&getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
AssumptionCache *AC =
@@ -115,6 +117,7 @@ char InstSimplifier::ID = 0;
INITIALIZE_PASS_BEGIN(InstSimplifier, "instsimplify",
"Remove redundant instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(InstSimplifier, "instsimplify",
"Remove redundant instructions", false, false)
@@ -126,11 +129,11 @@ FunctionPass *llvm::createInstructionSimplifierPass() {
}
PreservedAnalyses InstSimplifierPass::run(Function &F,
- AnalysisManager<Function> &AM) {
- auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &AC = AM.getResult<AssumptionAnalysis>(F);
- bool Changed = runImpl(F, DT, &TLI, &AC);
+ bool Changed = runImpl(F, &DT, &TLI, &AC);
if (!Changed)
return PreservedAnalyses::all();
// FIXME: This should also 'preserve the CFG'.
diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index c298695..8eaeb10 100644
--- a/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -56,6 +56,38 @@ static bool ignoreCallingConv(LibFunc::Func Func) {
Func == LibFunc::llabs || Func == LibFunc::strlen;
}
+static bool isCallingConvCCompatible(CallInst *CI) {
+ switch(CI->getCallingConv()) {
+ default:
+ return false;
+ case llvm::CallingConv::C:
+ return true;
+ case llvm::CallingConv::ARM_APCS:
+ case llvm::CallingConv::ARM_AAPCS:
+ case llvm::CallingConv::ARM_AAPCS_VFP: {
+
+ // The iOS ABI diverges from the standard in some cases, so for now don't
+ // try to simplify those calls.
+ if (Triple(CI->getModule()->getTargetTriple()).isiOS())
+ return false;
+
+ auto *FuncTy = CI->getFunctionType();
+
+ if (!FuncTy->getReturnType()->isPointerTy() &&
+ !FuncTy->getReturnType()->isIntegerTy() &&
+ !FuncTy->getReturnType()->isVoidTy())
+ return false;
+
+ for (auto Param : FuncTy->params()) {
+ if (!Param->isPointerTy() && !Param->isIntegerTy())
+ return false;
+ }
+ return true;
+ }
+ }
+ return false;
+}
+
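
The shape of the check is simple enough to restate outside LLVM: a libcall is only considered when the return type and every parameter are integer- or pointer-like, since those are the cases where the ARM AAPCS variants pass values the same way plain C would. A toy model with hypothetical types and names, not the real predicate:

    #include <string>
    #include <vector>

    struct ToySignature {
      std::string Ret;                 // "int", "ptr", "void", "float", ...
      std::vector<std::string> Params;
    };

    bool looksCCompatible(const ToySignature &Sig) {
      auto IntOrPtr = [](const std::string &T) { return T == "int" || T == "ptr"; };
      if (Sig.Ret != "void" && !IntOrPtr(Sig.Ret))
        return false;
      for (const std::string &P : Sig.Params)
        if (!IntOrPtr(P))
          return false;
      return true;
    }
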
/// Return true if it only matters that the value is equal or not-equal to zero.
static bool isOnlyUsedInZeroEqualityComparison(Value *V) {
for (User *U : V->users()) {
@@ -83,7 +115,7 @@ static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) {
}
static bool callHasFloatingPointArgument(const CallInst *CI) {
- return std::any_of(CI->op_begin(), CI->op_end(), [](const Use &OI) {
+ return any_of(CI->operands(), [](const Use &OI) {
return OI->getType()->isFloatingPointTy();
});
}
@@ -868,7 +900,7 @@ static Value *valueHasFloatPrecision(Value *Val) {
if (ConstantFP *Const = dyn_cast<ConstantFP>(Val)) {
APFloat F = Const->getValueAPF();
bool losesInfo;
- (void)F.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven,
+ (void)F.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
&losesInfo);
if (!losesInfo)
return ConstantFP::get(Const->getContext(), F);
@@ -993,16 +1025,20 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
Ret = optimizeUnaryDoubleFP(CI, B, true);
Value *Op1 = CI->getArgOperand(0), *Op2 = CI->getArgOperand(1);
+
+ // pow(1.0, x) -> 1.0
+ if (match(Op1, m_SpecificFP(1.0)))
+ return Op1;
+ // pow(2.0, x) -> llvm.exp2(x)
+ if (match(Op1, m_SpecificFP(2.0))) {
+ Value *Exp2 = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::exp2,
+ CI->getType());
+ return B.CreateCall(Exp2, Op2, "exp2");
+ }
+
+ // There's no llvm.exp10 intrinsic yet, but, maybe, some day there will
+ // be one.
if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) {
- // pow(1.0, x) -> 1.0
- if (Op1C->isExactlyValue(1.0))
- return Op1C;
- // pow(2.0, x) -> exp2(x)
- if (Op1C->isExactlyValue(2.0) &&
- hasUnaryFloatFn(TLI, Op1->getType(), LibFunc::exp2, LibFunc::exp2f,
- LibFunc::exp2l))
- return emitUnaryFloatFnCall(Op2, TLI->getName(LibFunc::exp2), B,
- Callee->getAttributes());
// pow(10.0, x) -> exp10(x)
if (Op1C->isExactlyValue(10.0) &&
hasUnaryFloatFn(TLI, Op1->getType(), LibFunc::exp10, LibFunc::exp10f,
@@ -1038,6 +1074,24 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
if (Op2C->getValueAPF().isZero()) // pow(x, 0.0) -> 1.0
return ConstantFP::get(CI->getType(), 1.0);
+ if (Op2C->isExactlyValue(-0.5) &&
+ hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::sqrt, LibFunc::sqrtf,
+ LibFunc::sqrtl)) {
+ // If -ffast-math:
+ // pow(x, -0.5) -> 1.0 / sqrt(x)
+ if (CI->hasUnsafeAlgebra()) {
+ IRBuilder<>::FastMathFlagGuard Guard(B);
+ B.setFastMathFlags(CI->getFastMathFlags());
+
+ // Here we cannot lower to an intrinsic because C99 sqrt() and llvm.sqrt
+ // are not guaranteed to have the same semantics.
+ Value *Sqrt = emitUnaryFloatFnCall(Op1, TLI->getName(LibFunc::sqrt), B,
+ Callee->getAttributes());
+
+ return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), Sqrt, "sqrtrecip");
+ }
+ }
+
if (Op2C->isExactlyValue(0.5) &&
hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::sqrt, LibFunc::sqrtf,
LibFunc::sqrtl) &&
@@ -1048,6 +1102,9 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
if (CI->hasUnsafeAlgebra()) {
IRBuilder<>::FastMathFlagGuard Guard(B);
B.setFastMathFlags(CI->getFastMathFlags());
+
+ // Unlike other math intrinsics, sqrt has different semantics
+ // from the libc function. See LangRef for details.
return emitUnaryFloatFnCall(Op1, TLI->getName(LibFunc::sqrt), B,
Callee->getAttributes());
}
@@ -1082,6 +1139,10 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
!V.isInteger())
return nullptr;
+ // Propagate fast math flags.
+ IRBuilder<>::FastMathFlagGuard Guard(B);
+ B.setFastMathFlags(CI->getFastMathFlags());
+
// We will memoize intermediate products of the Addition Chain.
Value *InnerChain[33] = {nullptr};
InnerChain[1] = Op1;
@@ -1090,9 +1151,8 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
// We cannot readily convert a non-double type (like float) to a double.
// So we first convert V to something which could be converted to double.
bool ignored;
- V.convert(APFloat::IEEEdouble, APFloat::rmTowardZero, &ignored);
+ V.convert(APFloat::IEEEdouble(), APFloat::rmTowardZero, &ignored);
- // TODO: Should the new instructions propagate the 'fast' flag of the pow()?
Value *FMul = getPow(InnerChain, V.convertToDouble(), B);
// For negative exponents simply compute the reciprocal.
if (Op2C->isNegative())
@@ -1150,19 +1210,11 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) {
Value *LibCallSimplifier::optimizeFabs(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
- Value *Ret = nullptr;
StringRef Name = Callee->getName();
if (Name == "fabs" && hasFloatVersion(Name))
- Ret = optimizeUnaryDoubleFP(CI, B, false);
+ return optimizeUnaryDoubleFP(CI, B, false);
- Value *Op = CI->getArgOperand(0);
- if (Instruction *I = dyn_cast<Instruction>(Op)) {
- // Fold fabs(x * x) -> x * x; any squared FP value must already be positive.
- if (I->getOpcode() == Instruction::FMul)
- if (I->getOperand(0) == I->getOperand(1))
- return Op;
- }
- return Ret;
+ return nullptr;
}
Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilder<> &B) {
@@ -1428,6 +1480,12 @@ Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, IRBuilder<> &B) {
Value *Sin, *Cos, *SinCos;
insertSinCosCall(B, CI->getCalledFunction(), Arg, IsFloat, Sin, Cos, SinCos);
+ auto replaceTrigInsts = [this](SmallVectorImpl<CallInst *> &Calls,
+ Value *Res) {
+ for (CallInst *C : Calls)
+ replaceAllUsesWith(C, Res);
+ };
+
replaceTrigInsts(SinCalls, Sin);
replaceTrigInsts(CosCalls, Cos);
replaceTrigInsts(SinCosCalls, SinCos);
@@ -1472,32 +1530,16 @@ void LibCallSimplifier::classifyArgUse(
}
}
-void LibCallSimplifier::replaceTrigInsts(SmallVectorImpl<CallInst *> &Calls,
- Value *Res) {
- for (CallInst *C : Calls)
- replaceAllUsesWith(C, Res);
-}
-
//===----------------------------------------------------------------------===//
// Integer Library Call Optimizations
//===----------------------------------------------------------------------===//
Value *LibCallSimplifier::optimizeFFS(CallInst *CI, IRBuilder<> &B) {
- Function *Callee = CI->getCalledFunction();
- Value *Op = CI->getArgOperand(0);
-
- // Constant fold.
- if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
- if (CI->isZero()) // ffs(0) -> 0.
- return B.getInt32(0);
- // ffs(c) -> cttz(c)+1
- return B.getInt32(CI->getValue().countTrailingZeros() + 1);
- }
-
// ffs(x) -> x != 0 ? (i32)llvm.cttz(x)+1 : 0
+ Value *Op = CI->getArgOperand(0);
Type *ArgType = Op->getType();
- Value *F =
- Intrinsic::getDeclaration(Callee->getParent(), Intrinsic::cttz, ArgType);
+ Value *F = Intrinsic::getDeclaration(CI->getCalledFunction()->getParent(),
+ Intrinsic::cttz, ArgType);
Value *V = B.CreateCall(F, {Op, B.getTrue()}, "cttz");
V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1));
V = B.CreateIntCast(V, B.getInt32Ty(), false);
@@ -1506,6 +1548,18 @@ Value *LibCallSimplifier::optimizeFFS(CallInst *CI, IRBuilder<> &B) {
return B.CreateSelect(Cond, V, B.getInt32(0));
}
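The rewrite above turns ffs(x) into a cttz-based select. A minimal standalone sketch of the same equivalence, assuming a GCC/Clang-style __builtin_ctz (hypothetical helper, not LLVM code):

#include <cassert>
#include <cstdint>

// ffs(x) -> x != 0 ? cttz(x) + 1 : 0
static int ffsViaCttz(uint32_t X) {
  return X != 0 ? __builtin_ctz(X) + 1 : 0;
}

int main() {
  assert(ffsViaCttz(0) == 0);   // ffs(0) is defined to be 0
  assert(ffsViaCttz(12) == 3);  // 12 = 0b1100, lowest set bit is bit 2 -> 1-based index 3
  assert(ffsViaCttz(1) == 1);
}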
+Value *LibCallSimplifier::optimizeFls(CallInst *CI, IRBuilder<> &B) {
+ // fls(x) -> (i32)(sizeInBits(x) - llvm.ctlz(x, false))
+ Value *Op = CI->getArgOperand(0);
+ Type *ArgType = Op->getType();
+ Value *F = Intrinsic::getDeclaration(CI->getCalledFunction()->getParent(),
+ Intrinsic::ctlz, ArgType);
+ Value *V = B.CreateCall(F, {Op, B.getFalse()}, "ctlz");
+ V = B.CreateSub(ConstantInt::get(V->getType(), ArgType->getIntegerBitWidth()),
+ V);
+ return B.CreateIntCast(V, CI->getType(), false);
+}
+
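Likewise, a standalone sketch of the fls lowering above, assuming __builtin_clz for the ctlz part (hypothetical helper; the zero case is special-cased here because __builtin_clz(0) is undefined, whereas the emitted ctlz intrinsic with is_zero_undef=false returns the bit width):

#include <cassert>
#include <cstdint>

// fls(x) -> sizeInBits(x) - ctlz(x, /*is_zero_undef=*/false)
static int flsViaCtlz(uint32_t X) {
  return X == 0 ? 0 : 32 - __builtin_clz(X);
}

int main() {
  assert(flsViaCtlz(0) == 0);
  assert(flsViaCtlz(1) == 1);            // highest set bit is bit 0 -> 1-based index 1
  assert(flsViaCtlz(0x80000000u) == 32);
}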
Value *LibCallSimplifier::optimizeAbs(CallInst *CI, IRBuilder<> &B) {
// abs(x) -> x >s -1 ? x : -x
Value *Op = CI->getArgOperand(0);
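A worked version of the select-based abs lowering noted above (hedged sketch; as with the libcall itself, the INT_MIN case remains undefined):

#include <cassert>

// abs(x) -> x >s -1 ? x : -x
static int absViaSelect(int X) { return X > -1 ? X : -X; }

int main() {
  assert(absViaSelect(-5) == 5);
  assert(absViaSelect(0) == 0);
  assert(absViaSelect(7) == 7);
}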
@@ -1891,7 +1945,7 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI,
if (TLI->getLibFunc(*Callee, Func) && TLI->has(Func)) {
// Make sure we never change the calling convention.
assert((ignoreCallingConv(Func) ||
- CI->getCallingConv() == llvm::CallingConv::C) &&
+ isCallingConvCCompatible(CI)) &&
"Optimizing string/memory libcall would change the calling convention");
switch (Func) {
case LibFunc::strcat:
@@ -1958,7 +2012,7 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
SmallVector<OperandBundleDef, 2> OpBundles;
CI->getOperandBundlesAsDefs(OpBundles);
IRBuilder<> Builder(CI, /*FPMathTag=*/nullptr, OpBundles);
- bool isCallingConvC = CI->getCallingConv() == llvm::CallingConv::C;
+ bool isCallingConvC = isCallingConvCCompatible(CI);
// Command-line parameter overrides instruction attribute.
if (EnableUnsafeFPShrink.getNumOccurrences() > 0)
@@ -2042,6 +2096,10 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
case LibFunc::ffsl:
case LibFunc::ffsll:
return optimizeFFS(CI, Builder);
+ case LibFunc::fls:
+ case LibFunc::flsl:
+ case LibFunc::flsll:
+ return optimizeFls(CI, Builder);
case LibFunc::abs:
case LibFunc::labs:
case LibFunc::llabs:
@@ -2314,7 +2372,7 @@ Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI) {
SmallVector<OperandBundleDef, 2> OpBundles;
CI->getOperandBundlesAsDefs(OpBundles);
IRBuilder<> Builder(CI, /*FPMathTag=*/nullptr, OpBundles);
- bool isCallingConvC = CI->getCallingConv() == llvm::CallingConv::C;
+ bool isCallingConvC = isCallingConvCCompatible(CI);
// First, check that this is a known library function and that the prototype
// is correct.
diff --git a/contrib/llvm/lib/Transforms/Utils/StripGCRelocates.cpp b/contrib/llvm/lib/Transforms/Utils/StripGCRelocates.cpp
new file mode 100644
index 0000000..f3d3fad
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/StripGCRelocates.cpp
@@ -0,0 +1,80 @@
+//===- StripGCRelocates.cpp - Remove gc.relocates inserted by RewriteStatePoints===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a little utility pass that removes the gc.relocates inserted by
+// RewriteStatepointsForGC. Note that the generated IR is incorrect,
+// but this is useful as a single pass in itself, for analysis of IR, without
+// the gc.relocates. The statepoint and gc.result intrinsics would still be
+// present.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+struct StripGCRelocates : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ StripGCRelocates() : FunctionPass(ID) {
+ initializeStripGCRelocatesPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &Info) const override {}
+
+ bool runOnFunction(Function &F) override;
+
+};
+char StripGCRelocates::ID = 0;
+}
+
+bool StripGCRelocates::runOnFunction(Function &F) {
+ // Nothing to do for declarations.
+ if (F.isDeclaration())
+ return false;
+ SmallVector<GCRelocateInst *, 20> GCRelocates;
+ // TODO: We currently do not handle gc.relocates that are in landing pads,
+ // i.e. not bound to a single statepoint token.
+ for (Instruction &I : instructions(F)) {
+ if (auto *GCR = dyn_cast<GCRelocateInst>(&I))
+ if (isStatepoint(GCR->getOperand(0)))
+ GCRelocates.push_back(GCR);
+ }
+ // All gc.relocates are bound to a single statepoint token. The order of
+ // visiting gc.relocates for deletion does not matter.
+ for (GCRelocateInst *GCRel : GCRelocates) {
+ Value *OrigPtr = GCRel->getDerivedPtr();
+ Value *ReplaceGCRel = OrigPtr;
+
+    // All gc.relocates are i8 addrspace(1)* typed, so we need a bitcast from i8
+    // addrspace(1)* to the type of the OrigPtr, if they are not the same.
+ if (GCRel->getType() != OrigPtr->getType())
+ ReplaceGCRel = new BitCastInst(OrigPtr, GCRel->getType(), "cast", GCRel);
+
+    // Replace all uses of the gc.relocate and delete it.
+    // There may be unnecessary bitcasts back to the OrigPtr type; an instcombine
+    // pass would clean these up.
+ GCRel->replaceAllUsesWith(ReplaceGCRel);
+ GCRel->eraseFromParent();
+ }
+ return !GCRelocates.empty();
+}
+
+INITIALIZE_PASS(StripGCRelocates, "strip-gc-relocates",
+ "Strip gc.relocates inserted through RewriteStatepointsForGC",
+ true, false)
+FunctionPass *llvm::createStripGCRelocatesPass() {
+ return new StripGCRelocates();
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp b/contrib/llvm/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp
new file mode 100644
index 0000000..66dbf33
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp
@@ -0,0 +1,42 @@
+//===- StripNonLineTableDebugInfo.cpp -- Strip parts of Debug Info --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/Pass.h"
+using namespace llvm;
+
+namespace {
+
+/// This pass strips all debug info that is not related to line tables.
+/// The result will be the same as if the program were compiled with
+/// -gline-tables-only.
+struct StripNonLineTableDebugInfo : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+ StripNonLineTableDebugInfo() : ModulePass(ID) {
+ initializeStripNonLineTableDebugInfoPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+
+ bool runOnModule(Module &M) override {
+ return llvm::stripNonLineTableDebugInfo(M);
+ }
+};
+}
+
+char StripNonLineTableDebugInfo::ID = 0;
+INITIALIZE_PASS(StripNonLineTableDebugInfo, "strip-nonlinetable-debuginfo",
+ "Strip all debug info except linetables", false, false)
+
+ModulePass *llvm::createStripNonLineTableDebugInfoPass() {
+ return new StripNonLineTableDebugInfo();
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/SymbolRewriter.cpp b/contrib/llvm/lib/Transforms/Utils/SymbolRewriter.cpp
index 7523ca5..6d13663 100644
--- a/contrib/llvm/lib/Transforms/Utils/SymbolRewriter.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/SymbolRewriter.cpp
@@ -58,6 +58,7 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "symbol-rewriter"
+#include "llvm/Transforms/Utils/SymbolRewriter.h"
#include "llvm/Pass.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/IR/LegacyPassManager.h"
@@ -68,7 +69,6 @@
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/YAMLParser.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/SymbolRewriter.h"
using namespace llvm;
using namespace SymbolRewriter;
@@ -361,9 +361,11 @@ parseRewriteFunctionDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
// TODO see if there is a more elegant solution to selecting the rewrite
// descriptor type
if (!Target.empty())
- DL->push_back(new ExplicitRewriteFunctionDescriptor(Source, Target, Naked));
+ DL->push_back(llvm::make_unique<ExplicitRewriteFunctionDescriptor>(
+ Source, Target, Naked));
else
- DL->push_back(new PatternRewriteFunctionDescriptor(Source, Transform));
+ DL->push_back(
+ llvm::make_unique<PatternRewriteFunctionDescriptor>(Source, Transform));
return true;
}
@@ -421,11 +423,12 @@ parseRewriteGlobalVariableDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
}
if (!Target.empty())
- DL->push_back(new ExplicitRewriteGlobalVariableDescriptor(Source, Target,
- /*Naked*/false));
+ DL->push_back(llvm::make_unique<ExplicitRewriteGlobalVariableDescriptor>(
+ Source, Target,
+ /*Naked*/ false));
else
- DL->push_back(new PatternRewriteGlobalVariableDescriptor(Source,
- Transform));
+ DL->push_back(llvm::make_unique<PatternRewriteGlobalVariableDescriptor>(
+ Source, Transform));
return true;
}
@@ -483,67 +486,80 @@ parseRewriteGlobalAliasDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
}
if (!Target.empty())
- DL->push_back(new ExplicitRewriteNamedAliasDescriptor(Source, Target,
- /*Naked*/false));
+ DL->push_back(llvm::make_unique<ExplicitRewriteNamedAliasDescriptor>(
+ Source, Target,
+ /*Naked*/ false));
else
- DL->push_back(new PatternRewriteNamedAliasDescriptor(Source, Transform));
+ DL->push_back(llvm::make_unique<PatternRewriteNamedAliasDescriptor>(
+ Source, Transform));
return true;
}
namespace {
-class RewriteSymbols : public ModulePass {
+class RewriteSymbolsLegacyPass : public ModulePass {
public:
static char ID; // Pass identification, replacement for typeid
- RewriteSymbols();
- RewriteSymbols(SymbolRewriter::RewriteDescriptorList &DL);
+ RewriteSymbolsLegacyPass();
+ RewriteSymbolsLegacyPass(SymbolRewriter::RewriteDescriptorList &DL);
bool runOnModule(Module &M) override;
private:
- void loadAndParseMapFiles();
-
- SymbolRewriter::RewriteDescriptorList Descriptors;
+ RewriteSymbolPass Impl;
};
-char RewriteSymbols::ID = 0;
+char RewriteSymbolsLegacyPass::ID = 0;
-RewriteSymbols::RewriteSymbols() : ModulePass(ID) {
- initializeRewriteSymbolsPass(*PassRegistry::getPassRegistry());
- loadAndParseMapFiles();
+RewriteSymbolsLegacyPass::RewriteSymbolsLegacyPass() : ModulePass(ID), Impl() {
+ initializeRewriteSymbolsLegacyPassPass(*PassRegistry::getPassRegistry());
}
-RewriteSymbols::RewriteSymbols(SymbolRewriter::RewriteDescriptorList &DL)
- : ModulePass(ID) {
- Descriptors.splice(Descriptors.begin(), DL);
+RewriteSymbolsLegacyPass::RewriteSymbolsLegacyPass(
+ SymbolRewriter::RewriteDescriptorList &DL)
+ : ModulePass(ID), Impl(DL) {}
+
+bool RewriteSymbolsLegacyPass::runOnModule(Module &M) {
+ return Impl.runImpl(M);
+}
}
-bool RewriteSymbols::runOnModule(Module &M) {
+namespace llvm {
+PreservedAnalyses RewriteSymbolPass::run(Module &M, ModuleAnalysisManager &AM) {
+ if (!runImpl(M))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+bool RewriteSymbolPass::runImpl(Module &M) {
bool Changed;
Changed = false;
for (auto &Descriptor : Descriptors)
- Changed |= Descriptor.performOnModule(M);
+ Changed |= Descriptor->performOnModule(M);
return Changed;
}
-void RewriteSymbols::loadAndParseMapFiles() {
+void RewriteSymbolPass::loadAndParseMapFiles() {
const std::vector<std::string> MapFiles(RewriteMapFiles);
- SymbolRewriter::RewriteMapParser parser;
+ SymbolRewriter::RewriteMapParser Parser;
for (const auto &MapFile : MapFiles)
- parser.parse(MapFile, &Descriptors);
+ Parser.parse(MapFile, &Descriptors);
}
}
-INITIALIZE_PASS(RewriteSymbols, "rewrite-symbols", "Rewrite Symbols", false,
- false)
+INITIALIZE_PASS(RewriteSymbolsLegacyPass, "rewrite-symbols", "Rewrite Symbols",
+ false, false)
-ModulePass *llvm::createRewriteSymbolsPass() { return new RewriteSymbols(); }
+ModulePass *llvm::createRewriteSymbolsPass() {
+ return new RewriteSymbolsLegacyPass();
+}
ModulePass *
llvm::createRewriteSymbolsPass(SymbolRewriter::RewriteDescriptorList &DL) {
- return new RewriteSymbols(DL);
+ return new RewriteSymbolsLegacyPass(DL);
}
diff --git a/contrib/llvm/lib/Transforms/Utils/Utils.cpp b/contrib/llvm/lib/Transforms/Utils/Utils.cpp
index 8f85f19..7b9de2e 100644
--- a/contrib/llvm/lib/Transforms/Utils/Utils.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/Utils.cpp
@@ -25,16 +25,19 @@ void llvm::initializeTransformUtils(PassRegistry &Registry) {
initializeBreakCriticalEdgesPass(Registry);
initializeInstNamerPass(Registry);
initializeLCSSAWrapperPassPass(Registry);
+ initializeLibCallsShrinkWrapLegacyPassPass(Registry);
initializeLoopSimplifyPass(Registry);
- initializeLowerInvokePass(Registry);
+ initializeLowerInvokeLegacyPassPass(Registry);
initializeLowerSwitchPass(Registry);
- initializeNameAnonFunctionPass(Registry);
+ initializeNameAnonGlobalLegacyPassPass(Registry);
initializePromoteLegacyPassPass(Registry);
+ initializeStripNonLineTableDebugInfoPass(Registry);
initializeUnifyFunctionExitNodesPass(Registry);
initializeInstSimplifierPass(Registry);
initializeMetaRenamerPass(Registry);
initializeMemorySSAWrapperPassPass(Registry);
initializeMemorySSAPrinterLegacyPassPass(Registry);
+ initializeStripGCRelocatesPass(Registry);
}
/// LLVMInitializeTransformUtils - C binding for initializeTransformUtilsPasses.
diff --git a/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp b/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp
index 2eade8c..0e9baaf 100644
--- a/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp
@@ -38,15 +38,6 @@ struct DelayedBasicBlock {
BasicBlock *OldBB;
std::unique_ptr<BasicBlock> TempBB;
- // Explicit move for MSVC.
- DelayedBasicBlock(DelayedBasicBlock &&X)
- : OldBB(std::move(X.OldBB)), TempBB(std::move(X.TempBB)) {}
- DelayedBasicBlock &operator=(DelayedBasicBlock &&X) {
- OldBB = std::move(X.OldBB);
- TempBB = std::move(X.TempBB);
- return *this;
- }
-
DelayedBasicBlock(const BlockAddress &Old)
: OldBB(Old.getBasicBlock()),
TempBB(BasicBlock::Create(Old.getContext())) {}
@@ -184,17 +175,6 @@ class MDNodeMapper {
bool HasChanged = false;
unsigned ID = ~0u;
TempMDNode Placeholder;
-
- Data() {}
- Data(Data &&X)
- : HasChanged(std::move(X.HasChanged)), ID(std::move(X.ID)),
- Placeholder(std::move(X.Placeholder)) {}
- Data &operator=(Data &&X) {
- HasChanged = std::move(X.HasChanged);
- ID = std::move(X.ID);
- Placeholder = std::move(X.Placeholder);
- return *this;
- }
};
/// A graph of uniqued nodes.
@@ -671,7 +651,7 @@ void MDNodeMapper::UniquedGraph::propagateChanges() {
if (D.HasChanged)
continue;
- if (!llvm::any_of(N->operands(), [&](const Metadata *Op) {
+ if (none_of(N->operands(), [&](const Metadata *Op) {
auto Where = Info.find(Op);
return Where != Info.end() && Where->second.HasChanged;
}))
diff --git a/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp
index af594cb..c01740b 100644
--- a/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -3148,7 +3148,7 @@ namespace {
LLVMContext::MD_noalias, LLVMContext::MD_fpmath,
LLVMContext::MD_invariant_group};
combineMetadata(K, H, KnownIDs);
- K->intersectOptionalDataWith(H);
+ K->andIRFlags(H);
for (unsigned o = 0; o < NumOperands; ++o)
K->setOperand(o, ReplacedOperands[o]);
diff --git a/contrib/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/contrib/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index c8906bd..c44a393 100644
--- a/contrib/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/contrib/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -15,6 +15,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/OrderedBasicBlock.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -30,6 +31,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Vectorize.h"
using namespace llvm;
@@ -40,13 +42,12 @@ STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
namespace {
-// TODO: Remove this
-static const unsigned TargetBaseAlign = 4;
+// FIXME: Assuming stack alignment of 4 is always good enough
+static const unsigned StackAdjustedAlignment = 4;
+typedef SmallVector<Instruction *, 8> InstrList;
+typedef MapVector<Value *, InstrList> InstrListMap;
class Vectorizer {
- typedef SmallVector<Value *, 8> ValueList;
- typedef MapVector<Value *, ValueList> ValueListMap;
-
Function &F;
AliasAnalysis &AA;
DominatorTree &DT;
@@ -54,8 +55,6 @@ class Vectorizer {
TargetTransformInfo &TTI;
const DataLayout &DL;
IRBuilder<> Builder;
- ValueListMap StoreRefs;
- ValueListMap LoadRefs;
public:
Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT,
@@ -94,45 +93,47 @@ private:
/// Returns the first and the last instructions in Chain.
std::pair<BasicBlock::iterator, BasicBlock::iterator>
- getBoundaryInstrs(ArrayRef<Value *> Chain);
+ getBoundaryInstrs(ArrayRef<Instruction *> Chain);
/// Erases the original instructions after vectorizing.
- void eraseInstructions(ArrayRef<Value *> Chain);
+ void eraseInstructions(ArrayRef<Instruction *> Chain);
/// "Legalize" the vector type that would be produced by combining \p
/// ElementSizeBits elements in \p Chain. Break into two pieces such that the
/// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
/// expected to have more than 4 elements.
- std::pair<ArrayRef<Value *>, ArrayRef<Value *>>
- splitOddVectorElts(ArrayRef<Value *> Chain, unsigned ElementSizeBits);
+ std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
+ splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);
- /// Checks for instructions which may affect the memory accessed
- /// in the chain between \p From and \p To. Returns Index, where
- /// \p Chain[0, Index) is the largest vectorizable chain prefix.
- /// The elements of \p Chain should be all loads or all stores.
- unsigned getVectorizablePrefixEndIdx(ArrayRef<Value *> Chain,
- BasicBlock::iterator From,
- BasicBlock::iterator To);
+ /// Finds the largest prefix of Chain that's vectorizable, checking for
+ /// intervening instructions which may affect the memory accessed by the
+ /// instructions within Chain.
+ ///
+ /// The elements of \p Chain must be all loads or all stores and must be in
+ /// address order.
+ ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);
/// Collects load and store instructions to vectorize.
- void collectInstructions(BasicBlock *BB);
+ std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);
- /// Processes the collected instructions, the \p Map. The elements of \p Map
+ /// Processes the collected instructions, the \p Map. The values of \p Map
/// should be all loads or all stores.
- bool vectorizeChains(ValueListMap &Map);
+ bool vectorizeChains(InstrListMap &Map);
/// Finds the load/stores to consecutive memory addresses and vectorizes them.
- bool vectorizeInstructions(ArrayRef<Value *> Instrs);
+ bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);
/// Vectorizes the load instructions in Chain.
- bool vectorizeLoadChain(ArrayRef<Value *> Chain,
- SmallPtrSet<Value *, 16> *InstructionsProcessed);
+ bool
+ vectorizeLoadChain(ArrayRef<Instruction *> Chain,
+ SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
/// Vectorizes the store instructions in Chain.
- bool vectorizeStoreChain(ArrayRef<Value *> Chain,
- SmallPtrSet<Value *, 16> *InstructionsProcessed);
+ bool
+ vectorizeStoreChain(ArrayRef<Instruction *> Chain,
+ SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
- /// Check if this load/store access is misaligned accesses
+  /// Check if this load/store access is misaligned.
bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
unsigned Alignment);
};
@@ -147,7 +148,7 @@ public:
bool runOnFunction(Function &F) override;
- const char *getPassName() const override {
+ StringRef getPassName() const override {
return "GPU Load and Store Vectorizer";
}
@@ -177,6 +178,13 @@ Pass *llvm::createLoadStoreVectorizerPass() {
return new LoadStoreVectorizer();
}
+// The real propagateMetadata expects a SmallVector<Value*>, but we deal in
+// vectors of Instructions.
+static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
+ SmallVector<Value *, 8> VL(IL.begin(), IL.end());
+ propagateMetadata(I, VL);
+}
+
bool LoadStoreVectorizer::runOnFunction(Function &F) {
// Don't vectorize when the attribute NoImplicitFloat is used.
if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
@@ -198,7 +206,8 @@ bool Vectorizer::run() {
// Scan the blocks in the function in post order.
for (BasicBlock *BB : post_order(&F)) {
- collectInstructions(BB);
+ InstrListMap LoadRefs, StoreRefs;
+ std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
Changed |= vectorizeChains(LoadRefs);
Changed |= vectorizeChains(StoreRefs);
}
@@ -338,6 +347,7 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
}
void Vectorizer::reorder(Instruction *I) {
+ OrderedBasicBlock OBB(I->getParent());
SmallPtrSet<Instruction *, 16> InstructionsToMove;
SmallVector<Instruction *, 16> Worklist;
@@ -350,11 +360,14 @@ void Vectorizer::reorder(Instruction *I) {
if (!IM || IM->getOpcode() == Instruction::PHI)
continue;
- if (!DT.dominates(IM, I)) {
+ // If IM is in another BB, no need to move it, because this pass only
+ // vectorizes instructions within one BB.
+ if (IM->getParent() != I->getParent())
+ continue;
+
+ if (!OBB.dominates(IM, I)) {
InstructionsToMove.insert(IM);
Worklist.push_back(IM);
- assert(IM->getParent() == IW->getParent() &&
- "Instructions to move should be in the same basic block");
}
}
}
@@ -362,7 +375,7 @@ void Vectorizer::reorder(Instruction *I) {
// All instructions to move should follow I. Start from I, not from begin().
for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
++BBI) {
- if (!is_contained(InstructionsToMove, &*BBI))
+ if (!InstructionsToMove.count(&*BBI))
continue;
Instruction *IM = &*BBI;
--BBI;
@@ -372,8 +385,8 @@ void Vectorizer::reorder(Instruction *I) {
}
std::pair<BasicBlock::iterator, BasicBlock::iterator>
-Vectorizer::getBoundaryInstrs(ArrayRef<Value *> Chain) {
- Instruction *C0 = cast<Instruction>(Chain[0]);
+Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
+ Instruction *C0 = Chain[0];
BasicBlock::iterator FirstInstr = C0->getIterator();
BasicBlock::iterator LastInstr = C0->getIterator();
@@ -397,105 +410,152 @@ Vectorizer::getBoundaryInstrs(ArrayRef<Value *> Chain) {
return std::make_pair(FirstInstr, ++LastInstr);
}
-void Vectorizer::eraseInstructions(ArrayRef<Value *> Chain) {
+void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
SmallVector<Instruction *, 16> Instrs;
- for (Value *V : Chain) {
- Value *PtrOperand = getPointerOperand(V);
+ for (Instruction *I : Chain) {
+ Value *PtrOperand = getPointerOperand(I);
assert(PtrOperand && "Instruction must have a pointer operand.");
- Instrs.push_back(cast<Instruction>(V));
+ Instrs.push_back(I);
if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
Instrs.push_back(GEP);
}
// Erase instructions.
- for (Value *V : Instrs) {
- Instruction *Instr = cast<Instruction>(V);
- if (Instr->use_empty())
- Instr->eraseFromParent();
- }
+ for (Instruction *I : Instrs)
+ if (I->use_empty())
+ I->eraseFromParent();
}
-std::pair<ArrayRef<Value *>, ArrayRef<Value *>>
-Vectorizer::splitOddVectorElts(ArrayRef<Value *> Chain,
+std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
+Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
unsigned ElementSizeBits) {
- unsigned ElemSizeInBytes = ElementSizeBits / 8;
- unsigned SizeInBytes = ElemSizeInBytes * Chain.size();
- unsigned NumRight = (SizeInBytes % 4) / ElemSizeInBytes;
- unsigned NumLeft = Chain.size() - NumRight;
+ unsigned ElementSizeBytes = ElementSizeBits / 8;
+ unsigned SizeBytes = ElementSizeBytes * Chain.size();
+ unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
+ if (NumLeft == Chain.size())
+ --NumLeft;
+ else if (NumLeft == 0)
+ NumLeft = 1;
return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
}
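The split arithmetic above can be checked in isolation. A hedged standalone sketch with the DataLayout plumbing stripped out (hypothetical helper; the element/chain sizes are picked purely for illustration):

#include <cassert>
#include <utility>

static std::pair<unsigned, unsigned> splitSizes(unsigned ElementSizeBytes,
                                                unsigned NumElems) {
  unsigned SizeBytes = ElementSizeBytes * NumElems;
  unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
  if (NumLeft == NumElems)
    --NumLeft;        // whole chain is already a 4B multiple: still split one element off
  else if (NumLeft == 0)
    NumLeft = 1;      // chain smaller than 4B: keep at least one element on the left
  return {NumLeft, NumElems - NumLeft};
}

int main() {
  assert(splitSizes(2, 9) == std::make_pair(8u, 1u)); // 18B -> 16B + 2B
  assert(splitSizes(1, 2) == std::make_pair(1u, 1u)); // 2B chain, forced minimal split
  assert(splitSizes(4, 4) == std::make_pair(3u, 1u)); // 16B chain, split anyway
}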
-unsigned Vectorizer::getVectorizablePrefixEndIdx(ArrayRef<Value *> Chain,
- BasicBlock::iterator From,
- BasicBlock::iterator To) {
- SmallVector<std::pair<Value *, unsigned>, 16> MemoryInstrs;
- SmallVector<std::pair<Value *, unsigned>, 16> ChainInstrs;
+ArrayRef<Instruction *>
+Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
+ // These are in BB order, unlike Chain, which is in address order.
+ SmallVector<Instruction *, 16> MemoryInstrs;
+ SmallVector<Instruction *, 16> ChainInstrs;
+
+ bool IsLoadChain = isa<LoadInst>(Chain[0]);
+ DEBUG({
+ for (Instruction *I : Chain) {
+ if (IsLoadChain)
+ assert(isa<LoadInst>(I) &&
+ "All elements of Chain must be loads, or all must be stores.");
+ else
+ assert(isa<StoreInst>(I) &&
+ "All elements of Chain must be loads, or all must be stores.");
+ }
+ });
- unsigned InstrIdx = 0;
- for (auto I = From; I != To; ++I, ++InstrIdx) {
+ for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
- if (!is_contained(Chain, &*I))
- MemoryInstrs.push_back({&*I, InstrIdx});
+ if (!is_contained(Chain, &I))
+ MemoryInstrs.push_back(&I);
else
- ChainInstrs.push_back({&*I, InstrIdx});
- } else if (I->mayHaveSideEffects()) {
- DEBUG(dbgs() << "LSV: Found side-effecting operation: " << *I << '\n');
- return 0;
+ ChainInstrs.push_back(&I);
+ } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
+ DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I << '\n');
+ break;
+ } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
+ DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
+ << '\n');
+ break;
}
}
- assert(Chain.size() == ChainInstrs.size() &&
- "All instructions in the Chain must exist in [From, To).");
+ OrderedBasicBlock OBB(Chain[0]->getParent());
- unsigned ChainIdx = 0;
- for (auto EntryChain : ChainInstrs) {
- Value *ChainInstrValue = EntryChain.first;
- unsigned ChainInstrIdx = EntryChain.second;
- for (auto EntryMem : MemoryInstrs) {
- Value *MemInstrValue = EntryMem.first;
- unsigned MemInstrIdx = EntryMem.second;
- if (isa<LoadInst>(MemInstrValue) && isa<LoadInst>(ChainInstrValue))
+ // Loop until we find an instruction in ChainInstrs that we can't vectorize.
+ unsigned ChainInstrIdx = 0;
+ Instruction *BarrierMemoryInstr = nullptr;
+
+ for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
+ Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];
+
+ // If a barrier memory instruction was found, chain instructions that follow
+ // will not be added to the valid prefix.
+ if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, ChainInstr))
+ break;
+
+ // Check (in BB order) if any instruction prevents ChainInstr from being
+ // vectorized. Find and store the first such "conflicting" instruction.
+ for (Instruction *MemInstr : MemoryInstrs) {
+ // If a barrier memory instruction was found, do not check past it.
+ if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr))
+ break;
+
+ if (isa<LoadInst>(MemInstr) && isa<LoadInst>(ChainInstr))
continue;
// We can ignore the alias as long as the load comes before the store,
// because that means we won't be moving the load past the store to
// vectorize it (the vectorized load is inserted at the location of the
// first load in the chain).
- if (isa<StoreInst>(MemInstrValue) && isa<LoadInst>(ChainInstrValue) &&
- ChainInstrIdx < MemInstrIdx)
+ if (isa<StoreInst>(MemInstr) && isa<LoadInst>(ChainInstr) &&
+ OBB.dominates(ChainInstr, MemInstr))
continue;
// Same case, but in reverse.
- if (isa<LoadInst>(MemInstrValue) && isa<StoreInst>(ChainInstrValue) &&
- ChainInstrIdx > MemInstrIdx)
+ if (isa<LoadInst>(MemInstr) && isa<StoreInst>(ChainInstr) &&
+ OBB.dominates(MemInstr, ChainInstr))
continue;
- Instruction *M0 = cast<Instruction>(MemInstrValue);
- Instruction *M1 = cast<Instruction>(ChainInstrValue);
-
- if (!AA.isNoAlias(MemoryLocation::get(M0), MemoryLocation::get(M1))) {
+ if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
+ MemoryLocation::get(ChainInstr))) {
DEBUG({
- Value *Ptr0 = getPointerOperand(M0);
- Value *Ptr1 = getPointerOperand(M1);
-
- dbgs() << "LSV: Found alias.\n"
- " Aliasing instruction and pointer:\n"
- << *MemInstrValue << " aliases " << *Ptr0 << '\n'
- << " Aliased instruction and pointer:\n"
- << *ChainInstrValue << " aliases " << *Ptr1 << '\n';
+ dbgs() << "LSV: Found alias:\n"
+ " Aliasing instruction and pointer:\n"
+ << " " << *MemInstr << '\n'
+ << " " << *getPointerOperand(MemInstr) << '\n'
+ << " Aliased instruction and pointer:\n"
+ << " " << *ChainInstr << '\n'
+ << " " << *getPointerOperand(ChainInstr) << '\n';
});
-
- return ChainIdx;
+ // Save this aliasing memory instruction as a barrier, but allow other
+ // instructions that precede the barrier to be vectorized with this one.
+ BarrierMemoryInstr = MemInstr;
+ break;
}
}
- ChainIdx++;
+ // Continue the search only for store chains, since vectorizing stores that
+ // precede an aliasing load is valid. Conversely, vectorizing loads is valid
+ // up to an aliasing store, but should not pull loads from further down in
+ // the basic block.
+ if (IsLoadChain && BarrierMemoryInstr) {
+ // The BarrierMemoryInstr is a store that precedes ChainInstr.
+ assert(OBB.dominates(BarrierMemoryInstr, ChainInstr));
+ break;
+ }
}
- return Chain.size();
+
+ // Find the largest prefix of Chain whose elements are all in
+ // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
+ // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
+ // order.)
+ SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
+ ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
+ unsigned ChainIdx = 0;
+ for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
+ if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
+ break;
+ }
+ return Chain.slice(0, ChainIdx);
}
-void Vectorizer::collectInstructions(BasicBlock *BB) {
- LoadRefs.clear();
- StoreRefs.clear();
+std::pair<InstrListMap, InstrListMap>
+Vectorizer::collectInstructions(BasicBlock *BB) {
+ InstrListMap LoadRefs;
+ InstrListMap StoreRefs;
for (Instruction &I : *BB) {
if (!I.mayReadOrWriteMemory())
@@ -505,6 +565,10 @@ void Vectorizer::collectInstructions(BasicBlock *BB) {
if (!LI->isSimple())
continue;
+ // Skip if it's not legal.
+ if (!TTI.isLegalToVectorizeLoad(LI))
+ continue;
+
Type *Ty = LI->getType();
if (!VectorType::isValidElementType(Ty->getScalarType()))
continue;
@@ -525,14 +589,11 @@ void Vectorizer::collectInstructions(BasicBlock *BB) {
// Make sure all the users of a vector are constant-index extracts.
if (isa<VectorType>(Ty) && !all_of(LI->users(), [LI](const User *U) {
- const Instruction *UI = cast<Instruction>(U);
- return isa<ExtractElementInst>(UI) &&
- isa<ConstantInt>(UI->getOperand(1));
+ const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
+ return EEI && isa<ConstantInt>(EEI->getOperand(1));
}))
continue;
- // TODO: Target hook to filter types.
-
// Save the load locations.
Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
LoadRefs[ObjPtr].push_back(LI);
@@ -541,6 +602,10 @@ void Vectorizer::collectInstructions(BasicBlock *BB) {
if (!SI->isSimple())
continue;
+ // Skip if it's not legal.
+ if (!TTI.isLegalToVectorizeStore(SI))
+ continue;
+
Type *Ty = SI->getValueOperand()->getType();
if (!VectorType::isValidElementType(Ty->getScalarType()))
continue;
@@ -558,9 +623,8 @@ void Vectorizer::collectInstructions(BasicBlock *BB) {
continue;
if (isa<VectorType>(Ty) && !all_of(SI->users(), [SI](const User *U) {
- const Instruction *UI = cast<Instruction>(U);
- return isa<ExtractElementInst>(UI) &&
- isa<ConstantInt>(UI->getOperand(1));
+ const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
+ return EEI && isa<ConstantInt>(EEI->getOperand(1));
}))
continue;
@@ -569,12 +633,14 @@ void Vectorizer::collectInstructions(BasicBlock *BB) {
StoreRefs[ObjPtr].push_back(SI);
}
}
+
+ return {LoadRefs, StoreRefs};
}
-bool Vectorizer::vectorizeChains(ValueListMap &Map) {
+bool Vectorizer::vectorizeChains(InstrListMap &Map) {
bool Changed = false;
- for (const std::pair<Value *, ValueList> &Chain : Map) {
+ for (const std::pair<Value *, InstrList> &Chain : Map) {
unsigned Size = Chain.second.size();
if (Size < 2)
continue;
@@ -584,7 +650,7 @@ bool Vectorizer::vectorizeChains(ValueListMap &Map) {
// Process the stores in chunks of 64.
for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
unsigned Len = std::min<unsigned>(CE - CI, 64);
- ArrayRef<Value *> Chunk(&Chain.second[CI], Len);
+ ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
Changed |= vectorizeInstructions(Chunk);
}
}
@@ -592,9 +658,9 @@ bool Vectorizer::vectorizeChains(ValueListMap &Map) {
return Changed;
}
-bool Vectorizer::vectorizeInstructions(ArrayRef<Value *> Instrs) {
+bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size() << " instructions.\n");
- SmallSetVector<int, 16> Heads, Tails;
+ SmallVector<int, 16> Heads, Tails;
int ConsecutiveChain[64];
// Do a quadratic search on all of the given stores and find all of the pairs
@@ -613,34 +679,34 @@ bool Vectorizer::vectorizeInstructions(ArrayRef<Value *> Instrs) {
continue; // Should not insert.
}
- Tails.insert(j);
- Heads.insert(i);
+ Tails.push_back(j);
+ Heads.push_back(i);
ConsecutiveChain[i] = j;
}
}
}
bool Changed = false;
- SmallPtrSet<Value *, 16> InstructionsProcessed;
+ SmallPtrSet<Instruction *, 16> InstructionsProcessed;
for (int Head : Heads) {
if (InstructionsProcessed.count(Instrs[Head]))
continue;
- bool longerChainExists = false;
+ bool LongerChainExists = false;
for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
if (Head == Tails[TIt] &&
!InstructionsProcessed.count(Instrs[Heads[TIt]])) {
- longerChainExists = true;
+ LongerChainExists = true;
break;
}
- if (longerChainExists)
+ if (LongerChainExists)
continue;
// We found an instr that starts a chain. Now follow the chain and try to
// vectorize it.
- SmallVector<Value *, 16> Operands;
+ SmallVector<Instruction *, 16> Operands;
int I = Head;
- while (I != -1 && (Tails.count(I) || Heads.count(I))) {
+ while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
if (InstructionsProcessed.count(Instrs[I]))
break;
@@ -661,13 +727,14 @@ bool Vectorizer::vectorizeInstructions(ArrayRef<Value *> Instrs) {
}
bool Vectorizer::vectorizeStoreChain(
- ArrayRef<Value *> Chain, SmallPtrSet<Value *, 16> *InstructionsProcessed) {
+ ArrayRef<Instruction *> Chain,
+ SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
StoreInst *S0 = cast<StoreInst>(Chain[0]);
  // If the vector has an int element, default to int for the whole store.
Type *StoreTy;
- for (const auto &V : Chain) {
- StoreTy = cast<StoreInst>(V)->getValueOperand()->getType();
+ for (Instruction *I : Chain) {
+ StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
if (StoreTy->isIntOrIntVectorTy())
break;
@@ -683,40 +750,34 @@ bool Vectorizer::vectorizeStoreChain(
unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
unsigned VF = VecRegSize / Sz;
unsigned ChainSize = Chain.size();
+ unsigned Alignment = getAlignment(S0);
if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
InstructionsProcessed->insert(Chain.begin(), Chain.end());
return false;
}
- BasicBlock::iterator First, Last;
- std::tie(First, Last) = getBoundaryInstrs(Chain);
- unsigned StopChain = getVectorizablePrefixEndIdx(Chain, First, Last);
- if (StopChain == 0) {
- // There exists a side effect instruction, no vectorization possible.
+ ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
+ if (NewChain.empty()) {
+ // No vectorization possible.
InstructionsProcessed->insert(Chain.begin(), Chain.end());
return false;
}
- if (StopChain == 1) {
+ if (NewChain.size() == 1) {
// Failed after the first instruction. Discard it and try the smaller chain.
- InstructionsProcessed->insert(Chain.front());
+ InstructionsProcessed->insert(NewChain.front());
return false;
}
// Update Chain to the valid vectorizable subchain.
- Chain = Chain.slice(0, StopChain);
+ Chain = NewChain;
ChainSize = Chain.size();
- // Store size should be 1B, 2B or multiple of 4B.
- // TODO: Target hook for size constraint?
- unsigned SzInBytes = (Sz / 8) * ChainSize;
- if (SzInBytes > 2 && SzInBytes % 4 != 0) {
- DEBUG(dbgs() << "LSV: Size should be 1B, 2B "
- "or multiple of 4B. Splitting.\n");
- if (SzInBytes == 3)
- return vectorizeStoreChain(Chain.slice(0, ChainSize - 1),
- InstructionsProcessed);
-
+ // Check if it's legal to vectorize this chain. If not, split the chain and
+ // try again.
+ unsigned EltSzInBytes = Sz / 8;
+ unsigned SzInBytes = EltSzInBytes * ChainSize;
+ if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
auto Chains = splitOddVectorElts(Chain, Sz);
return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
vectorizeStoreChain(Chains.second, InstructionsProcessed);
@@ -730,45 +791,41 @@ bool Vectorizer::vectorizeStoreChain(
else
VecTy = VectorType::get(StoreTy, Chain.size());
- // If it's more than the max vector size, break it into two pieces.
- // TODO: Target hook to control types to split to.
- if (ChainSize > VF) {
- DEBUG(dbgs() << "LSV: Vector factor is too big."
+ // If it's more than the max vector size or the target has a better
+ // vector factor, break it into two pieces.
+ unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
+ if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
+ DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
" Creating two separate arrays.\n");
- return vectorizeStoreChain(Chain.slice(0, VF), InstructionsProcessed) |
- vectorizeStoreChain(Chain.slice(VF), InstructionsProcessed);
+ return vectorizeStoreChain(Chain.slice(0, TargetVF),
+ InstructionsProcessed) |
+ vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
}
DEBUG({
dbgs() << "LSV: Stores to vectorize:\n";
- for (Value *V : Chain)
- V->dump();
+ for (Instruction *I : Chain)
+ dbgs() << " " << *I << "\n";
});
// We won't try again to vectorize the elements of the chain, regardless of
// whether we succeed below.
InstructionsProcessed->insert(Chain.begin(), Chain.end());
- // Check alignment restrictions.
- unsigned Alignment = getAlignment(S0);
-
// If the store is going to be misaligned, don't vectorize it.
if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
if (S0->getPointerAddressSpace() != 0)
return false;
- // If we're storing to an object on the stack, we control its alignment,
- // so we can cheat and change it!
- Value *V = GetUnderlyingObject(S0->getPointerOperand(), DL);
- if (AllocaInst *AI = dyn_cast_or_null<AllocaInst>(V)) {
- AI->setAlignment(TargetBaseAlign);
- Alignment = TargetBaseAlign;
- } else {
+ unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
+ StackAdjustedAlignment,
+ DL, S0, nullptr, &DT);
+ if (NewAlign < StackAdjustedAlignment)
return false;
- }
}
- // Set insert point.
+ BasicBlock::iterator First, Last;
+ std::tie(First, Last) = getBoundaryInstrs(Chain);
Builder.SetInsertPoint(&*Last);
Value *Vec = UndefValue::get(VecTy);
@@ -803,9 +860,11 @@ bool Vectorizer::vectorizeStoreChain(
}
}
- Value *Bitcast =
- Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS));
- StoreInst *SI = cast<StoreInst>(Builder.CreateStore(Vec, Bitcast));
+ // This cast is safe because Builder.CreateStore() always creates a bona fide
+ // StoreInst.
+ StoreInst *SI = cast<StoreInst>(
+ Builder.CreateStore(Vec, Builder.CreateBitCast(S0->getPointerOperand(),
+ VecTy->getPointerTo(AS))));
propagateMetadata(SI, Chain);
SI->setAlignment(Alignment);
@@ -816,7 +875,8 @@ bool Vectorizer::vectorizeStoreChain(
}
bool Vectorizer::vectorizeLoadChain(
- ArrayRef<Value *> Chain, SmallPtrSet<Value *, 16> *InstructionsProcessed) {
+ ArrayRef<Instruction *> Chain,
+ SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
LoadInst *L0 = cast<LoadInst>(Chain[0]);
// If the vector has an int element, default to int for the whole load.
@@ -838,39 +898,34 @@ bool Vectorizer::vectorizeLoadChain(
unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
unsigned VF = VecRegSize / Sz;
unsigned ChainSize = Chain.size();
+ unsigned Alignment = getAlignment(L0);
if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
InstructionsProcessed->insert(Chain.begin(), Chain.end());
return false;
}
- BasicBlock::iterator First, Last;
- std::tie(First, Last) = getBoundaryInstrs(Chain);
- unsigned StopChain = getVectorizablePrefixEndIdx(Chain, First, Last);
- if (StopChain == 0) {
- // There exists a side effect instruction, no vectorization possible.
+ ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
+ if (NewChain.empty()) {
+ // No vectorization possible.
InstructionsProcessed->insert(Chain.begin(), Chain.end());
return false;
}
- if (StopChain == 1) {
+ if (NewChain.size() == 1) {
// Failed after the first instruction. Discard it and try the smaller chain.
- InstructionsProcessed->insert(Chain.front());
+ InstructionsProcessed->insert(NewChain.front());
return false;
}
// Update Chain to the valid vectorizable subchain.
- Chain = Chain.slice(0, StopChain);
+ Chain = NewChain;
ChainSize = Chain.size();
- // Load size should be 1B, 2B or multiple of 4B.
- // TODO: Should size constraint be a target hook?
- unsigned SzInBytes = (Sz / 8) * ChainSize;
- if (SzInBytes > 2 && SzInBytes % 4 != 0) {
- DEBUG(dbgs() << "LSV: Size should be 1B, 2B "
- "or multiple of 4B. Splitting.\n");
- if (SzInBytes == 3)
- return vectorizeLoadChain(Chain.slice(0, ChainSize - 1),
- InstructionsProcessed);
+ // Check if it's legal to vectorize this chain. If not, split the chain and
+ // try again.
+ unsigned EltSzInBytes = Sz / 8;
+ unsigned SzInBytes = EltSzInBytes * ChainSize;
+ if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
auto Chains = splitOddVectorElts(Chain, Sz);
return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
vectorizeLoadChain(Chains.second, InstructionsProcessed);
@@ -884,101 +939,99 @@ bool Vectorizer::vectorizeLoadChain(
else
VecTy = VectorType::get(LoadTy, Chain.size());
- // If it's more than the max vector size, break it into two pieces.
- // TODO: Target hook to control types to split to.
- if (ChainSize > VF) {
- DEBUG(dbgs() << "LSV: Vector factor is too big. "
- "Creating two separate arrays.\n");
- return vectorizeLoadChain(Chain.slice(0, VF), InstructionsProcessed) |
- vectorizeLoadChain(Chain.slice(VF), InstructionsProcessed);
+ // If it's more than the max vector size or the target has a better
+ // vector factor, break it into two pieces.
+ unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
+ if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
+ DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
+ " Creating two separate arrays.\n");
+ return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
+ vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
}
// We won't try again to vectorize the elements of the chain, regardless of
// whether we succeed below.
InstructionsProcessed->insert(Chain.begin(), Chain.end());
- // Check alignment restrictions.
- unsigned Alignment = getAlignment(L0);
-
// If the load is going to be misaligned, don't vectorize it.
if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
if (L0->getPointerAddressSpace() != 0)
return false;
- // If we're loading from an object on the stack, we control its alignment,
- // so we can cheat and change it!
- Value *V = GetUnderlyingObject(L0->getPointerOperand(), DL);
- if (AllocaInst *AI = dyn_cast_or_null<AllocaInst>(V)) {
- AI->setAlignment(TargetBaseAlign);
- Alignment = TargetBaseAlign;
- } else {
+ unsigned NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
+ StackAdjustedAlignment,
+ DL, L0, nullptr, &DT);
+ if (NewAlign < StackAdjustedAlignment)
return false;
- }
+
+ Alignment = NewAlign;
}
DEBUG({
dbgs() << "LSV: Loads to vectorize:\n";
- for (Value *V : Chain)
- V->dump();
+ for (Instruction *I : Chain)
+ I->dump();
});
- // Set insert point.
+ // getVectorizablePrefix already computed getBoundaryInstrs. The value of
+ // Last may have changed since then, but the value of First won't have. If it
+ // matters, we could compute getBoundaryInstrs only once and reuse it here.
+ BasicBlock::iterator First, Last;
+ std::tie(First, Last) = getBoundaryInstrs(Chain);
Builder.SetInsertPoint(&*First);
Value *Bitcast =
Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
-
+ // This cast is safe because Builder.CreateLoad always creates a bona fide
+ // LoadInst.
LoadInst *LI = cast<LoadInst>(Builder.CreateLoad(Bitcast));
propagateMetadata(LI, Chain);
LI->setAlignment(Alignment);
if (VecLoadTy) {
SmallVector<Instruction *, 16> InstrsToErase;
- SmallVector<Instruction *, 16> InstrsToReorder;
- InstrsToReorder.push_back(cast<Instruction>(Bitcast));
unsigned VecWidth = VecLoadTy->getNumElements();
for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
for (auto Use : Chain[I]->users()) {
+ // All users of vector loads are ExtractElement instructions with
+ // constant indices, otherwise we would have bailed before now.
Instruction *UI = cast<Instruction>(Use);
unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
unsigned NewIdx = Idx + I * VecWidth;
- Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx));
- Instruction *Extracted = cast<Instruction>(V);
- if (Extracted->getType() != UI->getType())
- Extracted = cast<Instruction>(
- Builder.CreateBitCast(Extracted, UI->getType()));
+ Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
+ UI->getName());
+ if (V->getType() != UI->getType())
+ V = Builder.CreateBitCast(V, UI->getType());
// Replace the old instruction.
- UI->replaceAllUsesWith(Extracted);
+ UI->replaceAllUsesWith(V);
InstrsToErase.push_back(UI);
}
}
- for (Instruction *ModUser : InstrsToReorder)
- reorder(ModUser);
+ // Bitcast might not be an Instruction, if the value being loaded is a
+ // constant. In that case, no need to reorder anything.
+ if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
+ reorder(BitcastInst);
for (auto I : InstrsToErase)
I->eraseFromParent();
} else {
- SmallVector<Instruction *, 16> InstrsToReorder;
- InstrsToReorder.push_back(cast<Instruction>(Bitcast));
-
for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
- Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(I));
- Instruction *Extracted = cast<Instruction>(V);
- Instruction *UI = cast<Instruction>(Chain[I]);
- if (Extracted->getType() != UI->getType()) {
- Extracted = cast<Instruction>(
- Builder.CreateBitOrPointerCast(Extracted, UI->getType()));
+ Value *CV = Chain[I];
+ Value *V =
+ Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
+ if (V->getType() != CV->getType()) {
+ V = Builder.CreateBitOrPointerCast(V, CV->getType());
}
// Replace the old instruction.
- UI->replaceAllUsesWith(Extracted);
+ CV->replaceAllUsesWith(V);
}
- for (Instruction *ModUser : InstrsToReorder)
- reorder(ModUser);
+ if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
+ reorder(BitcastInst);
}
eraseInstructions(Chain);
@@ -990,10 +1043,14 @@ bool Vectorizer::vectorizeLoadChain(
bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
unsigned Alignment) {
+ if (Alignment % SzInBytes == 0)
+ return false;
+
bool Fast = false;
- bool Allows = TTI.allowsMisalignedMemoryAccesses(SzInBytes * 8, AddressSpace,
+ bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
+ SzInBytes * 8, AddressSpace,
Alignment, &Fast);
- // TODO: Remove TargetBaseAlign
- return !(Allows && Fast) && (Alignment % SzInBytes) != 0 &&
- (Alignment % TargetBaseAlign) != 0;
+ DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
+ << " and fast? " << Fast << "\n";);
+ return !Allows || !Fast;
}
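A standalone sketch of the new decision logic above, with the TTI query replaced by plain flags (Allows and Fast are assumptions standing in for the target hook's outputs; hypothetical helper, not LLVM code):

#include <cassert>

static bool accessIsMisalignedSketch(unsigned SzInBytes, unsigned Alignment,
                                     bool Allows, bool Fast) {
  if (Alignment % SzInBytes == 0)
    return false;            // naturally aligned for this access size
  return !Allows || !Fast;   // otherwise, trust the target's answer
}

int main() {
  assert(!accessIsMisalignedSketch(16, 16, false, false)); // align 16 for a 16-byte access
  assert(accessIsMisalignedSketch(16, 4, true, false));    // allowed but slow
  assert(!accessIsMisalignedSketch(16, 4, true, true));    // allowed and fast
}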
diff --git a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ee5733d..dac7032 100644
--- a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -80,6 +80,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
@@ -191,7 +192,7 @@ static cl::opt<bool> EnableIndVarRegisterHeur(
cl::desc("Count the induction variable only once when interleaving"));
static cl::opt<bool> EnableCondStoresVectorization(
- "enable-cond-stores-vec", cl::init(false), cl::Hidden,
+ "enable-cond-stores-vec", cl::init(true), cl::Hidden,
cl::desc("Enable if predication of stores during vectorization."));
static cl::opt<unsigned> MaxNestedScalarReductionIC(
@@ -213,6 +214,32 @@ static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
cl::desc("The maximum number of SCEV checks allowed with a "
"vectorize(enable) pragma"));
+/// Create an analysis remark that explains why vectorization failed
+///
+/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
+/// RemarkName is the identifier for the remark. If \p I is passed, it is an
+/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
+/// the location of the remark. \return the remark object that can be
+/// streamed to.
+static OptimizationRemarkAnalysis
+createMissedAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
+ Instruction *I = nullptr) {
+ Value *CodeRegion = TheLoop->getHeader();
+ DebugLoc DL = TheLoop->getStartLoc();
+
+ if (I) {
+ CodeRegion = I->getParent();
+      // If there is no debug location attached to the instruction, fall back to
+ // using the loop's.
+ if (I->getDebugLoc())
+ DL = I->getDebugLoc();
+ }
+
+ OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
+ R << "loop not vectorized: ";
+ return R;
+}
+
namespace {
// Forward declarations.
@@ -221,70 +248,13 @@ class LoopVectorizationLegality;
class LoopVectorizationCostModel;
class LoopVectorizationRequirements;
-// A traits type that is intended to be used in graph algorithms. The graph it
-// models starts at the loop header, and traverses the BasicBlocks that are in
-// the loop body, but not the loop header. Since the loop header is skipped,
-// the back edges are excluded.
-struct LoopBodyTraits {
- using NodeRef = std::pair<const Loop *, BasicBlock *>;
-
- // This wraps a const Loop * into the iterator, so we know which edges to
- // filter out.
- class WrappedSuccIterator
- : public iterator_adaptor_base<
- WrappedSuccIterator, succ_iterator,
- typename std::iterator_traits<succ_iterator>::iterator_category,
- NodeRef, std::ptrdiff_t, NodeRef *, NodeRef> {
- using BaseT = iterator_adaptor_base<
- WrappedSuccIterator, succ_iterator,
- typename std::iterator_traits<succ_iterator>::iterator_category,
- NodeRef, std::ptrdiff_t, NodeRef *, NodeRef>;
-
- const Loop *L;
-
- public:
- WrappedSuccIterator(succ_iterator Begin, const Loop *L)
- : BaseT(Begin), L(L) {}
-
- NodeRef operator*() const { return {L, *I}; }
- };
-
- struct LoopBodyFilter {
- bool operator()(NodeRef N) const {
- const Loop *L = N.first;
- return N.second != L->getHeader() && L->contains(N.second);
- }
- };
-
- using ChildIteratorType =
- filter_iterator<WrappedSuccIterator, LoopBodyFilter>;
-
- static NodeRef getEntryNode(const Loop &G) { return {&G, G.getHeader()}; }
-
- static ChildIteratorType child_begin(NodeRef Node) {
- return make_filter_range(make_range<WrappedSuccIterator>(
- {succ_begin(Node.second), Node.first},
- {succ_end(Node.second), Node.first}),
- LoopBodyFilter{})
- .begin();
- }
-
- static ChildIteratorType child_end(NodeRef Node) {
- return make_filter_range(make_range<WrappedSuccIterator>(
- {succ_begin(Node.second), Node.first},
- {succ_end(Node.second), Node.first}),
- LoopBodyFilter{})
- .end();
- }
-};
-
/// Returns true if the given loop body has a cycle, excluding the loop
/// itself.
static bool hasCyclesInLoopBody(const Loop &L) {
if (!L.empty())
return true;
- for (const auto SCC :
+ for (const auto &SCC :
make_range(scc_iterator<Loop, LoopBodyTraits>::begin(L),
scc_iterator<Loop, LoopBodyTraits>::end(L))) {
if (SCC.size() > 1) {
@@ -346,6 +316,41 @@ static GetElementPtrInst *getGEPInstruction(Value *Ptr) {
return nullptr;
}
+/// A helper function that returns the pointer operand of a load or store
+/// instruction.
+static Value *getPointerOperand(Value *I) {
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ return LI->getPointerOperand();
+ if (auto *SI = dyn_cast<StoreInst>(I))
+ return SI->getPointerOperand();
+ return nullptr;
+}
+
+/// A helper function that returns true if the given type is irregular. The
+/// type is irregular if its allocated size doesn't equal the store size of an
+/// element of the corresponding vector type at the given vectorization factor.
+static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
+
+ // Determine if an array of VF elements of type Ty is "bitcast compatible"
+ // with a <VF x Ty> vector.
+ if (VF > 1) {
+ auto *VectorTy = VectorType::get(Ty, VF);
+ return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
+ }
+
+ // If the vectorization factor is one, we just check if an array of type Ty
+ // requires padding between elements.
+ return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
+}
+
+/// A helper function that returns the reciprocal of the block probability of
+/// predicated blocks. If we return X, we are assuming the predicated block
+/// will execute once for every X iterations of the loop header.
+///
+/// TODO: We should use actual block probability here, if available. Currently,
+/// we always assume predicated blocks have a 50% chance of executing.
+static unsigned getReciprocalPredBlockProb() { return 2; }
+
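A tiny illustration of how that reciprocal probability discounts the cost of a predicated block (the numbers are assumptions chosen for illustration, not taken from the real cost model):

#include <cassert>

static unsigned getReciprocalPredBlockProb() { return 2; } // assumed 50% execution probability

int main() {
  unsigned BlockCost = 8;  // cost if the predicated block ran on every iteration
  assert(BlockCost / getReciprocalPredBlockProb() == 4); // expected cost per header iteration
}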
/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
@@ -366,29 +371,21 @@ public:
LoopInfo *LI, DominatorTree *DT,
const TargetLibraryInfo *TLI,
const TargetTransformInfo *TTI, AssumptionCache *AC,
- unsigned VecWidth, unsigned UnrollFactor)
+ OptimizationRemarkEmitter *ORE, unsigned VecWidth,
+ unsigned UnrollFactor, LoopVectorizationLegality *LVL,
+ LoopVectorizationCostModel *CM)
: OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
- AC(AC), VF(VecWidth), UF(UnrollFactor),
+ AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
Builder(PSE.getSE()->getContext()), Induction(nullptr),
- OldInduction(nullptr), WidenMap(UnrollFactor), TripCount(nullptr),
- VectorTripCount(nullptr), Legal(nullptr), AddedSafetyChecks(false) {}
+ OldInduction(nullptr), VectorLoopValueMap(UnrollFactor, VecWidth),
+ TripCount(nullptr), VectorTripCount(nullptr), Legal(LVL), Cost(CM),
+ AddedSafetyChecks(false) {}
// Perform the actual loop widening (vectorization).
- // MinimumBitWidths maps scalar integer values to the smallest bitwidth they
- // can be validly truncated to. The cost model has assumed this truncation
- // will happen when vectorizing. VecValuesToIgnore contains scalar values
- // that the cost model has chosen to ignore because they will not be
- // vectorized.
- void vectorize(LoopVectorizationLegality *L,
- const MapVector<Instruction *, uint64_t> &MinimumBitWidths,
- SmallPtrSetImpl<const Value *> &VecValuesToIgnore) {
- MinBWs = &MinimumBitWidths;
- ValuesNotWidened = &VecValuesToIgnore;
- Legal = L;
+ void vectorize() {
// Create a new empty loop. Unlink the old loop and connect the new one.
createEmptyLoop();
// Widen each instruction in the old loop to a new one in the new loop.
- // Use the Legality module to find the induction and reduction variables.
vectorizeLoop();
}
@@ -400,11 +397,18 @@ public:
protected:
/// A small list of PHINodes.
typedef SmallVector<PHINode *, 4> PhiVector;
- /// When we unroll loops we have multiple vector values for each scalar.
- /// This data structure holds the unrolled and vectorized values that
- /// originated from one scalar instruction.
+
+ /// A type for vectorized values in the new loop. Each value from the
+ /// original loop, when vectorized, is represented by UF vector values in the
+ /// new unrolled loop, where UF is the unroll factor.
typedef SmallVector<Value *, 2> VectorParts;
+ /// A type for scalarized values in the new loop. Each value from the
+ /// original loop, when scalarized, is represented by UF x VF scalar values
+ /// in the new unrolled loop, where UF is the unroll factor and VF is the
+ /// vectorization factor.
+ typedef SmallVector<SmallVector<Value *, 4>, 2> ScalarParts;
+
// When we if-convert we need to create edge masks. We have to cache values
// so that we don't end up with exponential recursion/IR.
typedef DenseMap<std::pair<BasicBlock *, BasicBlock *>, VectorParts>
@@ -434,7 +438,20 @@ protected:
/// See PR14725.
void fixLCSSAPHIs();
- /// Shrinks vector element sizes based on information in "MinBWs".
+ /// Iteratively sink the scalarized operands of a predicated instruction into
+ /// the block that was created for it.
+ void sinkScalarOperands(Instruction *PredInst);
+
+ /// Predicate conditional instructions that require predication on their
+ /// respective conditions.
+ void predicateInstructions();
+
+ /// Collect the instructions from the original loop that would be trivially
+ /// dead in the vectorized loop if generated.
+ void collectTriviallyDeadInstructions();
+
+ /// Shrinks vector element sizes to the smallest bitwidth they can be legally
+ /// represented as.
void truncateToMinimalBitwidths();
/// A helper function that computes the predicate of the block BB, assuming
@@ -451,19 +468,19 @@ protected:
/// Vectorize a single PHINode in a block. This method handles the induction
/// variable canonicalization. It supports both VF = 1 for unrolled loops and
/// arbitrary length vectors.
- void widenPHIInstruction(Instruction *PN, VectorParts &Entry, unsigned UF,
- unsigned VF, PhiVector *PV);
+ void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF,
+ PhiVector *PV);
/// Insert the new loop to the loop hierarchy and pass manager
/// and update the analysis passes.
void updateAnalysis();
/// This instruction is un-vectorizable. Implement it as a sequence
- /// of scalars. If \p IfPredicateStore is true we need to 'hide' each
+ /// of scalars. If \p IfPredicateInstr is true we need to 'hide' each
/// scalarized instruction behind an if block predicated on the control
/// dependence of the instruction.
virtual void scalarizeInstruction(Instruction *Instr,
- bool IfPredicateStore = false);
+ bool IfPredicateInstr = false);
/// Vectorize Load and Store instructions,
virtual void vectorizeMemoryInstruction(Instruction *Instr);
@@ -477,7 +494,10 @@ protected:
/// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
/// to each vector element of Val. The sequence starts at StartIndex.
- virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step);
+  /// \p Opcode is relevant only for a floating-point induction variable.
+ virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
+ Instruction::BinaryOps Opcode =
+ Instruction::BinaryOpsEnd);
/// Compute scalar induction steps. \p ScalarIV is the scalar induction
/// variable on which to base the steps, \p Step is the size of the step, and
@@ -488,23 +508,39 @@ protected:
/// Create a vector induction phi node based on an existing scalar one. This
/// currently only works for integer induction variables with a constant
- /// step. If \p TruncType is non-null, instead of widening the original IV,
- /// we widen a version of the IV truncated to \p TruncType.
+ /// step. \p EntryVal is the value from the original loop that maps to the
+ /// vector phi node. If \p EntryVal is a truncate instruction, instead of
+ /// widening the original IV, we widen a version of the IV truncated to \p
+ /// EntryVal's type.
void createVectorIntInductionPHI(const InductionDescriptor &II,
- VectorParts &Entry, IntegerType *TruncType);
+ Instruction *EntryVal);
/// Widen an integer induction variable \p IV. If \p Trunc is provided, the
- /// induction variable will first be truncated to the corresponding type. The
- /// widened values are placed in \p Entry.
- void widenIntInduction(PHINode *IV, VectorParts &Entry,
- TruncInst *Trunc = nullptr);
-
- /// When we go over instructions in the basic block we rely on previous
- /// values within the current basic block or on loop invariant values.
- /// When we widen (vectorize) values we place them in the map. If the values
- /// are not within the map, they have to be loop invariant, so we simply
- /// broadcast them into a vector.
- VectorParts &getVectorValue(Value *V);
+ /// induction variable will first be truncated to the corresponding type.
+ void widenIntInduction(PHINode *IV, TruncInst *Trunc = nullptr);
+
+ /// Returns true if an instruction \p I should be scalarized instead of
+ /// vectorized for the chosen vectorization factor.
+ bool shouldScalarizeInstruction(Instruction *I) const;
+
+ /// Returns true if we should generate a scalar version of \p IV.
+ bool needsScalarInduction(Instruction *IV) const;
+
+ /// Return a constant reference to the VectorParts corresponding to \p V from
+ /// the original loop. If the value has already been vectorized, the
+ /// corresponding vector entry in VectorLoopValueMap is returned. If,
+ /// however, the value has a scalar entry in VectorLoopValueMap, we construct
+ /// new vector values on-demand by inserting the scalar values into vectors
+ /// with an insertelement sequence. If the value has been neither vectorized
+ /// nor scalarized, it must be loop invariant, so we simply broadcast the
+ /// value into vectors.
+ const VectorParts &getVectorValue(Value *V);
+
+ /// Return a value in the new loop corresponding to \p V from the original
+ /// loop at unroll index \p Part and vector index \p Lane. If the value has
+ /// been vectorized but not scalarized, the necessary extractelement
+ /// instruction will be generated.
+ Value *getScalarValue(Value *V, unsigned Part, unsigned Lane);
/// Try to vectorize the interleaved access group that \p Instr belongs to.
void vectorizeInterleaveGroup(Instruction *Instr);
@@ -547,44 +583,112 @@ protected:
/// vector of instructions.
void addMetadata(ArrayRef<Value *> To, Instruction *From);
- /// This is a helper class that holds the vectorizer state. It maps scalar
- /// instructions to vector instructions. When the code is 'unrolled' then
- /// then a single scalar value is mapped to multiple vector parts. The parts
- /// are stored in the VectorPart type.
+ /// This is a helper class for maintaining vectorization state. It's used for
+ /// mapping values from the original loop to their corresponding values in
+ /// the new loop. Two mappings are maintained: one for vectorized values and
+ /// one for scalarized values. Vectorized values are represented with UF
+ /// vector values in the new loop, and scalarized values are represented with
+ /// UF x VF scalar values in the new loop. UF and VF are the unroll and
+ /// vectorization factors, respectively.
+ ///
+ /// Entries can be added to either map with initVector and initScalar, which
+ /// initialize and return a constant reference to the new entry. If a
+ /// non-constant reference to a vector entry is required, getVector can be
+ /// used to retrieve a mutable entry. We currently directly modify the mapped
+ /// values during "fix-up" operations that occur once the first phase of
+ /// widening is complete. These operations include type truncation and the
+ /// second phase of recurrence widening.
+ ///
+ /// Otherwise, entries from either map should be accessed using the
+ /// getVectorValue or getScalarValue functions from InnerLoopVectorizer.
+ /// getVectorValue and getScalarValue coordinate to generate a vector or
+ /// scalar value on-demand if one is not yet available. When vectorizing a
+ /// loop, we visit the definition of an instruction before its uses. When
+ /// visiting the definition, we either vectorize or scalarize the
+ /// instruction, creating an entry for it in the corresponding map. (In some
+ /// cases, such as induction variables, we will create both vector and scalar
+ /// entries.) Then, as we encounter uses of the definition, we derive values
+ /// for each scalar or vector use unless such a value is already available.
+ /// For example, if we scalarize a definition and one of its uses is vector,
+ /// we build the required vector on-demand with an insertelement sequence
+ /// when visiting the use. Otherwise, if the use is scalar, we can use the
+ /// existing scalar definition.
struct ValueMap {
- /// C'tor. UnrollFactor controls the number of vectors ('parts') that
- /// are mapped.
- ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}
-
- /// \return True if 'Key' is saved in the Value Map.
- bool has(Value *Key) const { return MapStorage.count(Key); }
-
- /// Initializes a new entry in the map. Sets all of the vector parts to the
- /// save value in 'Val'.
- /// \return A reference to a vector with splat values.
- VectorParts &splat(Value *Key, Value *Val) {
- VectorParts &Entry = MapStorage[Key];
- Entry.assign(UF, Val);
- return Entry;
+
+ /// Construct an empty map with the given unroll and vectorization factors.
+ ValueMap(unsigned UnrollFactor, unsigned VecWidth)
+ : UF(UnrollFactor), VF(VecWidth) {
+ // The unroll and vectorization factors are only used in asserts builds
+ // to verify map entries are sized appropriately.
+ (void)UF;
+ (void)VF;
}
- ///\return A reference to the value that is stored at 'Key'.
- VectorParts &get(Value *Key) {
- VectorParts &Entry = MapStorage[Key];
- if (Entry.empty())
- Entry.resize(UF);
- assert(Entry.size() == UF);
- return Entry;
+ /// \return True if the map has a vector entry for \p Key.
+ bool hasVector(Value *Key) const { return VectorMapStorage.count(Key); }
+
+ /// \return True if the map has a scalar entry for \p Key.
+ bool hasScalar(Value *Key) const { return ScalarMapStorage.count(Key); }
+
+ /// \brief Map \p Key to the given VectorParts \p Entry, and return a
+ /// constant reference to the new vector map entry. The given key should
+ /// not already be in the map, and the given VectorParts should be
+ /// correctly sized for the current unroll factor.
+ const VectorParts &initVector(Value *Key, const VectorParts &Entry) {
+ assert(!hasVector(Key) && "Vector entry already initialized");
+ assert(Entry.size() == UF && "VectorParts has wrong dimensions");
+ VectorMapStorage[Key] = Entry;
+ return VectorMapStorage[Key];
}
+ /// \brief Map \p Key to the given ScalarParts \p Entry, and return a
+ /// constant reference to the new scalar map entry. The given key should
+ /// not already be in the map, and the given ScalarParts should be
+ /// correctly sized for the current unroll and vectorization factors.
+ const ScalarParts &initScalar(Value *Key, const ScalarParts &Entry) {
+ assert(!hasScalar(Key) && "Scalar entry already initialized");
+ assert(Entry.size() == UF &&
+ all_of(make_range(Entry.begin(), Entry.end()),
+ [&](const SmallVectorImpl<Value *> &Values) -> bool {
+ return Values.size() == VF;
+ }) &&
+ "ScalarParts has wrong dimensions");
+ ScalarMapStorage[Key] = Entry;
+ return ScalarMapStorage[Key];
+ }
+
+ /// \return A reference to the vector map entry corresponding to \p Key.
+ /// The key should already be in the map. This function should only be used
+ /// when it's necessary to update values that have already been vectorized.
+ /// This is the case for "fix-up" operations including type truncation and
+ /// the second phase of recurrence vectorization. If a non-const reference
+ /// isn't required, getVectorValue should be used instead.
+ VectorParts &getVector(Value *Key) {
+ assert(hasVector(Key) && "Vector entry not initialized");
+ return VectorMapStorage.find(Key)->second;
+ }
+
+ /// Retrieve an entry from the vector or scalar maps. The preferred way to
+ /// access an existing mapped entry is with getVectorValue or
+ /// getScalarValue from InnerLoopVectorizer. Until those functions can be
+ /// moved inside ValueMap, we have to declare them as friends.
+ friend const VectorParts &InnerLoopVectorizer::getVectorValue(Value *V);
+ friend Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part,
+ unsigned Lane);
+
private:
- /// The unroll factor. Each entry in the map stores this number of vector
- /// elements.
+ /// The unroll factor. Each entry in the vector map contains UF vector
+ /// values.
unsigned UF;
- /// Map storage. We use std::map and not DenseMap because insertions to a
- /// dense map invalidates its iterators.
- std::map<Value *, VectorParts> MapStorage;
+ /// The vectorization factor. Each entry in the scalar map contains UF x VF
+ /// scalar values.
+ unsigned VF;
+
+ /// The vector and scalar map storage. We use std::map and not DenseMap
+ /// because insertions to DenseMap invalidate its iterators.
+ std::map<Value *, VectorParts> VectorMapStorage;
+ std::map<Value *, ScalarParts> ScalarMapStorage;
};
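// A usage sketch for the two maps (the values and names below are
// placeholders, not from this change): a widened definition is registered
// exactly once with initVector and read back through getVectorValue; fix-up
// code that must rewrite already-widened values goes through the mutable
// getVector accessor.
//
//   VectorParts Widened(UF);
//   for (unsigned Part = 0; Part < UF; ++Part)
//     Widened[Part] = Builder.CreateAdd(A[Part], B[Part]); // per-part vectors
//   VectorLoopValueMap.initVector(&OrigInst, Widened);
//   ...
//   VectorParts &Parts = VectorLoopValueMap.getVector(&OrigInst); // fix-up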
/// The original loop.
@@ -605,6 +709,8 @@ protected:
const TargetTransformInfo *TTI;
/// Assumption Cache.
AssumptionCache *AC;
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter *ORE;
/// \brief LoopVersioning. It's only set up (non-null) if memchecks were
/// used.
@@ -646,41 +752,42 @@ protected:
PHINode *Induction;
/// The induction variable of the old basic block.
PHINode *OldInduction;
- /// Maps scalars to widened vectors.
- ValueMap WidenMap;
-
- /// A map of induction variables from the original loop to their
- /// corresponding VF * UF scalarized values in the vectorized loop. The
- /// purpose of ScalarIVMap is similar to that of WidenMap. Whereas WidenMap
- /// maps original loop values to their vector versions in the new loop,
- /// ScalarIVMap maps induction variables from the original loop that are not
- /// vectorized to their scalar equivalents in the vector loop. Maintaining a
- /// separate map for scalarized induction variables allows us to avoid
- /// unnecessary scalar-to-vector-to-scalar conversions.
- DenseMap<Value *, SmallVector<Value *, 8>> ScalarIVMap;
+
+ /// Maps values from the original loop to their corresponding values in the
+ /// vectorized loop. A key value can map to either vector values, scalar
+ /// values or both kinds of values, depending on whether the key was
+ /// vectorized and scalarized.
+ ValueMap VectorLoopValueMap;
/// Store instructions that should be predicated, as a pair
/// <StoreInst, Predicate>
- SmallVector<std::pair<StoreInst *, Value *>, 4> PredicatedStores;
+ SmallVector<std::pair<Instruction *, Value *>, 4> PredicatedInstructions;
EdgeMaskCache MaskCache;
/// Trip count of the original loop.
Value *TripCount;
/// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
Value *VectorTripCount;
- /// Map of scalar integer values to the smallest bitwidth they can be legally
- /// represented as. The vector equivalents of these values should be truncated
- /// to this type.
- const MapVector<Instruction *, uint64_t> *MinBWs;
-
- /// A set of values that should not be widened. This is taken from
- /// VecValuesToIgnore in the cost model.
- SmallPtrSetImpl<const Value *> *ValuesNotWidened;
-
+ /// The legality analysis.
LoopVectorizationLegality *Legal;
+  /// The profitability analysis.
+ LoopVectorizationCostModel *Cost;
+
// Record whether runtime checks are added.
bool AddedSafetyChecks;
+
+ // Holds instructions from the original loop whose counterparts in the
+ // vectorized loop would be trivially dead if generated. For example,
+ // original induction update instructions can become dead because we
+ // separately emit induction "steps" when generating code for the new loop.
+ // Similarly, we create a new latch condition when setting up the structure
+ // of the new loop, so the old one can become dead.
+ SmallPtrSet<Instruction *, 4> DeadInstructions;
+
+ // Holds the end values for each induction variable. We save the end values
+ // so we can later fix-up the external users of the induction variables.
+ DenseMap<PHINode *, Value *> IVEndValues;
};
class InnerLoopUnroller : public InnerLoopVectorizer {
@@ -689,16 +796,20 @@ public:
LoopInfo *LI, DominatorTree *DT,
const TargetLibraryInfo *TLI,
const TargetTransformInfo *TTI, AssumptionCache *AC,
- unsigned UnrollFactor)
- : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, 1,
- UnrollFactor) {}
+ OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
+ LoopVectorizationLegality *LVL,
+ LoopVectorizationCostModel *CM)
+ : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
+ UnrollFactor, LVL, CM) {}
private:
void scalarizeInstruction(Instruction *Instr,
- bool IfPredicateStore = false) override;
+ bool IfPredicateInstr = false) override;
void vectorizeMemoryInstruction(Instruction *Instr) override;
Value *getBroadcastInstrs(Value *V) override;
- Value *getStepVector(Value *Val, int StartIdx, Value *Step) override;
+ Value *getStepVector(Value *Val, int StartIdx, Value *Step,
+ Instruction::BinaryOps Opcode =
+ Instruction::BinaryOpsEnd) override;
Value *reverseVector(Value *Vec) override;
};
@@ -1149,12 +1260,13 @@ public:
FK_Enabled = 1, ///< Forcing enabled.
};
- LoopVectorizeHints(const Loop *L, bool DisableInterleaving)
+ LoopVectorizeHints(const Loop *L, bool DisableInterleaving,
+ OptimizationRemarkEmitter &ORE)
: Width("vectorize.width", VectorizerParams::VectorizationFactor,
HK_WIDTH),
Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
Force("vectorize.enable", FK_Undefined, HK_FORCE),
- PotentiallyUnsafe(false), TheLoop(L) {
+ PotentiallyUnsafe(false), TheLoop(L), ORE(ORE) {
// Populate values with existing loop metadata.
getHintsFromMetadata();
@@ -1176,17 +1288,13 @@ public:
bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const {
if (getForce() == LoopVectorizeHints::FK_Disabled) {
DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
- emitOptimizationRemarkAnalysis(F->getContext(),
- vectorizeAnalysisPassName(), *F,
- L->getStartLoc(), emitRemark());
+ emitRemarkWithHints();
return false;
}
if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) {
DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
- emitOptimizationRemarkAnalysis(F->getContext(),
- vectorizeAnalysisPassName(), *F,
- L->getStartLoc(), emitRemark());
+ emitRemarkWithHints();
return false;
}
@@ -1197,11 +1305,12 @@ public:
// FIXME: Add interleave.disable metadata. This will allow
// vectorize.disable to be used without disabling the pass and errors
// to differentiate between disabled vectorization and a width of 1.
- emitOptimizationRemarkAnalysis(
- F->getContext(), vectorizeAnalysisPassName(), *F, L->getStartLoc(),
- "loop not vectorized: vectorization and interleaving are explicitly "
- "disabled, or vectorize width and interleave count are both set to "
- "1");
+ ORE.emit(OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
+ "AllDisabled", L->getStartLoc(),
+ L->getHeader())
+ << "loop not vectorized: vectorization and interleaving are "
+ "explicitly disabled, or vectorize width and interleave "
+ "count are both set to 1");
return false;
}
@@ -1209,23 +1318,27 @@ public:
}
/// Dumps all the hint information.
- std::string emitRemark() const {
- VectorizationReport R;
+ void emitRemarkWithHints() const {
+ using namespace ore;
if (Force.Value == LoopVectorizeHints::FK_Disabled)
- R << "vectorization is explicitly disabled";
+ ORE.emit(OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
+ TheLoop->getStartLoc(),
+ TheLoop->getHeader())
+ << "loop not vectorized: vectorization is explicitly disabled");
else {
- R << "use -Rpass-analysis=loop-vectorize for more info";
+ OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
+ TheLoop->getStartLoc(), TheLoop->getHeader());
+ R << "loop not vectorized";
if (Force.Value == LoopVectorizeHints::FK_Enabled) {
- R << " (Force=true";
+ R << " (Force=" << NV("Force", true);
if (Width.Value != 0)
- R << ", Vector Width=" << Width.Value;
+ R << ", Vector Width=" << NV("VectorWidth", Width.Value);
if (Interleave.Value != 0)
- R << ", Interleave Count=" << Interleave.Value;
+ R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value);
R << ")";
}
+ ORE.emit(R);
}
-
- return R.str();
}
unsigned getWidth() const { return Width.Value; }
@@ -1241,7 +1354,7 @@ public:
return LV_NAME;
if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
return LV_NAME;
- return DiagnosticInfoOptimizationRemarkAnalysis::AlwaysPrint;
+ return OptimizationRemarkAnalysis::AlwaysPrint;
}
bool allowReordering() const {
@@ -1379,19 +1492,23 @@ private:
/// The loop these hints belong to.
const Loop *TheLoop;
+
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter &ORE;
};
-static void emitAnalysisDiag(const Function *TheFunction, const Loop *TheLoop,
+static void emitAnalysisDiag(const Loop *TheLoop,
const LoopVectorizeHints &Hints,
+ OptimizationRemarkEmitter &ORE,
const LoopAccessReport &Message) {
const char *Name = Hints.vectorizeAnalysisPassName();
- LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, Name);
+ LoopAccessReport::emitAnalysis(Message, TheLoop, Name, ORE);
}
static void emitMissedWarning(Function *F, Loop *L,
- const LoopVectorizeHints &LH) {
- emitOptimizationRemarkMissed(F->getContext(), LV_NAME, *F, L->getStartLoc(),
- LH.emitRemark());
+ const LoopVectorizeHints &LH,
+ OptimizationRemarkEmitter *ORE) {
+ LH.emitRemarkWithHints();
if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
if (LH.getWidth() != 1)
@@ -1425,12 +1542,12 @@ public:
TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F,
const TargetTransformInfo *TTI,
std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,
- LoopVectorizationRequirements *R, LoopVectorizeHints *H)
- : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F),
- TTI(TTI), DT(DT), GetLAA(GetLAA), LAI(nullptr),
- InterleaveInfo(PSE, L, DT, LI), Induction(nullptr),
- WidestIndTy(nullptr), HasFunNoNaNAttr(false), Requirements(R),
- Hints(H) {}
+ OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R,
+ LoopVectorizeHints *H)
+ : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT),
+ GetLAA(GetLAA), LAI(nullptr), ORE(ORE), InterleaveInfo(PSE, L, DT, LI),
+ Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
+ Requirements(R), Hints(H) {}
/// ReductionList contains the reduction descriptors for all
/// of the reductions that were found in the loop.
@@ -1490,9 +1607,12 @@ public:
/// Returns true if the value V is uniform within the loop.
bool isUniform(Value *V);
- /// Returns true if this instruction will remain scalar after vectorization.
+ /// Returns true if \p I is known to be uniform after vectorization.
bool isUniformAfterVectorization(Instruction *I) { return Uniforms.count(I); }
+ /// Returns true if \p I is known to be scalar after vectorization.
+ bool isScalarAfterVectorization(Instruction *I) { return Scalars.count(I); }
+
/// Returns the information that we collected about runtime memory check.
const RuntimePointerChecking *getRuntimePointerChecking() const {
return LAI->getRuntimePointerChecking();
@@ -1545,6 +1665,17 @@ public:
bool isLegalMaskedGather(Type *DataType) {
return TTI->isLegalMaskedGather(DataType);
}
+ /// Returns true if the target machine can represent \p V as a masked gather
+ /// or scatter operation.
+ bool isLegalGatherOrScatter(Value *V) {
+ auto *LI = dyn_cast<LoadInst>(V);
+ auto *SI = dyn_cast<StoreInst>(V);
+ if (!LI && !SI)
+ return false;
+ auto *Ptr = getPointerOperand(V);
+ auto *Ty = cast<PointerType>(Ptr->getType())->getElementType();
+ return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
+ }
/// Returns true if vector representation of the instruction \p I
/// requires mask.
@@ -1553,6 +1684,21 @@ public:
unsigned getNumLoads() const { return LAI->getNumLoads(); }
unsigned getNumPredStores() const { return NumPredStores; }
+ /// Returns true if \p I is an instruction that will be scalarized with
+ /// predication. Such instructions include conditional stores and
+ /// instructions that may divide by zero.
+ bool isScalarWithPredication(Instruction *I);
+
+ /// Returns true if \p I is a memory instruction that has a consecutive or
+ /// consecutive-like pointer operand. Consecutive-like pointers are pointers
+ /// that are treated like consecutive pointers during vectorization. The
+ /// pointer operands of interleaved accesses are an example.
+ bool hasConsecutiveLikePtrOperand(Instruction *I);
+
+ /// Returns true if \p I is a memory instruction that must be scalarized
+ /// during vectorization.
+ bool memoryInstructionMustBeScalarized(Instruction *I, unsigned VF = 1);
+
private:
/// Check if a single basic block loop is vectorizable.
/// At this point we know that this is a loop with a constant trip count
@@ -1569,9 +1715,24 @@ private:
/// transformation.
bool canVectorizeWithIfConvert();
- /// Collect the variables that need to stay uniform after vectorization.
+ /// Collect the instructions that are uniform after vectorization. An
+ /// instruction is uniform if we represent it with a single scalar value in
+ /// the vectorized loop corresponding to each vector iteration. Examples of
+ /// uniform instructions include pointer operands of consecutive or
+ /// interleaved memory accesses. Note that although uniformity implies an
+ /// instruction will be scalar, the reverse is not true. In general, a
+ /// scalarized instruction will be represented by VF scalar values in the
+ /// vectorized loop, each corresponding to an iteration of the original
+ /// scalar loop.
void collectLoopUniforms();
+ /// Collect the instructions that are scalar after vectorization. An
+ /// instruction is scalar if it is known to be uniform or will be scalarized
+ /// during vectorization. Non-uniform scalarized instructions will be
+ /// represented by VF values in the vectorized loop, each corresponding to an
+ /// iteration of the original scalar loop.
+ void collectLoopScalars();
+
/// Return true if all of the instructions in the block can be speculatively
/// executed. \p SafePtrs is a list of addresses that are known to be legal
/// and we know that we can read from them without segfault.
@@ -1588,7 +1749,19 @@ private:
/// VectorizationReport because the << operator of VectorizationReport returns
/// LoopAccessReport.
void emitAnalysis(const LoopAccessReport &Message) const {
- emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message);
+ emitAnalysisDiag(TheLoop, *Hints, *ORE, Message);
+ }
+
+ /// Create an analysis remark that explains why vectorization failed
+ ///
+ /// \p RemarkName is the identifier for the remark. If \p I is passed it is
+ /// an instruction that prevents vectorization. Otherwise the loop is used
+ /// for the location of the remark. \return the remark object that can be
+ /// streamed to.
+ OptimizationRemarkAnalysis
+ createMissedAnalysis(StringRef RemarkName, Instruction *I = nullptr) const {
+ return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(),
+ RemarkName, TheLoop, I);
}
/// \brief If an access has a symbolic strides, this maps the pointer value to
@@ -1613,8 +1786,6 @@ private:
PredicatedScalarEvolution &PSE;
/// Target Library Info.
TargetLibraryInfo *TLI;
- /// Parent function
- Function *TheFunction;
/// Target Transform Info
const TargetTransformInfo *TTI;
/// Dominator Tree.
@@ -1624,6 +1795,8 @@ private:
// And the loop-accesses info corresponding to this loop. This pointer is
// null until canVectorizeMemory sets it up.
const LoopAccessInfo *LAI;
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter *ORE;
/// The interleave access information contains groups of interleaved accesses
/// with the same stride and close to each other.
@@ -1648,10 +1821,13 @@ private:
/// Allowed outside users. This holds the induction and reduction
/// vars which can be accessed from outside the loop.
SmallPtrSet<Value *, 4> AllowedExit;
- /// This set holds the variables which are known to be uniform after
- /// vectorization.
+
+ /// Holds the instructions known to be uniform after vectorization.
SmallPtrSet<Instruction *, 4> Uniforms;
+ /// Holds the instructions known to be scalar after vectorization.
+ SmallPtrSet<Instruction *, 4> Scalars;
+
/// Can we assume the absence of NaNs.
bool HasFunNoNaNAttr;
@@ -1679,10 +1855,11 @@ public:
LoopInfo *LI, LoopVectorizationLegality *Legal,
const TargetTransformInfo &TTI,
const TargetLibraryInfo *TLI, DemandedBits *DB,
- AssumptionCache *AC, const Function *F,
+ AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE, const Function *F,
const LoopVectorizeHints *Hints)
: TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
- AC(AC), TheFunction(F), Hints(Hints) {}
+ AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {}
/// Information about vectorization costs
struct VectorizationFactor {
@@ -1707,13 +1884,6 @@ public:
unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
unsigned LoopCost);
- /// \return The most profitable unroll factor.
- /// This method finds the best unroll-factor based on register pressure and
- /// other parameters. VF and LoopCost are the selected vectorization factor
- /// and the cost of the selected VF.
- unsigned computeInterleaveCount(bool OptForSize, unsigned VF,
- unsigned LoopCost);
-
/// \brief A struct that represents some properties of the register usage
/// of a loop.
struct RegisterUsage {
@@ -1732,6 +1902,29 @@ public:
/// Collect values we want to ignore in the cost model.
void collectValuesToIgnore();
+ /// \returns The smallest bitwidth each instruction can be represented with.
+ /// The vector equivalents of these instructions should be truncated to this
+ /// type.
+ const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
+ return MinBWs;
+ }
+
+ /// \returns True if it is more profitable to scalarize instruction \p I for
+ /// vectorization factor \p VF.
+ bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
+ auto Scalars = InstsToScalarize.find(VF);
+ assert(Scalars != InstsToScalarize.end() &&
+ "VF not yet analyzed for scalarization profitability");
+ return Scalars->second.count(I);
+ }
+
+ /// \returns True if instruction \p I can be truncated to a smaller bitwidth
+ /// for vectorization factor \p VF.
+ bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
+ return VF > 1 && MinBWs.count(I) && !isProfitableToScalarize(I, VF) &&
+ !Legal->isScalarAfterVectorization(I);
+ }
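// Illustration (not from the change): if the cost model recorded
// MinBWs[&I] = 8 for an i32 computation whose upper bits are never demanded,
// canTruncateToMinimalBitwidth(&I, 4) holds as long as the instruction is
// neither profitably scalarized at VF = 4 nor known to remain scalar, and
// its vector form is later shrunk to <4 x i8> by truncateToMinimalBitwidths.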
+
private:
/// The vectorization cost is a combination of the cost itself and a boolean
/// indicating whether any of the contributing operations will actually
@@ -1760,20 +1953,44 @@ private:
/// as a vector operation.
bool isConsecutiveLoadOrStore(Instruction *I);
- /// Report an analysis message to assist the user in diagnosing loops that are
- /// not vectorized. These are handled as LoopAccessReport rather than
- /// VectorizationReport because the << operator of VectorizationReport returns
- /// LoopAccessReport.
- void emitAnalysis(const LoopAccessReport &Message) const {
- emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message);
+ /// Create an analysis remark that explains why vectorization failed
+ ///
+ /// \p RemarkName is the identifier for the remark. \return the remark object
+ /// that can be streamed to.
+ OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) {
+ return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(),
+ RemarkName, TheLoop);
}
-public:
/// Map of scalar integer values to the smallest bitwidth they can be legally
/// represented as. The vector equivalents of these values should be truncated
/// to this type.
MapVector<Instruction *, uint64_t> MinBWs;
+ /// A type representing the costs for instructions if they were to be
+ /// scalarized rather than vectorized. The entries are Instruction-Cost
+ /// pairs.
+ typedef DenseMap<Instruction *, unsigned> ScalarCostsTy;
+
+ /// A map holding scalar costs for different vectorization factors. The
+ /// presence of a cost for an instruction in the mapping indicates that the
+ /// instruction will be scalarized when vectorizing with the associated
+ /// vectorization factor. The entries are VF-ScalarCostTy pairs.
+ DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
+
+ /// Returns the expected difference in cost from scalarizing the expression
+ /// feeding a predicated instruction \p PredInst. The instructions to
+ /// scalarize and their scalar costs are collected in \p ScalarCosts. A
+ /// non-negative return value implies the expression will be scalarized.
+ /// Currently, only single-use chains are considered for scalarization.
+ int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
+ unsigned VF);
+
+ /// Collects the instructions to scalarize for each predicated instruction in
+ /// the loop.
+ void collectInstsToScalarize(unsigned VF);
+
+public:
/// The loop that we evaluate.
Loop *TheLoop;
/// Predicated scalar evolution analysis.
@@ -1790,6 +2007,9 @@ public:
DemandedBits *DB;
/// Assumption cache.
AssumptionCache *AC;
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter *ORE;
+
const Function *TheFunction;
/// Loop Vectorize Hint.
const LoopVectorizeHints *Hints;
@@ -1813,8 +2033,8 @@ public:
/// followed by a non-expert user.
class LoopVectorizationRequirements {
public:
- LoopVectorizationRequirements()
- : NumRuntimePointerChecks(0), UnsafeAlgebraInst(nullptr) {}
+ LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE)
+ : NumRuntimePointerChecks(0), UnsafeAlgebraInst(nullptr), ORE(ORE) {}
void addUnsafeAlgebraInst(Instruction *I) {
// First unsafe algebra instruction.
@@ -1825,13 +2045,15 @@ public:
void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; }
bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) {
- const char *Name = Hints.vectorizeAnalysisPassName();
+ const char *PassName = Hints.vectorizeAnalysisPassName();
bool Failed = false;
if (UnsafeAlgebraInst && !Hints.allowReordering()) {
- emitOptimizationRemarkAnalysisFPCommute(
- F->getContext(), Name, *F, UnsafeAlgebraInst->getDebugLoc(),
- VectorizationReport() << "cannot prove it is safe to reorder "
- "floating-point operations");
+ ORE.emit(
+ OptimizationRemarkAnalysisFPCommute(PassName, "CantReorderFPOps",
+ UnsafeAlgebraInst->getDebugLoc(),
+ UnsafeAlgebraInst->getParent())
+ << "loop not vectorized: cannot prove it is safe to reorder "
+ "floating-point operations");
Failed = true;
}
@@ -1842,10 +2064,11 @@ public:
NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
if ((ThresholdReached && !Hints.allowReordering()) ||
PragmaThresholdReached) {
- emitOptimizationRemarkAnalysisAliasing(
- F->getContext(), Name, *F, L->getStartLoc(),
- VectorizationReport()
- << "cannot prove it is safe to reorder memory operations");
+ ORE.emit(OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",
+ L->getStartLoc(),
+ L->getHeader())
+ << "loop not vectorized: cannot prove it is safe to reorder "
+ "memory operations");
DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
Failed = true;
}
@@ -1856,6 +2079,9 @@ public:
private:
unsigned NumRuntimePointerChecks;
Instruction *UnsafeAlgebraInst;
+
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter &ORE;
};
static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
@@ -1897,12 +2123,13 @@ struct LoopVectorize : public FunctionPass {
auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
+ auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
std::function<const LoopAccessInfo &(Loop &)> GetLAA =
[&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
- GetLAA);
+ GetLAA, *ORE);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -1917,6 +2144,7 @@ struct LoopVectorize : public FunctionPass {
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<LoopAccessLegacyAnalysis>();
AU.addRequired<DemandedBitsWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<BasicAAWrapperPass>();
@@ -1949,7 +2177,7 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
}
void InnerLoopVectorizer::createVectorIntInductionPHI(
- const InductionDescriptor &II, VectorParts &Entry, IntegerType *TruncType) {
+ const InductionDescriptor &II, Instruction *EntryVal) {
Value *Start = II.getStartValue();
ConstantInt *Step = II.getConstIntStepValue();
assert(Step && "Can not widen an IV with a non-constant step");
@@ -1957,7 +2185,8 @@ void InnerLoopVectorizer::createVectorIntInductionPHI(
// Construct the initial value of the vector IV in the vector loop preheader
auto CurrIP = Builder.saveIP();
Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
- if (TruncType) {
+ if (isa<TruncInst>(EntryVal)) {
+ auto *TruncType = cast<IntegerType>(EntryVal->getType());
Step = ConstantInt::getSigned(TruncType, Step->getSExtValue());
Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
}
@@ -1972,18 +2201,45 @@ void InnerLoopVectorizer::createVectorIntInductionPHI(
// factor. The last of those goes into the PHI.
PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
&*LoopVectorBody->getFirstInsertionPt());
- Value *LastInduction = VecInd;
+ Instruction *LastInduction = VecInd;
+ VectorParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
Entry[Part] = LastInduction;
- LastInduction = Builder.CreateAdd(LastInduction, SplatVF, "step.add");
+ LastInduction = cast<Instruction>(
+ Builder.CreateAdd(LastInduction, SplatVF, "step.add"));
}
+ VectorLoopValueMap.initVector(EntryVal, Entry);
+ if (isa<TruncInst>(EntryVal))
+ addMetadata(Entry, EntryVal);
+
+ // Move the last step to the end of the latch block. This ensures consistent
+ // placement of all induction updates.
+ auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
+ auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
+ auto *ICmp = cast<Instruction>(Br->getCondition());
+ LastInduction->moveBefore(ICmp);
+ LastInduction->setName("vec.ind.next");
VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
- VecInd->addIncoming(LastInduction, LoopVectorBody);
+ VecInd->addIncoming(LastInduction, LoopVectorLatch);
}
-void InnerLoopVectorizer::widenIntInduction(PHINode *IV, VectorParts &Entry,
- TruncInst *Trunc) {
+bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
+ return Legal->isScalarAfterVectorization(I) ||
+ Cost->isProfitableToScalarize(I, VF);
+}
+
+bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
+ if (shouldScalarizeInstruction(IV))
+ return true;
+ auto isScalarInst = [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
+ };
+ return any_of(IV->users(), isScalarInst);
+}
+
+void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) {
auto II = Legal->getInductionVars()->find(IV);
assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
@@ -1991,12 +2247,25 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, VectorParts &Entry,
auto ID = II->second;
assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
- // If a truncate instruction was provided, get the smaller type.
- auto *TruncType = Trunc ? cast<IntegerType>(Trunc->getType()) : nullptr;
+ // The scalar value to broadcast. This will be derived from the canonical
+ // induction variable.
+ Value *ScalarIV = nullptr;
// The step of the induction.
Value *Step = nullptr;
+ // The value from the original loop to which we are mapping the new induction
+ // variable.
+ Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
+
+ // True if we have vectorized the induction variable.
+ auto VectorizedIV = false;
+
+ // Determine if we want a scalar version of the induction variable. This is
+ // true if the induction variable itself is not widened, or if it has at
+ // least one user in the loop that is not widened.
+ auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
+
// If the induction variable has a constant integer step value, go ahead and
// get it now.
if (ID.getConstIntStepValue())
@@ -2006,40 +2275,50 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, VectorParts &Entry,
// create the phi node, we will splat the scalar induction variable in each
// loop iteration.
if (VF > 1 && IV->getType() == Induction->getType() && Step &&
- !ValuesNotWidened->count(IV))
- return createVectorIntInductionPHI(ID, Entry, TruncType);
-
- // The scalar value to broadcast. This will be derived from the canonical
- // induction variable.
- Value *ScalarIV = nullptr;
-
- // Define the scalar induction variable and step values. If we were given a
- // truncation type, truncate the canonical induction variable and constant
- // step. Otherwise, derive these values from the induction descriptor.
- if (TruncType) {
- assert(Step && "Truncation requires constant integer step");
- auto StepInt = cast<ConstantInt>(Step)->getSExtValue();
- ScalarIV = Builder.CreateCast(Instruction::Trunc, Induction, TruncType);
- Step = ConstantInt::getSigned(TruncType, StepInt);
- } else {
- ScalarIV = Induction;
- auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
- if (IV != OldInduction) {
- ScalarIV = Builder.CreateSExtOrTrunc(ScalarIV, IV->getType());
- ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL);
- ScalarIV->setName("offset.idx");
- }
- if (!Step) {
- SCEVExpander Exp(*PSE.getSE(), DL, "induction");
- Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
- &*Builder.GetInsertPoint());
+ !shouldScalarizeInstruction(EntryVal)) {
+ createVectorIntInductionPHI(ID, EntryVal);
+ VectorizedIV = true;
+ }
+
+ // If we haven't yet vectorized the induction variable, or if we will create
+ // a scalar one, we need to define the scalar induction variable and step
+ // values. If we were given a truncation type, truncate the canonical
+ // induction variable and constant step. Otherwise, derive these values from
+ // the induction descriptor.
+ if (!VectorizedIV || NeedsScalarIV) {
+ if (Trunc) {
+ auto *TruncType = cast<IntegerType>(Trunc->getType());
+ assert(Step && "Truncation requires constant integer step");
+ auto StepInt = cast<ConstantInt>(Step)->getSExtValue();
+ ScalarIV = Builder.CreateCast(Instruction::Trunc, Induction, TruncType);
+ Step = ConstantInt::getSigned(TruncType, StepInt);
+ } else {
+ ScalarIV = Induction;
+ auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
+ if (IV != OldInduction) {
+ ScalarIV = Builder.CreateSExtOrTrunc(ScalarIV, IV->getType());
+ ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL);
+ ScalarIV->setName("offset.idx");
+ }
+ if (!Step) {
+ SCEVExpander Exp(*PSE.getSE(), DL, "induction");
+ Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
+ &*Builder.GetInsertPoint());
+ }
}
}
- // Splat the scalar induction variable, and build the necessary step vectors.
- Value *Broadcasted = getBroadcastInstrs(ScalarIV);
- for (unsigned Part = 0; Part < UF; ++Part)
- Entry[Part] = getStepVector(Broadcasted, VF * Part, Step);
+ // If we haven't yet vectorized the induction variable, splat the scalar
+ // induction variable, and build the necessary step vectors.
+ if (!VectorizedIV) {
+ Value *Broadcasted = getBroadcastInstrs(ScalarIV);
+ VectorParts Entry(UF);
+ for (unsigned Part = 0; Part < UF; ++Part)
+ Entry[Part] = getStepVector(Broadcasted, VF * Part, Step);
+ VectorLoopValueMap.initVector(EntryVal, Entry);
+ if (Trunc)
+ addMetadata(Entry, Trunc);
+ }
// If an induction variable is only used for counting loop iterations or
// calculating addresses, it doesn't need to be widened. Create scalar steps
@@ -2047,38 +2326,64 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, VectorParts &Entry,
// addition of the scalar steps will not increase the number of instructions
// in the loop in the common case prior to InstCombine. We will be trading
// one vector extract for each scalar step.
- if (VF > 1 && ValuesNotWidened->count(IV)) {
- auto *EntryVal = Trunc ? cast<Value>(Trunc) : IV;
+ if (NeedsScalarIV)
buildScalarSteps(ScalarIV, Step, EntryVal);
- }
}
-Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
- Value *Step) {
+Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
+ Instruction::BinaryOps BinOp) {
+ // Create and check the types.
assert(Val->getType()->isVectorTy() && "Must be a vector");
- assert(Val->getType()->getScalarType()->isIntegerTy() &&
- "Elem must be an integer");
- assert(Step->getType() == Val->getType()->getScalarType() &&
- "Step has wrong type");
- // Create the types.
- Type *ITy = Val->getType()->getScalarType();
- VectorType *Ty = cast<VectorType>(Val->getType());
- int VLen = Ty->getNumElements();
+ int VLen = Val->getType()->getVectorNumElements();
+
+ Type *STy = Val->getType()->getScalarType();
+ assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
+ "Induction Step must be an integer or FP");
+ assert(Step->getType() == STy && "Step has wrong type");
+
SmallVector<Constant *, 8> Indices;
+ if (STy->isIntegerTy()) {
+ // Create a vector of consecutive numbers from zero to VF.
+ for (int i = 0; i < VLen; ++i)
+ Indices.push_back(ConstantInt::get(STy, StartIdx + i));
+
+ // Add the consecutive indices to the vector value.
+ Constant *Cv = ConstantVector::get(Indices);
+ assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
+ Step = Builder.CreateVectorSplat(VLen, Step);
+ assert(Step->getType() == Val->getType() && "Invalid step vec");
+ // FIXME: The newly created binary instructions should contain nsw/nuw flags,
+ // which can be found from the original scalar operations.
+ Step = Builder.CreateMul(Cv, Step);
+ return Builder.CreateAdd(Val, Step, "induction");
+ }
+
+ // Floating point induction.
+ assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
+ "Binary Opcode should be specified for FP induction");
// Create a vector of consecutive numbers from zero to VF.
for (int i = 0; i < VLen; ++i)
- Indices.push_back(ConstantInt::get(ITy, StartIdx + i));
+ Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
// Add the consecutive indices to the vector value.
Constant *Cv = ConstantVector::get(Indices);
- assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
+
Step = Builder.CreateVectorSplat(VLen, Step);
- assert(Step->getType() == Val->getType() && "Invalid step vec");
- // FIXME: The newly created binary instructions should contain nsw/nuw flags,
- // which can be found from the original scalar operations.
- Step = Builder.CreateMul(Cv, Step);
- return Builder.CreateAdd(Val, Step, "induction");
+
+ // Floating point operations had to be 'fast' to enable the induction.
+ FastMathFlags Flags;
+ Flags.setUnsafeAlgebra();
+
+ Value *MulOp = Builder.CreateFMul(Cv, Step);
+ if (isa<Instruction>(MulOp))
+    // Have to check; MulOp may have been folded to a constant.
+ cast<Instruction>(MulOp)->setFastMathFlags(Flags);
+
+ Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
+ if (isa<Instruction>(BOp))
+ cast<Instruction>(BOp)->setFastMathFlags(Flags);
+ return BOp;
}
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
@@ -2092,98 +2397,34 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
assert(ScalarIVTy->isIntegerTy() && ScalarIVTy == Step->getType() &&
"Val and Step should have the same integer type");
- // Compute the scalar steps and save the results in ScalarIVMap.
- for (unsigned Part = 0; Part < UF; ++Part)
- for (unsigned I = 0; I < VF; ++I) {
- auto *StartIdx = ConstantInt::get(ScalarIVTy, VF * Part + I);
+ // Determine the number of scalars we need to generate for each unroll
+ // iteration. If EntryVal is uniform, we only need to generate the first
+ // lane. Otherwise, we generate all VF values.
+ unsigned Lanes =
+ Legal->isUniformAfterVectorization(cast<Instruction>(EntryVal)) ? 1 : VF;
+
+ // Compute the scalar steps and save the results in VectorLoopValueMap.
+ ScalarParts Entry(UF);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Entry[Part].resize(VF);
+ for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
+ auto *StartIdx = ConstantInt::get(ScalarIVTy, VF * Part + Lane);
auto *Mul = Builder.CreateMul(StartIdx, Step);
auto *Add = Builder.CreateAdd(ScalarIV, Mul);
- ScalarIVMap[EntryVal].push_back(Add);
+ Entry[Part][Lane] = Add;
}
+ }
+ VectorLoopValueMap.initScalar(EntryVal, Entry);
}
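// Worked example (illustrative, assuming a unit step): with VF = 4 and
// UF = 2, a non-uniform induction gets the scalar steps
//   Part 0: iv + 0, iv + 1, iv + 2, iv + 3
//   Part 1: iv + 4, iv + 5, iv + 6, iv + 7
// while a uniform induction only materializes lane 0 of each part
// (iv + 0 and iv + 4).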
int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
- assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");
- auto *SE = PSE.getSE();
- // Make sure that the pointer does not point to structs.
- if (Ptr->getType()->getPointerElementType()->isAggregateType())
- return 0;
-
- // If this value is a pointer induction variable, we know it is consecutive.
- PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
- if (Phi && Inductions.count(Phi)) {
- InductionDescriptor II = Inductions[Phi];
- return II.getConsecutiveDirection();
- }
-
- GetElementPtrInst *Gep = getGEPInstruction(Ptr);
- if (!Gep)
- return 0;
-
- unsigned NumOperands = Gep->getNumOperands();
- Value *GpPtr = Gep->getPointerOperand();
- // If this GEP value is a consecutive pointer induction variable and all of
- // the indices are constant, then we know it is consecutive.
- Phi = dyn_cast<PHINode>(GpPtr);
- if (Phi && Inductions.count(Phi)) {
-
- // Make sure that the pointer does not point to structs.
- PointerType *GepPtrType = cast<PointerType>(GpPtr->getType());
- if (GepPtrType->getElementType()->isAggregateType())
- return 0;
-
- // Make sure that all of the index operands are loop invariant.
- for (unsigned i = 1; i < NumOperands; ++i)
- if (!SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop))
- return 0;
-
- InductionDescriptor II = Inductions[Phi];
- return II.getConsecutiveDirection();
- }
-
- unsigned InductionOperand = getGEPInductionOperand(Gep);
-
- // Check that all of the gep indices are uniform except for our induction
- // operand.
- for (unsigned i = 0; i != NumOperands; ++i)
- if (i != InductionOperand &&
- !SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop))
- return 0;
-
- // We can emit wide load/stores only if the last non-zero index is the
- // induction variable.
- const SCEV *Last = nullptr;
- if (!getSymbolicStrides() || !getSymbolicStrides()->count(Gep))
- Last = PSE.getSCEV(Gep->getOperand(InductionOperand));
- else {
- // Because of the multiplication by a stride we can have a s/zext cast.
- // We are going to replace this stride by 1 so the cast is safe to ignore.
- //
- // %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- // %0 = trunc i64 %indvars.iv to i32
- // %mul = mul i32 %0, %Stride1
- // %idxprom = zext i32 %mul to i64 << Safe cast.
- // %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom
- //
- Last = replaceSymbolicStrideSCEV(PSE, *getSymbolicStrides(),
- Gep->getOperand(InductionOperand), Gep);
- if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(Last))
- Last =
- (C->getSCEVType() == scSignExtend || C->getSCEVType() == scZeroExtend)
- ? C->getOperand()
- : Last;
- }
- if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
- const SCEV *Step = AR->getStepRecurrence(*SE);
- // The memory is consecutive because the last index is consecutive
- // and all other indices are loop invariant.
- if (Step->isOne())
- return 1;
- if (Step->isAllOnesValue())
- return -1;
- }
+ const ValueToValueMap &Strides = getSymbolicStrides() ? *getSymbolicStrides() :
+ ValueToValueMap();
+ int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false);
+ if (Stride == 1 || Stride == -1)
+ return Stride;
return 0;
}
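// Examples (illustrative): for an access like A[i] advancing by one element
// per iteration, isConsecutivePtr returns 1; for a reversed access such as
// A[N - i] it returns -1; any other stride, including gather/scatter style
// addressing, yields 0.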
@@ -2191,23 +2432,112 @@ bool LoopVectorizationLegality::isUniform(Value *V) {
return LAI->isUniform(V);
}
-InnerLoopVectorizer::VectorParts &
+const InnerLoopVectorizer::VectorParts &
InnerLoopVectorizer::getVectorValue(Value *V) {
assert(V != Induction && "The new induction variable should not be used.");
assert(!V->getType()->isVectorTy() && "Can't widen a vector");
+ assert(!V->getType()->isVoidTy() && "Type does not produce a value");
// If we have a stride that is replaced by one, do it here.
if (Legal->hasStride(V))
V = ConstantInt::get(V->getType(), 1);
// If we have this scalar in the map, return it.
- if (WidenMap.has(V))
- return WidenMap.get(V);
+ if (VectorLoopValueMap.hasVector(V))
+ return VectorLoopValueMap.VectorMapStorage[V];
+
+ // If the value has not been vectorized, check if it has been scalarized
+ // instead. If it has been scalarized, and we actually need the value in
+ // vector form, we will construct the vector values on demand.
+ if (VectorLoopValueMap.hasScalar(V)) {
+
+ // Initialize a new vector map entry.
+ VectorParts Entry(UF);
+
+ // If we've scalarized a value, that value should be an instruction.
+ auto *I = cast<Instruction>(V);
+
+ // If we aren't vectorizing, we can just copy the scalar map values over to
+ // the vector map.
+ if (VF == 1) {
+ for (unsigned Part = 0; Part < UF; ++Part)
+ Entry[Part] = getScalarValue(V, Part, 0);
+ return VectorLoopValueMap.initVector(V, Entry);
+ }
+
+ // Get the last scalar instruction we generated for V. If the value is
+ // known to be uniform after vectorization, this corresponds to lane zero
+ // of the last unroll iteration. Otherwise, the last instruction is the one
+ // we created for the last vector lane of the last unroll iteration.
+ unsigned LastLane = Legal->isUniformAfterVectorization(I) ? 0 : VF - 1;
+ auto *LastInst = cast<Instruction>(getScalarValue(V, UF - 1, LastLane));
+
+ // Set the insert point after the last scalarized instruction. This ensures
+ // the insertelement sequence will directly follow the scalar definitions.
+ auto OldIP = Builder.saveIP();
+ auto NewIP = std::next(BasicBlock::iterator(LastInst));
+ Builder.SetInsertPoint(&*NewIP);
+
+ // However, if we are vectorizing, we need to construct the vector values.
+ // If the value is known to be uniform after vectorization, we can just
+ // broadcast the scalar value corresponding to lane zero for each unroll
+ // iteration. Otherwise, we construct the vector values using insertelement
+ // instructions. Since the resulting vectors are stored in
+ // VectorLoopValueMap, we will only generate the insertelements once.
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *VectorValue = nullptr;
+ if (Legal->isUniformAfterVectorization(I)) {
+ VectorValue = getBroadcastInstrs(getScalarValue(V, Part, 0));
+ } else {
+ VectorValue = UndefValue::get(VectorType::get(V->getType(), VF));
+ for (unsigned Lane = 0; Lane < VF; ++Lane)
+ VectorValue = Builder.CreateInsertElement(
+ VectorValue, getScalarValue(V, Part, Lane),
+ Builder.getInt32(Lane));
+ }
+ Entry[Part] = VectorValue;
+ }
+ Builder.restoreIP(OldIP);
+ return VectorLoopValueMap.initVector(V, Entry);
+ }
// If this scalar is unknown, assume that it is a constant or that it is
// loop invariant. Broadcast V and save the value for future uses.
Value *B = getBroadcastInstrs(V);
- return WidenMap.splat(V, B);
+ return VectorLoopValueMap.initVector(V, VectorParts(UF, B));
+}
+
+Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part,
+ unsigned Lane) {
+
+ // If the value is not an instruction contained in the loop, it should
+ // already be scalar.
+ if (OrigLoop->isLoopInvariant(V))
+ return V;
+
+ assert(Lane > 0 ? !Legal->isUniformAfterVectorization(cast<Instruction>(V))
+ : true && "Uniform values only have lane zero");
+
+ // If the value from the original loop has not been vectorized, it is
+ // represented by UF x VF scalar values in the new loop. Return the requested
+ // scalar value.
+ if (VectorLoopValueMap.hasScalar(V))
+ return VectorLoopValueMap.ScalarMapStorage[V][Part][Lane];
+
+ // If the value has not been scalarized, get its entry in VectorLoopValueMap
+ // for the given unroll part. If this entry is not a vector type (i.e., the
+ // vectorization factor is one), there is no need to generate an
+ // extractelement instruction.
+ auto *U = getVectorValue(V)[Part];
+ if (!U->getType()->isVectorTy()) {
+ assert(VF == 1 && "Value not scalarized has non-vector type");
+ return U;
+ }
+
+ // Otherwise, the value from the original loop has been vectorized and is
+ // represented by UF vector values. Extract and return the requested scalar
+ // value from the appropriate vector lane.
+ return Builder.CreateExtractElement(U, Builder.getInt32(Lane));
}
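
For orientation, a minimal standalone sketch (illustrative only, not taken from the patch) of the two-level scalar map that getScalarValue() indexes: UF unroll parts, each holding VF lanes, with plain integers standing in for llvm::Value pointers.

  #include <cassert>
  #include <vector>

  // Hypothetical stand-in for the per-value scalar storage: one vector of VF
  // lanes per unroll part, mirroring ScalarMapStorage[V][Part][Lane].
  using ScalarParts = std::vector<std::vector<int>>;

  int getScalarLane(const ScalarParts &Parts, unsigned Part, unsigned Lane) {
    assert(Part < Parts.size() && Lane < Parts[Part].size());
    return Parts[Part][Lane];
  }

  int main() {
    unsigned UF = 2, VF = 4;
    ScalarParts Parts(UF, std::vector<int>(VF));
    // Fill lane L of part P with the scalar for iteration P * VF + L.
    for (unsigned P = 0; P < UF; ++P)
      for (unsigned L = 0; L < VF; ++L)
        Parts[P][L] = P * VF + L;
    // The "last" scalar, used above to place the insertelement sequence, is
    // lane VF - 1 of part UF - 1 (lane 0 for uniform values).
    assert(getScalarLane(Parts, UF - 1, VF - 1) == 7);
    return 0;
  }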
Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
@@ -2355,7 +2685,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
LoadInst *LI = dyn_cast<LoadInst>(Instr);
StoreInst *SI = dyn_cast<StoreInst>(Instr);
- Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
+ Value *Ptr = getPointerOperand(Instr);
// Prepare for the vector type of the interleaved load/store.
Type *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
@@ -2365,15 +2695,20 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
// Prepare for the new pointers.
setDebugLocFromInst(Builder, Ptr);
- VectorParts &PtrParts = getVectorValue(Ptr);
SmallVector<Value *, 2> NewPtrs;
unsigned Index = Group->getIndex(Instr);
+
+ // If the group is reverse, adjust the index to refer to the last vector lane
+ // instead of the first. We adjust the index from the first vector lane,
+ // rather than directly getting the pointer for lane VF - 1, because the
+ // pointer operand of the interleaved access is supposed to be uniform. For
+ // uniform instructions, we're only required to generate a value for the
+ // first vector lane in each unroll iteration.
+ if (Group->isReverse())
+ Index += (VF - 1) * Group->getFactor();
+
for (unsigned Part = 0; Part < UF; Part++) {
- // Extract the pointer for current instruction from the pointer vector. A
- // reverse access uses the pointer in the last lane.
- Value *NewPtr = Builder.CreateExtractElement(
- PtrParts[Part],
- Group->isReverse() ? Builder.getInt32(VF - 1) : Builder.getInt32(0));
+ Value *NewPtr = getScalarValue(Ptr, Part, 0);
// Note that the current instruction could be at any index within the group.
// We need to adjust the address to the member at index 0.
@@ -2397,20 +2732,30 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
// Vectorize the interleaved load group.
if (LI) {
+
+ // For each unroll part, create a wide load for the group.
+ SmallVector<Value *, 2> NewLoads;
for (unsigned Part = 0; Part < UF; Part++) {
- Instruction *NewLoadInstr = Builder.CreateAlignedLoad(
+ auto *NewLoad = Builder.CreateAlignedLoad(
NewPtrs[Part], Group->getAlignment(), "wide.vec");
+ addMetadata(NewLoad, Instr);
+ NewLoads.push_back(NewLoad);
+ }
- for (unsigned i = 0; i < InterleaveFactor; i++) {
- Instruction *Member = Group->getMember(i);
+ // For each member in the group, shuffle out the appropriate data from the
+ // wide loads.
+ for (unsigned I = 0; I < InterleaveFactor; ++I) {
+ Instruction *Member = Group->getMember(I);
- // Skip the gaps in the group.
- if (!Member)
- continue;
+ // Skip the gaps in the group.
+ if (!Member)
+ continue;
- Constant *StrideMask = getStridedMask(Builder, i, InterleaveFactor, VF);
+ VectorParts Entry(UF);
+ Constant *StrideMask = getStridedMask(Builder, I, InterleaveFactor, VF);
+ for (unsigned Part = 0; Part < UF; Part++) {
Value *StridedVec = Builder.CreateShuffleVector(
- NewLoadInstr, UndefVec, StrideMask, "strided.vec");
+ NewLoads[Part], UndefVec, StrideMask, "strided.vec");
// If this member has different type, cast the result type.
if (Member->getType() != ScalarTy) {
@@ -2418,12 +2763,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy);
}
- VectorParts &Entry = WidenMap.get(Member);
Entry[Part] =
Group->isReverse() ? reverseVector(StridedVec) : StridedVec;
}
-
- addMetadata(NewLoadInstr, Instr);
+ VectorLoopValueMap.initVector(Member, Entry);
}
return;
}
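
The stride-mask and reverse-index arithmetic used for interleaved groups can be checked in isolation; the following standalone sketch (illustrative, with assumed factor and VF values) reproduces both computations with standard containers.

  #include <cassert>
  #include <vector>

  // Build the mask {I, I + F, I + 2F, ...} that pulls member I out of a wide
  // load of an interleave group with factor F and vectorization factor VF.
  std::vector<unsigned> stridedMask(unsigned I, unsigned F, unsigned VF) {
    std::vector<unsigned> Mask;
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      Mask.push_back(I + Lane * F);
    return Mask;
  }

  int main() {
    // Example: factor 2, VF 4. Member 1 comes from elements 1, 3, 5, 7.
    assert(stridedMask(1, 2, 4) == (std::vector<unsigned>{1, 3, 5, 7}));
    // Reverse-group index adjustment: with VF = 4 and factor 2, the lane-zero
    // pointer is moved (VF - 1) * F = 6 members forward so that lane 0 of the
    // reversed access addresses the last vector iteration.
    unsigned Index = 0, VF = 4, Factor = 2;
    Index += (VF - 1) * Factor;
    assert(Index == 6);
    return 0;
  }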
@@ -2479,7 +2822,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
Type *DataTy = VectorType::get(ScalarDataTy, VF);
- Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
+ Value *Ptr = getPointerOperand(Instr);
unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
// An alignment of 0 means target abi alignment. We need to use the scalar's
// target abi alignment in such a case.
@@ -2487,93 +2830,57 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
if (!Alignment)
Alignment = DL.getABITypeAlignment(ScalarDataTy);
unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
- uint64_t ScalarAllocatedSize = DL.getTypeAllocSize(ScalarDataTy);
- uint64_t VectorElementSize = DL.getTypeStoreSize(DataTy) / VF;
- if (SI && Legal->blockNeedsPredication(SI->getParent()) &&
- !Legal->isMaskRequired(SI))
- return scalarizeInstruction(Instr, true);
+ // Scalarize the memory instruction if necessary.
+ if (Legal->memoryInstructionMustBeScalarized(Instr, VF))
+ return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr));
- if (ScalarAllocatedSize != VectorElementSize)
- return scalarizeInstruction(Instr);
-
- // If the pointer is loop invariant scalarize the load.
- if (LI && Legal->isUniform(Ptr))
- return scalarizeInstruction(Instr);
-
- // If the pointer is non-consecutive and gather/scatter is not supported
- // scalarize the instruction.
+ // Determine if the pointer operand of the access is either consecutive or
+ // reverse consecutive.
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
bool Reverse = ConsecutiveStride < 0;
- bool CreateGatherScatter =
- !ConsecutiveStride && ((LI && Legal->isLegalMaskedGather(ScalarDataTy)) ||
- (SI && Legal->isLegalMaskedScatter(ScalarDataTy)));
- if (!ConsecutiveStride && !CreateGatherScatter)
- return scalarizeInstruction(Instr);
+ // Determine if either a gather or scatter operation is legal.
+ bool CreateGatherScatter =
+ !ConsecutiveStride && Legal->isLegalGatherOrScatter(Instr);
- Constant *Zero = Builder.getInt32(0);
- VectorParts &Entry = WidenMap.get(Instr);
VectorParts VectorGep;
// Handle consecutive loads/stores.
GetElementPtrInst *Gep = getGEPInstruction(Ptr);
if (ConsecutiveStride) {
- if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
- setDebugLocFromInst(Builder, Gep);
- Value *PtrOperand = Gep->getPointerOperand();
- Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
- FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero);
-
- // Create the new GEP with the new induction variable.
- GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
- Gep2->setOperand(0, FirstBasePtr);
- Gep2->setName("gep.indvar.base");
- Ptr = Builder.Insert(Gep2);
- } else if (Gep) {
- setDebugLocFromInst(Builder, Gep);
- assert(PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getPointerOperand()),
- OrigLoop) &&
- "Base ptr must be invariant");
- // The last index does not have to be the induction. It can be
- // consecutive and be a function of the index. For example A[I+1];
+ if (Gep) {
unsigned NumOperands = Gep->getNumOperands();
- unsigned InductionOperand = getGEPInductionOperand(Gep);
- // Create the new GEP with the new induction variable.
+#ifndef NDEBUG
+ // The original GEP that was identified as a consecutive memory access
+ // should have only one loop-variant operand.
+ unsigned NumOfLoopVariantOps = 0;
+ for (unsigned i = 0; i < NumOperands; ++i)
+ if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)),
+ OrigLoop))
+ NumOfLoopVariantOps++;
+ assert(NumOfLoopVariantOps == 1 &&
+ "Consecutive GEP should have only one loop-variant operand");
+#endif
GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
-
- for (unsigned i = 0; i < NumOperands; ++i) {
- Value *GepOperand = Gep->getOperand(i);
- Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand);
-
- // Update last index or loop invariant instruction anchored in loop.
- if (i == InductionOperand ||
- (GepOperandInst && OrigLoop->contains(GepOperandInst))) {
- assert((i == InductionOperand ||
- PSE.getSE()->isLoopInvariant(PSE.getSCEV(GepOperandInst),
- OrigLoop)) &&
- "Must be last index or loop invariant");
-
- VectorParts &GEPParts = getVectorValue(GepOperand);
-
- // If GepOperand is an induction variable, and there's a scalarized
- // version of it available, use it. Otherwise, we will need to create
- // an extractelement instruction.
- Value *Index = ScalarIVMap.count(GepOperand)
- ? ScalarIVMap[GepOperand][0]
- : Builder.CreateExtractElement(GEPParts[0], Zero);
-
- Gep2->setOperand(i, Index);
- Gep2->setName("gep.indvar.idx");
- }
- }
+ Gep2->setName("gep.indvar");
+
+ // A new GEP is created for the lane-zero value of the first unroll iteration.
+ // The GEPs for the rest of the unroll iterations are computed below as an
+ // offset from this GEP.
+ for (unsigned i = 0; i < NumOperands; ++i)
+ // We can apply getScalarValue() to all GEP indices. It returns the
+ // original value for a loop-invariant operand and the lane-zero value for a
+ // consecutive operand.
+ Gep2->setOperand(i, getScalarValue(Gep->getOperand(i),
+ 0, /* First unroll iteration */
+ 0 /* 0-lane of the vector */ ));
+ setDebugLocFromInst(Builder, Gep);
Ptr = Builder.Insert(Gep2);
+
} else { // No GEP
- // Use the induction element ptr.
- assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
setDebugLocFromInst(Builder, Ptr);
- VectorParts &PtrVal = getVectorValue(Ptr);
- Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
+ Ptr = getScalarValue(Ptr, 0, 0);
}
} else {
// At this point we should have the vector version of the GEP for Gather or Scatter
@@ -2660,6 +2967,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
// Handle loads.
assert(LI && "Must have a load instruction");
setDebugLocFromInst(Builder, LI);
+ VectorParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
Instruction *NewLI;
if (CreateGatherScatter) {
@@ -2692,70 +3000,45 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
}
addMetadata(NewLI, LI);
}
+ VectorLoopValueMap.initVector(Instr, Entry);
}
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
- bool IfPredicateStore) {
+ bool IfPredicateInstr) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
+ DEBUG(dbgs() << "LV: Scalarizing"
+ << (IfPredicateInstr ? " and predicating:" : ":") << *Instr
+ << '\n');
// Holds vector parameters or scalars, in case of uniform vals.
SmallVector<VectorParts, 4> Params;
setDebugLocFromInst(Builder, Instr);
- // Find all of the vectorized parameters.
- for (Value *SrcOp : Instr->operands()) {
- // If we are accessing the old induction variable, use the new one.
- if (SrcOp == OldInduction) {
- Params.push_back(getVectorValue(SrcOp));
- continue;
- }
-
- // Try using previously calculated values.
- auto *SrcInst = dyn_cast<Instruction>(SrcOp);
-
- // If the src is an instruction that appeared earlier in the basic block,
- // then it should already be vectorized.
- if (SrcInst && OrigLoop->contains(SrcInst)) {
- assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
- // The parameter is a vector value from earlier.
- Params.push_back(WidenMap.get(SrcInst));
- } else {
- // The parameter is a scalar from outside the loop. Maybe even a constant.
- VectorParts Scalars;
- Scalars.append(UF, SrcOp);
- Params.push_back(Scalars);
- }
- }
-
- assert(Params.size() == Instr->getNumOperands() &&
- "Invalid number of operands");
-
// Does this instruction return a value?
bool IsVoidRetTy = Instr->getType()->isVoidTy();
- Value *UndefVec =
- IsVoidRetTy ? nullptr
- : UndefValue::get(VectorType::get(Instr->getType(), VF));
- // Create a new entry in the WidenMap and initialize it to Undef or Null.
- VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
+ // Initialize a new scalar map entry.
+ ScalarParts Entry(UF);
VectorParts Cond;
- if (IfPredicateStore) {
- assert(Instr->getParent()->getSinglePredecessor() &&
- "Only support single predecessor blocks");
- Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
- Instr->getParent());
- }
+ if (IfPredicateInstr)
+ Cond = createBlockInMask(Instr->getParent());
+
+ // Determine the number of scalars we need to generate for each unroll
+ // iteration. If the instruction is uniform, we only need to generate the
+ // first lane. Otherwise, we generate all VF values.
+ unsigned Lanes = Legal->isUniformAfterVectorization(Instr) ? 1 : VF;
// For each vector unroll 'part':
for (unsigned Part = 0; Part < UF; ++Part) {
+ Entry[Part].resize(VF);
// For each scalar that we create:
- for (unsigned Width = 0; Width < VF; ++Width) {
+ for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
// Start if-block.
Value *Cmp = nullptr;
- if (IfPredicateStore) {
- Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width));
+ if (IfPredicateInstr) {
+ Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Lane));
Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp,
ConstantInt::get(Cmp->getType(), 1));
}
@@ -2763,18 +3046,11 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
Instruction *Cloned = Instr->clone();
if (!IsVoidRetTy)
Cloned->setName(Instr->getName() + ".cloned");
- // Replace the operands of the cloned instructions with extracted scalars.
- for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
- // If the operand is an induction variable, and there's a scalarized
- // version of it available, use it. Otherwise, we will need to create
- // an extractelement instruction if vectorizing.
- auto *NewOp = Params[op][Part];
- auto *ScalarOp = Instr->getOperand(op);
- if (ScalarIVMap.count(ScalarOp))
- NewOp = ScalarIVMap[ScalarOp][VF * Part + Width];
- else if (NewOp->getType()->isVectorTy())
- NewOp = Builder.CreateExtractElement(NewOp, Builder.getInt32(Width));
+ // Replace the operands of the cloned instructions with their scalar
+ // equivalents in the new loop.
+ for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+ auto *NewOp = getScalarValue(Instr->getOperand(op), Part, Lane);
Cloned->setOperand(op, NewOp);
}
addNewMetadata(Cloned, Instr);
@@ -2782,22 +3058,20 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
// Place the cloned scalar in the new loop.
Builder.Insert(Cloned);
+ // Add the cloned scalar to the scalar map entry.
+ Entry[Part][Lane] = Cloned;
+
// If we just cloned a new assumption, add it the assumption cache.
if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
if (II->getIntrinsicID() == Intrinsic::assume)
AC->registerAssumption(II);
- // If the original scalar returns a value we need to place it in a vector
- // so that future users will be able to use it.
- if (!IsVoidRetTy)
- VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
- Builder.getInt32(Width));
// End if-block.
- if (IfPredicateStore)
- PredicatedStores.push_back(
- std::make_pair(cast<StoreInst>(Cloned), Cmp));
+ if (IfPredicateInstr)
+ PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp));
}
}
+ VectorLoopValueMap.initScalar(Instr, Entry);
}
PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
@@ -2811,10 +3085,12 @@ PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
Latch = Header;
IRBuilder<> Builder(&*Header->getFirstInsertionPt());
- setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction));
+ Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
+ setDebugLocFromInst(Builder, OldInst);
auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
Builder.SetInsertPoint(Latch->getTerminator());
+ setDebugLocFromInst(Builder, OldInst);
// Create i+1 and fill the PHINode.
Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
@@ -3146,14 +3422,16 @@ void InnerLoopVectorizer::createEmptyLoop() {
// Create phi nodes to merge from the backedge-taken check block.
PHINode *BCResumeVal = PHINode::Create(
OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
- Value *EndValue;
+ Value *&EndValue = IVEndValues[OrigPhi];
if (OrigPhi == OldInduction) {
// We know what the end value is.
EndValue = CountRoundDown;
} else {
IRBuilder<> B(LoopBypassBlocks.back()->getTerminator());
- Value *CRD = B.CreateSExtOrTrunc(CountRoundDown,
- II.getStep()->getType(), "cast.crd");
+ Type *StepType = II.getStep()->getType();
+ Instruction::CastOps CastOp =
+ CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
+ Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
EndValue = II.transform(B, CRD, PSE.getSE(), DL);
EndValue->setName("ind.end");
@@ -3163,9 +3441,6 @@ void InnerLoopVectorizer::createEmptyLoop() {
// or the value at the end of the vectorized loop.
BCResumeVal->addIncoming(EndValue, MiddleBlock);
- // Fix up external users of the induction variable.
- fixupIVUsers(OrigPhi, II, CountRoundDown, EndValue, MiddleBlock);
-
// Fix the scalar body counter (PHI node).
unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
@@ -3201,7 +3476,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
if (MDNode *LID = OrigLoop->getLoopID())
Lp->setLoopID(LID);
- LoopVectorizeHints Hints(Lp, true);
+ LoopVectorizeHints Hints(Lp, true, *ORE);
Hints.setAlreadyVectorized();
}
@@ -3324,8 +3599,9 @@ static Value *addFastMathFlag(Value *V) {
return V;
}
-/// Estimate the overhead of scalarizing a value. Insert and Extract are set if
-/// the result needs to be inserted and/or extracted from vectors.
+/// \brief Estimate the overhead of scalarizing a value based on its type.
+/// Insert and Extract are set if the result needs to be inserted and/or
+/// extracted from vectors.
static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract,
const TargetTransformInfo &TTI) {
if (Ty->isVoidTy())
@@ -3335,15 +3611,46 @@ static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract,
unsigned Cost = 0;
for (unsigned I = 0, E = Ty->getVectorNumElements(); I < E; ++I) {
- if (Insert)
- Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, I);
if (Extract)
Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, I);
+ if (Insert)
+ Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, I);
}
return Cost;
}
+/// \brief Estimate the overhead of scalarizing an Instruction based on the
+/// types of its operands and return value.
+static unsigned getScalarizationOverhead(SmallVectorImpl<Type *> &OpTys,
+ Type *RetTy,
+ const TargetTransformInfo &TTI) {
+ unsigned ScalarizationCost =
+ getScalarizationOverhead(RetTy, true, false, TTI);
+
+ for (Type *Ty : OpTys)
+ ScalarizationCost += getScalarizationOverhead(Ty, false, true, TTI);
+
+ return ScalarizationCost;
+}
+
+/// \brief Estimate the overhead of scalarizing an instruction. This is a
+/// convenience wrapper for the type-based getScalarizationOverhead API.
+static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
+ const TargetTransformInfo &TTI) {
+ if (VF == 1)
+ return 0;
+
+ Type *RetTy = ToVectorTy(I->getType(), VF);
+
+ SmallVector<Type *, 4> OpTys;
+ unsigned OperandsNum = I->getNumOperands();
+ for (unsigned OpInd = 0; OpInd < OperandsNum; ++OpInd)
+ OpTys.push_back(ToVectorTy(I->getOperand(OpInd)->getType(), VF));
+
+ return getScalarizationOverhead(OpTys, RetTy, TTI);
+}
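+
+As a rough illustration of the convenience wrapper above, the sketch below assumes a flat per-element cost of 1 for insertelement and extractelement; the real costs come from TTI and are target-specific.
+
  #include <cassert>

  // Simplified analogue of getScalarizationOverhead(I, VF, TTI): the result is
  // packed back into a vector (VF inserts) and every operand is unpacked from a
  // vector (VF extracts per operand). A per-element cost of 1 is an assumption.
  unsigned scalarizationOverhead(unsigned NumOperands, unsigned VF) {
    if (VF == 1)
      return 0;
    unsigned InsertCost = VF;                // pack the return value
    unsigned ExtractCost = NumOperands * VF; // unpack each operand
    return InsertCost + ExtractCost;
  }

  int main() {
    // A two-operand instruction at VF = 4: 4 inserts + 8 extracts = 12.
    assert(scalarizationOverhead(2, 4) == 12);
    return 0;
  }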
+
// Estimate cost of a call instruction CI if it were vectorized with factor VF.
// Return the cost of the instruction, including scalarization overhead if it's
// needed. The flag NeedToScalarize shows if the call needs to be scalarized -
@@ -3374,10 +3681,7 @@ static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
// Compute costs of unpacking argument values for the scalar calls and
// packing the return values to a vector.
- unsigned ScalarizationCost =
- getScalarizationOverhead(RetTy, true, false, TTI);
- for (Type *Ty : Tys)
- ScalarizationCost += getScalarizationOverhead(Ty, false, true, TTI);
+ unsigned ScalarizationCost = getScalarizationOverhead(Tys, RetTy, TTI);
unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
@@ -3434,8 +3738,13 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
// later and will remove any ext/trunc pairs.
//
SmallPtrSet<Value *, 4> Erased;
- for (const auto &KV : *MinBWs) {
- VectorParts &Parts = WidenMap.get(KV.first);
+ for (const auto &KV : Cost->getMinimalBitwidths()) {
+ // If the value wasn't vectorized, we must maintain the original scalar
+ // type. The absence of the value from VectorLoopValueMap indicates that it
+ // wasn't vectorized.
+ if (!VectorLoopValueMap.hasVector(KV.first))
+ continue;
+ VectorParts &Parts = VectorLoopValueMap.getVector(KV.first);
for (Value *&I : Parts) {
if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
continue;
@@ -3526,8 +3835,13 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
}
// We'll have created a bunch of ZExts that are now parentless. Clean up.
- for (const auto &KV : *MinBWs) {
- VectorParts &Parts = WidenMap.get(KV.first);
+ for (const auto &KV : Cost->getMinimalBitwidths()) {
+ // If the value wasn't vectorized, we must maintain the original scalar
+ // type. The absence of the value from VectorLoopValueMap indicates that it
+ // wasn't vectorized.
+ if (!VectorLoopValueMap.hasVector(KV.first))
+ continue;
+ VectorParts &Parts = VectorLoopValueMap.getVector(KV.first);
for (Value *&I : Parts) {
ZExtInst *Inst = dyn_cast<ZExtInst>(I);
if (Inst && Inst->use_empty()) {
@@ -3558,6 +3872,11 @@ void InnerLoopVectorizer::vectorizeLoop() {
// are vectorized, so we can use them to construct the PHI.
PhiVector PHIsToFix;
+ // Collect instructions from the original loop that will become trivially
+ // dead in the vectorized loop. We don't need to vectorize these
+ // instructions.
+ collectTriviallyDeadInstructions();
+
// Scan the loop in a topological order to ensure that defs are vectorized
// before users.
LoopBlocksDFS DFS(OrigLoop);
@@ -3605,7 +3924,7 @@ void InnerLoopVectorizer::vectorizeLoop() {
Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
// This is the vector-clone of the value that leaves the loop.
- VectorParts &VectorExit = getVectorValue(LoopExitInst);
+ const VectorParts &VectorExit = getVectorValue(LoopExitInst);
Type *VecTy = VectorExit[0]->getType();
// Find the reduction identity variable. Zero for addition, or, xor,
@@ -3644,10 +3963,10 @@ void InnerLoopVectorizer::vectorizeLoop() {
// Reductions do not have to start at zero. They can start with
// any loop invariant values.
- VectorParts &VecRdxPhi = WidenMap.get(Phi);
+ const VectorParts &VecRdxPhi = getVectorValue(Phi);
BasicBlock *Latch = OrigLoop->getLoopLatch();
Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
- VectorParts &Val = getVectorValue(LoopVal);
+ const VectorParts &Val = getVectorValue(LoopVal);
for (unsigned part = 0; part < UF; ++part) {
// Make sure to add the reduction stat value only to the
// first unroll part.
@@ -3664,7 +3983,7 @@ void InnerLoopVectorizer::vectorizeLoop() {
// instructions.
Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
- VectorParts RdxParts = getVectorValue(LoopExitInst);
+ VectorParts &RdxParts = VectorLoopValueMap.getVector(LoopExitInst);
setDebugLocFromInst(Builder, LoopExitInst);
// If the vector reduction can be performed in a smaller type, we truncate
@@ -3792,22 +4111,25 @@ void InnerLoopVectorizer::vectorizeLoop() {
Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
} // end of for each Phi in PHIsToFix.
- fixLCSSAPHIs();
-
- // Make sure DomTree is updated.
+ // Update the dominator tree.
+ //
+ // FIXME: After creating the structure of the new loop, the dominator tree is
+ // no longer up-to-date, and it remains that way until we update it
+ // here. An out-of-date dominator tree is problematic for SCEV,
+ // because SCEVExpander uses it to guide code generation. The
+ // vectorizer uses SCEVExpander in several places. Instead, we should
+ // keep the dominator tree up-to-date as we go.
updateAnalysis();
- // Predicate any stores.
- for (auto KV : PredicatedStores) {
- BasicBlock::iterator I(KV.first);
- auto *BB = SplitBlock(I->getParent(), &*std::next(I), DT, LI);
- auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false,
- /*BranchWeights=*/nullptr, DT, LI);
- I->moveBefore(T);
- I->getParent()->setName("pred.store.if");
- BB->setName("pred.store.continue");
- }
- DEBUG(DT->verifyDomTree());
+ // Fix-up external users of the induction variables.
+ for (auto &Entry : *Legal->getInductionVars())
+ fixupIVUsers(Entry.first, Entry.second,
+ getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
+ IVEndValues[Entry.first], LoopMiddleBlock);
+
+ fixLCSSAPHIs();
+ predicateInstructions();
+
// Remove redundant induction instructions.
cse(LoopVectorBody);
}
@@ -3882,7 +4204,7 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// We constructed a temporary phi node in the first phase of vectorization.
// This phi node will eventually be deleted.
- auto &PhiParts = getVectorValue(Phi);
+ VectorParts &PhiParts = VectorLoopValueMap.getVector(Phi);
Builder.SetInsertPoint(cast<Instruction>(PhiParts[0]));
// Create a phi node for the new recurrence. The current value will either be
@@ -3974,10 +4296,217 @@ void InnerLoopVectorizer::fixLCSSAPHIs() {
}
}
+void InnerLoopVectorizer::collectTriviallyDeadInstructions() {
+ BasicBlock *Latch = OrigLoop->getLoopLatch();
+
+ // We create new control-flow for the vectorized loop, so the original
+ // condition will be dead after vectorization if it's only used by the
+ // branch.
+ auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
+ if (Cmp && Cmp->hasOneUse())
+ DeadInstructions.insert(Cmp);
+
+ // We create new "steps" for induction variable updates to which the original
+ // induction variables map. An original update instruction will be dead if
+ // all its users except the induction variable are dead.
+ for (auto &Induction : *Legal->getInductionVars()) {
+ PHINode *Ind = Induction.first;
+ auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+ if (all_of(IndUpdate->users(), [&](User *U) -> bool {
+ return U == Ind || DeadInstructions.count(cast<Instruction>(U));
+ }))
+ DeadInstructions.insert(IndUpdate);
+ }
+}
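
The "all remaining users are dead" test used for induction updates can be illustrated in isolation; the sketch below mirrors the all_of check with standard containers and invented integer ids.

  #include <algorithm>
  #include <cassert>
  #include <set>
  #include <vector>

  // An induction update is dead if every user is either the induction phi
  // itself or an instruction already known to be dead.
  bool updateIsDead(const std::vector<int> &Users, int IndPhi,
                    const std::set<int> &Dead) {
    return std::all_of(Users.begin(), Users.end(), [&](int U) {
      return U == IndPhi || Dead.count(U);
    });
  }

  int main() {
    int IndPhi = 1;
    std::set<int> Dead = {7};        // e.g. the latch compare
    std::vector<int> Users = {1, 7}; // the phi and the dead compare
    assert(updateIsDead(Users, IndPhi, Dead));
    Users.push_back(9);              // a live user keeps the update alive
    assert(!updateIsDead(Users, IndPhi, Dead));
    return 0;
  }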
+
+void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
+
+ // The basic block and loop containing the predicated instruction.
+ auto *PredBB = PredInst->getParent();
+ auto *VectorLoop = LI->getLoopFor(PredBB);
+
+ // Initialize a worklist with the operands of the predicated instruction.
+ SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
+
+ // Holds instructions that we need to analyze again. An instruction may be
+ // reanalyzed if we don't yet know if we can sink it or not.
+ SmallVector<Instruction *, 8> InstsToReanalyze;
+
+ // Returns true if a given use occurs in the predicated block. Phi nodes use
+ // their operands in their corresponding predecessor blocks.
+ auto isBlockOfUsePredicated = [&](Use &U) -> bool {
+ auto *I = cast<Instruction>(U.getUser());
+ BasicBlock *BB = I->getParent();
+ if (auto *Phi = dyn_cast<PHINode>(I))
+ BB = Phi->getIncomingBlock(
+ PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
+ return BB == PredBB;
+ };
+
+ // Iteratively sink the scalarized operands of the predicated instruction
+ // into the block we created for it. When an instruction is sunk, its
+ // operands are then added to the worklist. The algorithm ends once a full
+ // pass over the worklist sinks no instructions.
+ bool Changed;
+ do {
+
+ // Add the instructions that need to be reanalyzed to the worklist, and
+ // reset the changed indicator.
+ Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
+ InstsToReanalyze.clear();
+ Changed = false;
+
+ while (!Worklist.empty()) {
+ auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
+
+ // We can't sink an instruction if it is a phi node, is already in the
+ // predicated block, is not in the loop, or may have side effects.
+ if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
+ !VectorLoop->contains(I) || I->mayHaveSideEffects())
+ continue;
+
+ // It's legal to sink the instruction if all its uses occur in the
+ // predicated block. Otherwise, there's nothing to do yet, and we may
+ // need to reanalyze the instruction.
+ if (!all_of(I->uses(), isBlockOfUsePredicated)) {
+ InstsToReanalyze.push_back(I);
+ continue;
+ }
+
+ // Move the instruction to the beginning of the predicated block, and add
+ // its operands to the worklist.
+ I->moveBefore(&*PredBB->getFirstInsertionPt());
+ Worklist.insert(I->op_begin(), I->op_end());
+
+ // The sinking may have enabled other instructions to be sunk, so we will
+ // need to iterate.
+ Changed = true;
+ }
+ } while (Changed);
+}
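
The fixed-point structure of the sinking loop above can be shown on a toy use graph; the sketch below is illustrative only, with invented value ids, and simply repeats passes until nothing new can be sunk.

  #include <cassert>
  #include <map>
  #include <set>
  #include <vector>

  // Fixed-point sinking over a toy use graph: an item can be "sunk" into the
  // predicated block once all of its users live there. Sinking one item may
  // enable its operands to be sunk on a later pass, so we iterate until no
  // change is made, mirroring the do/while loop above.
  int main() {
    // Users[v] lists the values that use v. Value 3 is the predicated
    // instruction; 2 feeds 3; 1 feeds 2. All ids are illustrative.
    std::map<int, std::vector<int>> Users = {{1, {2}}, {2, {3}}, {3, {}}};
    std::set<int> InPredBlock = {3};
    bool Changed;
    do {
      Changed = false;
      for (auto &KV : Users) {
        int V = KV.first;
        if (InPredBlock.count(V))
          continue;
        bool AllUsersPredicated = true;
        for (int U : KV.second)
          AllUsersPredicated &= InPredBlock.count(U) != 0;
        if (AllUsersPredicated) {
          InPredBlock.insert(V); // "sink" V into the predicated block
          Changed = true;
        }
      }
    } while (Changed);
    assert(InPredBlock.count(1) && InPredBlock.count(2));
    return 0;
  }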
+
+void InnerLoopVectorizer::predicateInstructions() {
+
+ // For each instruction I marked for predication on value C, split I into its
+ // own basic block to form an if-then construct over C. Since I may be fed by
+ // an extractelement instruction or other scalar operand, we try to
+ // iteratively sink its scalar operands into the predicated block. If I feeds
+ // an insertelement instruction, we try to move this instruction into the
+ // predicated block as well. For non-void types, a phi node will be created
+ // for the resulting value (either vector or scalar).
+ //
+ // So for some predicated instruction, e.g. the conditional sdiv in:
+ //
+ // for.body:
+ // ...
+ // %add = add nsw i32 %mul, %0
+ // %cmp5 = icmp sgt i32 %2, 7
+ // br i1 %cmp5, label %if.then, label %if.end
+ //
+ // if.then:
+ // %div = sdiv i32 %0, %1
+ // br label %if.end
+ //
+ // if.end:
+ // %x.0 = phi i32 [ %div, %if.then ], [ %add, %for.body ]
+ //
+ // the sdiv at this point is scalarized and if-converted using a select.
+ // The inactive elements in the vector are not used, but the predicated
+ // instruction is still executed for all vector elements, essentially:
+ //
+ // vector.body:
+ // ...
+ // %17 = add nsw <2 x i32> %16, %wide.load
+ // %29 = extractelement <2 x i32> %wide.load, i32 0
+ // %30 = extractelement <2 x i32> %wide.load51, i32 0
+ // %31 = sdiv i32 %29, %30
+ // %32 = insertelement <2 x i32> undef, i32 %31, i32 0
+ // %35 = extractelement <2 x i32> %wide.load, i32 1
+ // %36 = extractelement <2 x i32> %wide.load51, i32 1
+ // %37 = sdiv i32 %35, %36
+ // %38 = insertelement <2 x i32> %32, i32 %37, i32 1
+ // %predphi = select <2 x i1> %26, <2 x i32> %38, <2 x i32> %17
+ //
+ // Predication will now re-introduce the original control flow to avoid false
+ // side-effects by the sdiv instructions on the inactive elements, yielding
+ // (after cleanup):
+ //
+ // vector.body:
+ // ...
+ // %5 = add nsw <2 x i32> %4, %wide.load
+ // %8 = icmp sgt <2 x i32> %wide.load52, <i32 7, i32 7>
+ // %9 = extractelement <2 x i1> %8, i32 0
+ // br i1 %9, label %pred.sdiv.if, label %pred.sdiv.continue
+ //
+ // pred.sdiv.if:
+ // %10 = extractelement <2 x i32> %wide.load, i32 0
+ // %11 = extractelement <2 x i32> %wide.load51, i32 0
+ // %12 = sdiv i32 %10, %11
+ // %13 = insertelement <2 x i32> undef, i32 %12, i32 0
+ // br label %pred.sdiv.continue
+ //
+ // pred.sdiv.continue:
+ // %14 = phi <2 x i32> [ undef, %vector.body ], [ %13, %pred.sdiv.if ]
+ // %15 = extractelement <2 x i1> %8, i32 1
+ // br i1 %15, label %pred.sdiv.if54, label %pred.sdiv.continue55
+ //
+ // pred.sdiv.if54:
+ // %16 = extractelement <2 x i32> %wide.load, i32 1
+ // %17 = extractelement <2 x i32> %wide.load51, i32 1
+ // %18 = sdiv i32 %16, %17
+ // %19 = insertelement <2 x i32> %14, i32 %18, i32 1
+ // br label %pred.sdiv.continue55
+ //
+ // pred.sdiv.continue55:
+ // %20 = phi <2 x i32> [ %14, %pred.sdiv.continue ], [ %19, %pred.sdiv.if54 ]
+ // %predphi = select <2 x i1> %8, <2 x i32> %20, <2 x i32> %5
+
+ for (auto KV : PredicatedInstructions) {
+ BasicBlock::iterator I(KV.first);
+ BasicBlock *Head = I->getParent();
+ auto *BB = SplitBlock(Head, &*std::next(I), DT, LI);
+ auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false,
+ /*BranchWeights=*/nullptr, DT, LI);
+ I->moveBefore(T);
+ sinkScalarOperands(&*I);
+
+ I->getParent()->setName(Twine("pred.") + I->getOpcodeName() + ".if");
+ BB->setName(Twine("pred.") + I->getOpcodeName() + ".continue");
+
+ // If the instruction is non-void, create a phi node at the reconvergence point.
+ if (!I->getType()->isVoidTy()) {
+ Value *IncomingTrue = nullptr;
+ Value *IncomingFalse = nullptr;
+
+ if (I->hasOneUse() && isa<InsertElementInst>(*I->user_begin())) {
+ // If the predicated instruction is feeding an insert-element, move it
+ // into the Then block; Phi node will be created for the vector.
+ InsertElementInst *IEI = cast<InsertElementInst>(*I->user_begin());
+ IEI->moveBefore(T);
+ IncomingTrue = IEI; // the new vector with the inserted element.
+ IncomingFalse = IEI->getOperand(0); // the unmodified vector
+ } else {
+ // Phi node will be created for the scalar predicated instruction.
+ IncomingTrue = &*I;
+ IncomingFalse = UndefValue::get(I->getType());
+ }
+
+ BasicBlock *PostDom = I->getParent()->getSingleSuccessor();
+ assert(PostDom && "Then block has multiple successors");
+ PHINode *Phi =
+ PHINode::Create(IncomingTrue->getType(), 2, "", &PostDom->front());
+ IncomingTrue->replaceAllUsesWith(Phi);
+ Phi->addIncoming(IncomingFalse, Head);
+ Phi->addIncoming(IncomingTrue, I->getParent());
+ }
+ }
+
+ DEBUG(DT->verifyDomTree());
+}
+
InnerLoopVectorizer::VectorParts
InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
- assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&
- "Invalid edge");
+ assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
// Look for cached value.
std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
@@ -4033,12 +4562,12 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
return BlockMask;
}
-void InnerLoopVectorizer::widenPHIInstruction(
- Instruction *PN, InnerLoopVectorizer::VectorParts &Entry, unsigned UF,
- unsigned VF, PhiVector *PV) {
+void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
+ unsigned VF, PhiVector *PV) {
PHINode *P = cast<PHINode>(PN);
// Handle recurrences.
if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
+ VectorParts Entry(UF);
for (unsigned part = 0; part < UF; ++part) {
// This is phase one of vectorizing PHIs.
Type *VecTy =
@@ -4046,6 +4575,7 @@ void InnerLoopVectorizer::widenPHIInstruction(
Entry[part] = PHINode::Create(
VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
}
+ VectorLoopValueMap.initVector(P, Entry);
PV->push_back(P);
return;
}
@@ -4066,10 +4596,11 @@ void InnerLoopVectorizer::widenPHIInstruction(
// SELECT(Mask3, In3,
// SELECT(Mask2, In2,
// ( ...)))
+ VectorParts Entry(UF);
for (unsigned In = 0; In < NumIncoming; In++) {
VectorParts Cond =
createEdgeMask(P->getIncomingBlock(In), P->getParent());
- VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
+ const VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
for (unsigned part = 0; part < UF; ++part) {
// We might have single edge PHIs (blocks) - use an identity
@@ -4083,6 +4614,7 @@ void InnerLoopVectorizer::widenPHIInstruction(
"predphi");
}
}
+ VectorLoopValueMap.initVector(P, Entry);
return;
}
@@ -4099,46 +4631,95 @@ void InnerLoopVectorizer::widenPHIInstruction(
case InductionDescriptor::IK_NoInduction:
llvm_unreachable("Unknown induction");
case InductionDescriptor::IK_IntInduction:
- return widenIntInduction(P, Entry);
- case InductionDescriptor::IK_PtrInduction:
+ return widenIntInduction(P);
+ case InductionDescriptor::IK_PtrInduction: {
// Handle the pointer induction variable case.
assert(P->getType()->isPointerTy() && "Unexpected type.");
// This is the normalized GEP that starts counting at zero.
Value *PtrInd = Induction;
PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
- // This is the vector of results. Notice that we don't generate
- // vector geps because scalar geps result in better code.
- for (unsigned part = 0; part < UF; ++part) {
- if (VF == 1) {
- int EltIndex = part;
- Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex);
- Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
- Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL);
- SclrGep->setName("next.gep");
- Entry[part] = SclrGep;
- continue;
- }
-
- Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
- for (unsigned int i = 0; i < VF; ++i) {
- int EltIndex = i + part * VF;
- Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex);
+ // Determine the number of scalars we need to generate for each unroll
+ // iteration. If the instruction is uniform, we only need to generate the
+ // first lane. Otherwise, we generate all VF values.
+ unsigned Lanes = Legal->isUniformAfterVectorization(P) ? 1 : VF;
+ // These are the scalar results. Notice that we don't generate vector GEPs
+ // because scalar GEPs result in better code.
+ ScalarParts Entry(UF);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Entry[Part].resize(VF);
+ for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
+ Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL);
SclrGep->setName("next.gep");
- VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
- Builder.getInt32(i), "insert.gep");
+ Entry[Part][Lane] = SclrGep;
}
- Entry[part] = VecVal;
}
+ VectorLoopValueMap.initScalar(P, Entry);
return;
}
+ case InductionDescriptor::IK_FpInduction: {
+ assert(P->getType() == II.getStartValue()->getType() &&
+ "Types must match");
+ // Handle other induction variables that are now based on the
+ // canonical one.
+ assert(P != OldInduction && "Primary induction can be integer only");
+
+ Value *V = Builder.CreateCast(Instruction::SIToFP, Induction, P->getType());
+ V = II.transform(Builder, V, PSE.getSE(), DL);
+ V->setName("fp.offset.idx");
+
+ // Now we have scalar op: %fp.offset.idx = StartVal +/- Induction*StepVal
+
+ Value *Broadcasted = getBroadcastInstrs(V);
+ // After broadcasting the induction variable we need to make the vector
+ // consecutive by adding StepVal*0, StepVal*1, StepVal*2, etc.
+ Value *StepVal = cast<SCEVUnknown>(II.getStep())->getValue();
+ VectorParts Entry(UF);
+ for (unsigned part = 0; part < UF; ++part)
+ Entry[part] = getStepVector(Broadcasted, VF * part, StepVal,
+ II.getInductionOpcode());
+ VectorLoopValueMap.initVector(P, Entry);
+ return;
+ }
+ }
+}
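
Assuming the induction opcode is an add, the step-vector construction described in the IK_FpInduction comment above amounts to the following standalone sketch (values and names are illustrative).

  #include <cassert>
  #include <vector>

  // Build one unroll part of a floating-point induction: element Lane holds
  // Start + (VF * Part + Lane) * Step, matching a step vector applied to a
  // broadcast of the scalar induction value.
  std::vector<double> stepVector(double Start, double Step, unsigned VF,
                                 unsigned Part) {
    std::vector<double> V(VF);
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      V[Lane] = Start + (VF * Part + Lane) * Step;
    return V;
  }

  int main() {
    // Start = 1.0, Step = 0.5, VF = 4: part 0 is {1.0, 1.5, 2.0, 2.5} and
    // part 1 continues at {3.0, 3.5, 4.0, 4.5}.
    assert(stepVector(1.0, 0.5, 4, 0) == (std::vector<double>{1.0, 1.5, 2.0, 2.5}));
    assert(stepVector(1.0, 0.5, 4, 1) == (std::vector<double>{3.0, 3.5, 4.0, 4.5}));
    return 0;
  }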
+
+/// A helper function for checking whether an integer division-related
+/// instruction may divide by zero (in which case it must be predicated if
+/// executed conditionally in the scalar code).
+/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
+/// Non-zero divisors that are not compile-time constants will not be
+/// converted into multiplication, so we will still end up scalarizing
+/// the division, but can do so w/o predication.
+static bool mayDivideByZero(Instruction &I) {
+ assert((I.getOpcode() == Instruction::UDiv ||
+ I.getOpcode() == Instruction::SDiv ||
+ I.getOpcode() == Instruction::URem ||
+ I.getOpcode() == Instruction::SRem) &&
+ "Unexpected instruction");
+ Value *Divisor = I.getOperand(1);
+ auto *CInt = dyn_cast<ConstantInt>(Divisor);
+ return !CInt || CInt->isZero();
}
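
A small illustration of the same test using C++17's std::optional in place of a ConstantInt check; the cases mirror x / 5, x / 0, and x / y.

  #include <cassert>
  #include <optional>

  // Mirror of mayDivideByZero() with std::optional standing in for "is the
  // divisor a ConstantInt": an unknown divisor, or a known zero, may trap.
  bool mayDivideByZero(std::optional<long> ConstDivisor) {
    return !ConstDivisor || *ConstDivisor == 0;
  }

  int main() {
    assert(!mayDivideByZero(5));           // x / 5: safe to leave unpredicated
    assert(mayDivideByZero(0));            // x / 0: must be predicated
    assert(mayDivideByZero(std::nullopt)); // x / y: divisor unknown at compile time
    return 0;
  }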
void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
// For each instruction in the old loop.
for (Instruction &I : *BB) {
- VectorParts &Entry = WidenMap.get(&I);
+
+ // If the instruction will become trivially dead when vectorized, we don't
+ // need to generate it.
+ if (DeadInstructions.count(&I))
+ continue;
+
+ // Scalarize instructions that should remain scalar after vectorization.
+ if (VF > 1 &&
+ !(isa<BranchInst>(&I) || isa<PHINode>(&I) ||
+ isa<DbgInfoIntrinsic>(&I)) &&
+ shouldScalarizeInstruction(&I)) {
+ scalarizeInstruction(&I, Legal->isScalarWithPredication(&I));
+ continue;
+ }
switch (I.getOpcode()) {
case Instruction::Br:
@@ -4147,21 +4728,27 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
continue;
case Instruction::PHI: {
// Vectorize PHINodes.
- widenPHIInstruction(&I, Entry, UF, VF, PV);
+ widenPHIInstruction(&I, UF, VF, PV);
continue;
} // End of PHI.
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ // Scalarize with predication if this instruction may divide by zero and
+ // its block's execution is conditional; otherwise fall through.
+ if (Legal->isScalarWithPredication(&I)) {
+ scalarizeInstruction(&I, true);
+ continue;
+ }
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
@@ -4172,10 +4759,11 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
// Just widen binops.
auto *BinOp = cast<BinaryOperator>(&I);
setDebugLocFromInst(Builder, BinOp);
- VectorParts &A = getVectorValue(BinOp->getOperand(0));
- VectorParts &B = getVectorValue(BinOp->getOperand(1));
+ const VectorParts &A = getVectorValue(BinOp->getOperand(0));
+ const VectorParts &B = getVectorValue(BinOp->getOperand(1));
// Use this vector value for all users of the original instruction.
+ VectorParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
@@ -4185,6 +4773,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
Entry[Part] = V;
}
+ VectorLoopValueMap.initVector(&I, Entry);
addMetadata(Entry, BinOp);
break;
}
@@ -4201,20 +4790,19 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
// loop. This means that we can't just use the original 'cond' value.
// We have to take the 'vectorized' value and pick the first lane.
// Instcombine will make this a no-op.
- VectorParts &Cond = getVectorValue(I.getOperand(0));
- VectorParts &Op0 = getVectorValue(I.getOperand(1));
- VectorParts &Op1 = getVectorValue(I.getOperand(2));
+ const VectorParts &Cond = getVectorValue(I.getOperand(0));
+ const VectorParts &Op0 = getVectorValue(I.getOperand(1));
+ const VectorParts &Op1 = getVectorValue(I.getOperand(2));
- Value *ScalarCond =
- (VF == 1)
- ? Cond[0]
- : Builder.CreateExtractElement(Cond[0], Builder.getInt32(0));
+ auto *ScalarCond = getScalarValue(I.getOperand(0), 0, 0);
+ VectorParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
Entry[Part] = Builder.CreateSelect(
InvariantCond ? ScalarCond : Cond[Part], Op0[Part], Op1[Part]);
}
+ VectorLoopValueMap.initVector(&I, Entry);
addMetadata(Entry, &I);
break;
}
@@ -4225,8 +4813,9 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
bool FCmp = (I.getOpcode() == Instruction::FCmp);
auto *Cmp = dyn_cast<CmpInst>(&I);
setDebugLocFromInst(Builder, Cmp);
- VectorParts &A = getVectorValue(Cmp->getOperand(0));
- VectorParts &B = getVectorValue(Cmp->getOperand(1));
+ const VectorParts &A = getVectorValue(Cmp->getOperand(0));
+ const VectorParts &B = getVectorValue(Cmp->getOperand(1));
+ VectorParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
Value *C = nullptr;
if (FCmp) {
@@ -4238,6 +4827,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
Entry[Part] = C;
}
+ VectorLoopValueMap.initVector(&I, Entry);
addMetadata(Entry, &I);
break;
}
@@ -4268,8 +4858,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
auto ID = Legal->getInductionVars()->lookup(OldInduction);
if (isa<TruncInst>(CI) && CI->getOperand(0) == OldInduction &&
ID.getConstIntStepValue()) {
- widenIntInduction(OldInduction, Entry, cast<TruncInst>(CI));
- addMetadata(Entry, &I);
+ widenIntInduction(OldInduction, cast<TruncInst>(CI));
break;
}
@@ -4277,9 +4866,11 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
Type *DestTy =
(VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
- VectorParts &A = getVectorValue(CI->getOperand(0));
+ const VectorParts &A = getVectorValue(CI->getOperand(0));
+ VectorParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part)
Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
+ VectorLoopValueMap.initVector(&I, Entry);
addMetadata(Entry, &I);
break;
}
@@ -4318,6 +4909,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
break;
}
+ VectorParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
SmallVector<Value *, 4> Args;
for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
@@ -4325,7 +4917,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
// Some intrinsics have a scalar argument - don't replace it with a
// vector.
if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) {
- VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i));
+ const VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i));
Arg = VectorArg[Part];
}
Args.push_back(Arg);
@@ -4363,6 +4955,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
Entry[Part] = V;
}
+ VectorLoopValueMap.initVector(&I, Entry);
addMetadata(Entry, &I);
break;
}
@@ -4414,7 +5007,8 @@ static bool canIfConvertPHINodes(BasicBlock *BB) {
bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
if (!EnableIfConversion) {
- emitAnalysis(VectorizationReport() << "if-conversion is disabled");
+ ORE->emit(createMissedAnalysis("IfConversionDisabled")
+ << "if-conversion is disabled");
return false;
}
@@ -4428,12 +5022,9 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
if (blockNeedsPredication(BB))
continue;
- for (Instruction &I : *BB) {
- if (auto *LI = dyn_cast<LoadInst>(&I))
- SafePointes.insert(LI->getPointerOperand());
- else if (auto *SI = dyn_cast<StoreInst>(&I))
- SafePointes.insert(SI->getPointerOperand());
- }
+ for (Instruction &I : *BB)
+ if (auto *Ptr = getPointerOperand(&I))
+ SafePointes.insert(Ptr);
}
// Collect the blocks that need predication.
@@ -4441,21 +5032,21 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
for (BasicBlock *BB : TheLoop->blocks()) {
// We don't support switch statements inside loops.
if (!isa<BranchInst>(BB->getTerminator())) {
- emitAnalysis(VectorizationReport(BB->getTerminator())
- << "loop contains a switch statement");
+ ORE->emit(createMissedAnalysis("LoopContainsSwitch", BB->getTerminator())
+ << "loop contains a switch statement");
return false;
}
// We must be able to predicate all blocks that need to be predicated.
if (blockNeedsPredication(BB)) {
if (!blockCanBePredicated(BB, SafePointes)) {
- emitAnalysis(VectorizationReport(BB->getTerminator())
- << "control flow cannot be substituted for a select");
+ ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
+ << "control flow cannot be substituted for a select");
return false;
}
} else if (BB != Header && !canIfConvertPHINodes(BB)) {
- emitAnalysis(VectorizationReport(BB->getTerminator())
- << "control flow cannot be substituted for a select");
+ ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
+ << "control flow cannot be substituted for a select");
return false;
}
}
@@ -4468,8 +5059,8 @@ bool LoopVectorizationLegality::canVectorize() {
// We must have a loop in canonical form. Loops with indirectbr in them cannot
// be canonicalized.
if (!TheLoop->getLoopPreheader()) {
- emitAnalysis(VectorizationReport()
- << "loop control flow is not understood by vectorizer");
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
return false;
}
@@ -4478,21 +5069,22 @@ bool LoopVectorizationLegality::canVectorize() {
//
// We can only vectorize innermost loops.
if (!TheLoop->empty()) {
- emitAnalysis(VectorizationReport() << "loop is not the innermost loop");
+ ORE->emit(createMissedAnalysis("NotInnermostLoop")
+ << "loop is not the innermost loop");
return false;
}
// We must have a single backedge.
if (TheLoop->getNumBackEdges() != 1) {
- emitAnalysis(VectorizationReport()
- << "loop control flow is not understood by vectorizer");
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
return false;
}
// We must have a single exiting block.
if (!TheLoop->getExitingBlock()) {
- emitAnalysis(VectorizationReport()
- << "loop control flow is not understood by vectorizer");
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
return false;
}
@@ -4500,8 +5092,8 @@ bool LoopVectorizationLegality::canVectorize() {
// checked at the end of each iteration. With that we can assume that all
// instructions in the loop are executed the same number of times.
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
- emitAnalysis(VectorizationReport()
- << "loop control flow is not understood by vectorizer");
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
return false;
}
@@ -4519,8 +5111,8 @@ bool LoopVectorizationLegality::canVectorize() {
// ScalarEvolution needs to be able to find the exit count.
const SCEV *ExitCount = PSE.getBackedgeTakenCount();
if (ExitCount == PSE.getSE()->getCouldNotCompute()) {
- emitAnalysis(VectorizationReport()
- << "could not determine number of loop iterations");
+ ORE->emit(createMissedAnalysis("CantComputeNumberOfIterations")
+ << "could not determine number of loop iterations");
DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
return false;
}
@@ -4537,9 +5129,6 @@ bool LoopVectorizationLegality::canVectorize() {
return false;
}
- // Collect all of the variables that remain uniform after vectorization.
- collectLoopUniforms();
-
DEBUG(dbgs() << "LV: We can vectorize this loop"
<< (LAI->getRuntimePointerChecking()->Need
? " (with a runtime bound check)"
@@ -4556,14 +5145,20 @@ bool LoopVectorizationLegality::canVectorize() {
if (UseInterleaved)
InterleaveInfo.analyzeInterleaving(*getSymbolicStrides());
+ // Collect all instructions that are known to be uniform after vectorization.
+ collectLoopUniforms();
+
+ // Collect all instructions that are known to be scalar after vectorization.
+ collectLoopScalars();
+
unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
- emitAnalysis(VectorizationReport()
- << "Too many SCEV assumptions need to be made and checked "
- << "at runtime");
+ ORE->emit(createMissedAnalysis("TooManySCEVRunTimeChecks")
+ << "Too many SCEV assumptions need to be made and checked "
+ << "at runtime");
DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n");
return false;
}
@@ -4621,10 +5216,12 @@ void LoopVectorizationLegality::addInductionPhi(
const DataLayout &DL = Phi->getModule()->getDataLayout();
// Get the widest type.
- if (!WidestIndTy)
- WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
- else
- WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
+ if (!PhiTy->isFloatingPointTy()) {
+ if (!WidestIndTy)
+ WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
+ else
+ WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
+ }
// Int inductions are special because we only allow one IV.
if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
@@ -4667,8 +5264,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// Check that this PHI type is allowed.
if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
!PhiTy->isPointerTy()) {
- emitAnalysis(VectorizationReport(Phi)
- << "loop control flow is not understood by vectorizer");
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
+ << "loop control flow is not understood by vectorizer");
DEBUG(dbgs() << "LV: Found a non-int non-pointer PHI.\n");
return false;
}
@@ -4681,16 +5278,16 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// identified reduction value with an outside user.
if (!hasOutsideLoopUser(TheLoop, Phi, AllowedExit))
continue;
- emitAnalysis(VectorizationReport(Phi)
- << "value could not be identified as "
- "an induction or reduction variable");
+ ORE->emit(createMissedAnalysis("NeitherInductionNorReduction", Phi)
+ << "value could not be identified as "
+ "an induction or reduction variable");
return false;
}
// We only allow if-converted PHIs with exactly two incoming values.
if (Phi->getNumIncomingValues() != 2) {
- emitAnalysis(VectorizationReport(Phi)
- << "control flow not understood by vectorizer");
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
+ << "control flow not understood by vectorizer");
DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
return false;
}
@@ -4705,8 +5302,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
}
InductionDescriptor ID;
- if (InductionDescriptor::isInductionPHI(Phi, PSE, ID)) {
+ if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
addInductionPhi(Phi, ID, AllowedExit);
+ if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr)
+ Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst());
continue;
}
@@ -4717,14 +5316,14 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// As a last resort, coerce the PHI to an AddRec expression
// and re-try classifying it as an induction PHI.
- if (InductionDescriptor::isInductionPHI(Phi, PSE, ID, true)) {
+ if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) {
addInductionPhi(Phi, ID, AllowedExit);
continue;
}
- emitAnalysis(VectorizationReport(Phi)
- << "value that could not be identified as "
- "reduction is used outside the loop");
+ ORE->emit(createMissedAnalysis("NonReductionValueUsedOutsideLoop", Phi)
+ << "value that could not be identified as "
+ "reduction is used outside the loop");
DEBUG(dbgs() << "LV: Found an unidentified PHI." << *Phi << "\n");
return false;
} // end of PHI handling
@@ -4738,8 +5337,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
!isa<DbgInfoIntrinsic>(CI) &&
!(CI->getCalledFunction() && TLI &&
TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
- emitAnalysis(VectorizationReport(CI)
- << "call instruction cannot be vectorized");
+ ORE->emit(createMissedAnalysis("CantVectorizeCall", CI)
+ << "call instruction cannot be vectorized");
DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
return false;
}
@@ -4750,8 +5349,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
getVectorIntrinsicIDForCall(CI, TLI), 1)) {
auto *SE = PSE.getSE();
if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) {
- emitAnalysis(VectorizationReport(CI)
- << "intrinsic instruction cannot be vectorized");
+ ORE->emit(createMissedAnalysis("CantVectorizeIntrinsic", CI)
+ << "intrinsic instruction cannot be vectorized");
DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
return false;
}
@@ -4762,8 +5361,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
if ((!VectorType::isValidElementType(I.getType()) &&
!I.getType()->isVoidTy()) ||
isa<ExtractElementInst>(I)) {
- emitAnalysis(VectorizationReport(&I)
- << "instruction return type cannot be vectorized");
+ ORE->emit(createMissedAnalysis("CantVectorizeInstructionReturnType", &I)
+ << "instruction return type cannot be vectorized");
DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
return false;
}
@@ -4772,8 +5371,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
if (auto *ST = dyn_cast<StoreInst>(&I)) {
Type *T = ST->getValueOperand()->getType();
if (!VectorType::isValidElementType(T)) {
- emitAnalysis(VectorizationReport(ST)
- << "store instruction cannot be vectorized");
+ ORE->emit(createMissedAnalysis("CantVectorizeStore", ST)
+ << "store instruction cannot be vectorized");
return false;
}
@@ -4791,8 +5390,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// Reduction instructions are allowed to have exit users.
// All other instructions must not have external users.
if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
- emitAnalysis(VectorizationReport(&I)
- << "value cannot be used outside the loop");
+ ORE->emit(createMissedAnalysis("ValueUsedOutsideLoop", &I)
+ << "value cannot be used outside the loop");
return false;
}
@@ -4802,8 +5401,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
if (!Induction) {
DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
if (Inductions.empty()) {
- emitAnalysis(VectorizationReport()
- << "loop induction variable could not be identified");
+ ORE->emit(createMissedAnalysis("NoInductionVariable")
+ << "loop induction variable could not be identified");
return false;
}
}
@@ -4817,12 +5416,132 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
return true;
}
+void LoopVectorizationLegality::collectLoopScalars() {
+
+ // If an instruction is uniform after vectorization, it will remain scalar.
+ Scalars.insert(Uniforms.begin(), Uniforms.end());
+
+ // Collect the getelementptr instructions that will not be vectorized. A
+ // getelementptr instruction is only vectorized if it is used for a legal
+ // gather or scatter operation.
+ for (auto *BB : TheLoop->blocks())
+ for (auto &I : *BB) {
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ Scalars.insert(GEP);
+ continue;
+ }
+ auto *Ptr = getPointerOperand(&I);
+ if (!Ptr)
+ continue;
+ auto *GEP = getGEPInstruction(Ptr);
+ if (GEP && isLegalGatherOrScatter(&I))
+ Scalars.erase(GEP);
+ }
+
+ // An induction variable will remain scalar if all users of the induction
+ // variable and induction variable update remain scalar.
+ auto *Latch = TheLoop->getLoopLatch();
+ for (auto &Induction : *getInductionVars()) {
+ auto *Ind = Induction.first;
+ auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+
+ // Determine if all users of the induction variable are scalar after
+ // vectorization.
+ auto ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return I == IndUpdate || !TheLoop->contains(I) || Scalars.count(I);
+ });
+ if (!ScalarInd)
+ continue;
+
+ // Determine if all users of the induction variable update instruction are
+ // scalar after vectorization.
+ auto ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return I == Ind || !TheLoop->contains(I) || Scalars.count(I);
+ });
+ if (!ScalarIndUpdate)
+ continue;
+
+ // The induction variable and its update instruction will remain scalar.
+ Scalars.insert(Ind);
+ Scalars.insert(IndUpdate);
+ }
+}
+
+bool LoopVectorizationLegality::hasConsecutiveLikePtrOperand(Instruction *I) {
+ if (isAccessInterleaved(I))
+ return true;
+ if (auto *Ptr = getPointerOperand(I))
+ return isConsecutivePtr(Ptr);
+ return false;
+}
+
+bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
+ if (!blockNeedsPredication(I->getParent()))
+ return false;
+ switch(I->getOpcode()) {
+ default:
+ break;
+ case Instruction::Store:
+ return !isMaskRequired(I);
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ return mayDivideByZero(*I);
+ }
+ return false;
+}
+
+bool LoopVectorizationLegality::memoryInstructionMustBeScalarized(
+ Instruction *I, unsigned VF) {
+
+ // If the memory instruction is in an interleaved group, it will be
+ // vectorized and its pointer will remain uniform.
+ if (isAccessInterleaved(I))
+ return false;
+
+ // Get and ensure we have a valid memory instruction.
+ LoadInst *LI = dyn_cast<LoadInst>(I);
+ StoreInst *SI = dyn_cast<StoreInst>(I);
+ assert((LI || SI) && "Invalid memory instruction");
+
+ // If the pointer operand is uniform (loop invariant), the memory instruction
+ // will be scalarized.
+ auto *Ptr = getPointerOperand(I);
+ if (LI && isUniform(Ptr))
+ return true;
+
+ // If the pointer operand is non-consecutive and neither a gather nor a
+ // scatter operation is legal, the memory instruction will be scalarized.
+ if (!isConsecutivePtr(Ptr) && !isLegalGatherOrScatter(I))
+ return true;
+
+ // If the instruction is a store located in a predicated block, it will be
+ // scalarized.
+ if (isScalarWithPredication(I))
+ return true;
+
+ // If the instruction's allocated size doesn't equal its type size, it
+ // requires padding and will be scalarized.
+ auto &DL = I->getModule()->getDataLayout();
+ auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
+ if (hasIrregularType(ScalarTy, DL, VF))
+ return true;
+
+ // Otherwise, the memory instruction should be vectorized if the rest of the
+ // loop is.
+ return false;
+}
+
void LoopVectorizationLegality::collectLoopUniforms() {
// We now know that the loop is vectorizable!
- // Collect variables that will remain uniform after vectorization.
+ // Collect instructions inside the loop that will remain uniform after
+ // vectorization.
- // If V is not an instruction inside the current loop, it is a Value
- // outside of the scope which we are interesting in.
+ // Global values, params and instructions outside of the current loop are
+ // out of scope.
auto isOutOfScope = [&](Value *V) -> bool {
Instruction *I = dyn_cast<Instruction>(V);
return (!I || !TheLoop->contains(I));
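The scalarization test added for memory instructions above follows a fixed order of checks. Below is a minimal standalone sketch of that order, with the legality queries stubbed out as plain booleans; the struct and field names are illustrative only, not the LLVM API.

    // Hypothetical summary of the per-access facts the legality analysis
    // would provide; in LLVM these come from LoopVectorizationLegality.
    struct MemAccessInfo {
      bool IsLoad;              // load vs. store
      bool IsInterleaved;       // member of an interleaved group
      bool PtrIsUniform;        // pointer operand is loop invariant
      bool PtrIsConsecutive;    // unit-stride (possibly reverse) pointer
      bool GatherScatterLegal;  // target supports masked gather/scatter
      bool IsPredicatedStore;   // store in a block that needs predication
      bool HasIrregularType;    // alloc size != store size for the element
    };

    // Mirrors the decision order described in the patch: interleaved accesses
    // stay vectorized; otherwise any of the listed conditions forces scalars.
    bool mustBeScalarized(const MemAccessInfo &A) {
      if (A.IsInterleaved)
        return false;
      if (A.IsLoad && A.PtrIsUniform)
        return true;
      if (!A.PtrIsConsecutive && !A.GatherScatterLegal)
        return true;
      if (A.IsPredicatedStore)
        return true;
      if (A.HasIrregularType)
        return true;
      return false;
    }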
@@ -4830,30 +5549,82 @@ void LoopVectorizationLegality::collectLoopUniforms() {
SetVector<Instruction *> Worklist;
BasicBlock *Latch = TheLoop->getLoopLatch();
- // Start with the conditional branch.
- if (!isOutOfScope(Latch->getTerminator()->getOperand(0))) {
- Instruction *Cmp = cast<Instruction>(Latch->getTerminator()->getOperand(0));
+
+ // Start with the conditional branch. If the branch condition is an
+ // instruction contained in the loop that is only used by the branch, it is
+ // uniform.
+ auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
+ if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
Worklist.insert(Cmp);
DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
}
- // Also add all consecutive pointer values; these values will be uniform
- // after vectorization (and subsequent cleanup).
- for (auto *BB : TheLoop->blocks()) {
+ // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
+ // are pointers that are treated like consecutive pointers during
+ // vectorization. The pointer operands of interleaved accesses are an
+ // example.
+ SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
+
+ // Holds pointer operands of instructions that are possibly non-uniform.
+ SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
+
+ // Iterate over the instructions in the loop, and collect all
+ // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
+ // that a consecutive-like pointer operand will be scalarized, we collect it
+ // in PossibleNonUniformPtrs instead. We use two sets here because a single
+ // getelementptr instruction can be used by both vectorized and scalarized
+ // memory instructions. For example, if a loop loads and stores from the same
+ // location, but the store is conditional, the store will be scalarized, and
+ // the getelementptr won't remain uniform.
+ for (auto *BB : TheLoop->blocks())
for (auto &I : *BB) {
- if (I.getType()->isPointerTy() && isConsecutivePtr(&I)) {
- Worklist.insert(&I);
- DEBUG(dbgs() << "LV: Found uniform instruction: " << I << "\n");
- }
+
+ // If there's no pointer operand, there's nothing to do.
+ auto *Ptr = dyn_cast_or_null<Instruction>(getPointerOperand(&I));
+ if (!Ptr)
+ continue;
+
+ // True if all users of Ptr are memory accesses that have Ptr as their
+ // pointer operand.
+ auto UsersAreMemAccesses = all_of(Ptr->users(), [&](User *U) -> bool {
+ return getPointerOperand(U) == Ptr;
+ });
+
+ // Ensure the memory instruction will not be scalarized, making its
+ // pointer operand non-uniform. If the pointer operand is used by some
+ // instruction other than a memory access, we're not going to check if
+ // that other instruction may be scalarized here. Thus, conservatively
+ // assume the pointer operand may be non-uniform.
+ if (!UsersAreMemAccesses || memoryInstructionMustBeScalarized(&I))
+ PossibleNonUniformPtrs.insert(Ptr);
+
+ // If the memory instruction will be vectorized and its pointer operand
+ // is consecutive-like, the pointer operand should remain uniform.
+ else if (hasConsecutiveLikePtrOperand(&I))
+ ConsecutiveLikePtrs.insert(Ptr);
+
+ // Otherwise, if the memory instruction will be vectorized and its
+ // pointer operand is non-consecutive-like, the memory instruction should
+ // be a gather or scatter operation. Its pointer operand will be
+ // non-uniform.
+ else
+ PossibleNonUniformPtrs.insert(Ptr);
+ }
+
+ // Add to the Worklist all consecutive and consecutive-like pointers that
+ // aren't also identified as possibly non-uniform.
+ for (auto *V : ConsecutiveLikePtrs)
+ if (!PossibleNonUniformPtrs.count(V)) {
+ DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
+ Worklist.insert(V);
}
- }
// Expand Worklist in topological order: whenever a new instruction
// is added, its users should be either already inside Worklist, or
// out of scope. It ensures a uniform instruction will only be used
// by uniform instructions or out of scope instructions.
unsigned idx = 0;
- do {
+ while (idx != Worklist.size()) {
Instruction *I = Worklist[idx++];
for (auto OV : I->operand_values()) {
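A concrete case of why the two pointer sets introduced above are needed (an assumed source-level example, not taken from the patch): the same getelementptr feeds both a load that will be vectorized and a conditional store that will be scalarized, so the shared address cannot be treated as uniform.

    // One GEP (&a[i]) is used by both accesses below. The unconditional load
    // can be vectorized with a consecutive pointer, but the guarded store is
    // scalar-with-predication, so the shared address must stay per-lane.
    void saturate(int *a, int n) {
      for (int i = 0; i < n; ++i) {
        int v = a[i];      // consecutive load: pointer looks uniform-friendly
        if (v > 255)
          a[i] = 255;      // predicated store: same pointer, scalarized
      }
    }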
@@ -4867,32 +5638,49 @@ void LoopVectorizationLegality::collectLoopUniforms() {
DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
}
}
- } while (idx != Worklist.size());
+ }
+
+ // Returns true if Ptr is the pointer operand of a memory access instruction
+ // I, and I is known to not require scalarization.
+ auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
+ return getPointerOperand(I) == Ptr && !memoryInstructionMustBeScalarized(I);
+ };
// For an instruction to be added into Worklist above, all its users inside
- // the current loop should be already added into Worklist. This condition
- // cannot be true for phi instructions which is always in a dependence loop.
- // Because any instruction in the dependence cycle always depends on others
- // in the cycle to be added into Worklist first, the result is no ones in
- // the cycle will be added into Worklist in the end.
- // That is why we process PHI separately.
- for (auto &Induction : *getInductionVars()) {
- auto *PN = Induction.first;
- auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch());
- if (all_of(PN->users(),
- [&](User *U) -> bool {
- return U == UpdateV || isOutOfScope(U) ||
- Worklist.count(cast<Instruction>(U));
- }) &&
- all_of(UpdateV->users(), [&](User *U) -> bool {
- return U == PN || isOutOfScope(U) ||
- Worklist.count(cast<Instruction>(U));
- })) {
- Worklist.insert(cast<Instruction>(PN));
- Worklist.insert(cast<Instruction>(UpdateV));
- DEBUG(dbgs() << "LV: Found uniform instruction: " << *PN << "\n");
- DEBUG(dbgs() << "LV: Found uniform instruction: " << *UpdateV << "\n");
- }
+ // the loop should also be in Worklist. However, this condition cannot be
+ // true for phi nodes that form a cyclic dependence. We must process phi
+ // nodes separately. An induction variable will remain uniform if all users
+ // of the induction variable and induction variable update remain uniform.
+ // The code below handles both pointer and non-pointer induction variables.
+ for (auto &Induction : Inductions) {
+ auto *Ind = Induction.first;
+ auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+
+ // Determine if all users of the induction variable are uniform after
+ // vectorization.
+ auto UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
+ isVectorizedMemAccessUse(I, Ind);
+ });
+ if (!UniformInd)
+ continue;
+
+ // Determine if all users of the induction variable update instruction are
+ // uniform after vectorization.
+ auto UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
+ isVectorizedMemAccessUse(I, IndUpdate);
+ });
+ if (!UniformIndUpdate)
+ continue;
+
+ // The induction variable and its update instruction will remain uniform.
+ Worklist.insert(Ind);
+ Worklist.insert(IndUpdate);
+ DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
+ DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate << "\n");
}
Uniforms.insert(Worklist.begin(), Worklist.end());
@@ -4901,16 +5689,18 @@ void LoopVectorizationLegality::collectLoopUniforms() {
bool LoopVectorizationLegality::canVectorizeMemory() {
LAI = &(*GetLAA)(*TheLoop);
InterleaveInfo.setLAI(LAI);
- auto &OptionalReport = LAI->getReport();
- if (OptionalReport)
- emitAnalysis(VectorizationReport(*OptionalReport));
+ const OptimizationRemarkAnalysis *LAR = LAI->getReport();
+ if (LAR) {
+ OptimizationRemarkAnalysis VR(Hints->vectorizeAnalysisPassName(),
+ "loop not vectorized: ", *LAR);
+ ORE->emit(VR);
+ }
if (!LAI->canVectorizeMemory())
return false;
if (LAI->hasStoreToLoopInvariantAddress()) {
- emitAnalysis(
- VectorizationReport()
- << "write to a loop invariant address could not be vectorized");
+ ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress")
+ << "write to a loop invariant address could not be vectorized");
DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
return false;
}
@@ -4967,7 +5757,6 @@ bool LoopVectorizationLegality::blockCanBePredicated(
}
}
- // We don't predicate stores at the moment.
if (I.mayWriteToMemory()) {
auto *SI = dyn_cast<StoreInst>(&I);
// We only support predication of stores in basic blocks with one
@@ -4992,17 +5781,6 @@ bool LoopVectorizationLegality::blockCanBePredicated(
}
if (I.mayThrow())
return false;
-
- // The instructions below can trap.
- switch (I.getOpcode()) {
- default:
- continue;
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::URem:
- case Instruction::SRem:
- return false;
- }
}
return true;
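With the trapping-opcode bail-out removed from blockCanBePredicated, a guarded division no longer blocks predication outright; such instructions are instead flagged by isScalarWithPredication/mayDivideByZero and costed as scalarized, predicated instructions. A small loop of the kind that now becomes a candidate (illustrative example, not from the patch):

    // The division only executes when b[i] is non-zero, so it may trap if
    // hoisted unconditionally; after this change the block can still be
    // if-converted, with the udiv kept scalar and predicated per lane.
    void safeDiv(unsigned *c, const unsigned *a, const unsigned *b, int n) {
      for (int i = 0; i < n; ++i)
        if (b[i] != 0)
          c[i] = a[i] / b[i];
    }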
@@ -5029,8 +5807,16 @@ void InterleavedAccessInfo::collectConstStrideAccesses(
if (!LI && !SI)
continue;
- Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
- int64_t Stride = getPtrStride(PSE, Ptr, TheLoop, Strides);
+ Value *Ptr = getPointerOperand(&I);
+ // We don't check wrapping here because we don't know yet if Ptr will be
+ // part of a full group or a group with gaps. Checking wrapping for all
+ // pointers (even those that end up in groups with no gaps) will be overly
+ // conservative. For full groups, wrapping should be ok since if we would
+ // wrap around the address space we would do a memory access at nullptr
+ // even without the transformation. The wrapping checks are therefore
+ // deferred until after we've formed the interleaved groups.
+ int64_t Stride = getPtrStride(PSE, Ptr, TheLoop, Strides,
+ /*Assume=*/true, /*ShouldCheckWrap=*/false);
const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
@@ -5234,20 +6020,66 @@ void InterleavedAccessInfo::analyzeInterleaving(
if (Group->getNumMembers() != Group->getFactor())
releaseGroup(Group);
- // If there is a non-reversed interleaved load group with gaps, we will need
- // to execute at least one scalar epilogue iteration. This will ensure that
- // we don't speculatively access memory out-of-bounds. Note that we only need
- // to look for a member at index factor - 1, since every group must have a
- // member at index zero.
- for (InterleaveGroup *Group : LoadGroups)
- if (!Group->getMember(Group->getFactor() - 1)) {
+ // Remove interleaved groups with gaps (currently only loads) whose memory
+ // accesses may wrap around. We have to revisit the getPtrStride analysis,
+ // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does
+ // not check wrapping (see documentation there).
+ // FORNOW we use Assume=false;
+ // TODO: Change to Assume=true but making sure we don't exceed the threshold
+ // of runtime SCEV assumptions checks (thereby potentially failing to
+ // vectorize altogether).
+ // Additional optional optimizations:
+ // TODO: If we are peeling the loop and we know that the first pointer doesn't
+ // wrap then we can deduce that all pointers in the group don't wrap.
+ // This means that we can forcefully peel the loop in order to only have to
+ // check the first pointer for no-wrap. When we'll change to use Assume=true
+ // we'll only need at most one runtime check per interleaved group.
+ //
+ for (InterleaveGroup *Group : LoadGroups) {
+
+ // Case 1: A full group. Can skip the checks; for full groups, if the wide
+ // load would wrap around the address space we would do a memory access at
+ // nullptr even without the transformation.
+ if (Group->getNumMembers() == Group->getFactor())
+ continue;
+
+ // Case 2: If the first and last members of the group don't wrap, this
+ // implies that all the pointers in the group don't wrap.
+ // So we check only group member 0 (which is always guaranteed to exist)
+ // and group member Factor - 1; if the latter doesn't exist we rely on
+ // peeling (if it is a non-reversed access -- see Case 3).
+ Value *FirstMemberPtr = getPointerOperand(Group->getMember(0));
+ if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false,
+ /*ShouldCheckWrap=*/true)) {
+ DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
+ "first group member potentially pointer-wrapping.\n");
+ releaseGroup(Group);
+ continue;
+ }
+ Instruction *LastMember = Group->getMember(Group->getFactor() - 1);
+ if (LastMember) {
+ Value *LastMemberPtr = getPointerOperand(LastMember);
+ if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false,
+ /*ShouldCheckWrap=*/true)) {
+ DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
+ "last group member potentially pointer-wrapping.\n");
+ releaseGroup(Group);
+ }
+ }
+ else {
+ // Case 3: A non-reversed interleaved load group with gaps: We need
+ // to execute at least one scalar epilogue iteration. This will ensure
+ // we don't speculatively access memory out-of-bounds. We only need
+ // to look for a member at index factor - 1, since every group must have
+ // a member at index zero.
if (Group->isReverse()) {
releaseGroup(Group);
- } else {
- DEBUG(dbgs() << "LV: Interleaved group requires epilogue iteration.\n");
- RequiresScalarEpilogue = true;
+ continue;
}
+ DEBUG(dbgs() << "LV: Interleaved group requires epilogue iteration.\n");
+ RequiresScalarEpilogue = true;
}
+ }
}
LoopVectorizationCostModel::VectorizationFactor
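For reference, a loop whose load group has a gap and therefore reaches Case 2 or Case 3 above (an assumed example): the accesses A[3*i] and A[3*i+1] form a group with factor 3 but only two members, so either the first and last members must be proven non-wrapping or a scalar epilogue iteration is required.

    // Interleave factor 3, members at indices 0 and 1, gap at index 2.
    // A wide load covering the whole stride could touch A[3*i+2] past the
    // end, which is why the wrap checks / epilogue reasoning above applies.
    int sumPairs(const int *A, int n) {
      int s = 0;
      for (int i = 0; i < n; ++i)
        s += A[3 * i] + A[3 * i + 1];
      return s;
    }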
@@ -5255,28 +6087,22 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
// Width 1 means no vectorize
VectorizationFactor Factor = {1U, 0U};
if (OptForSize && Legal->getRuntimePointerChecking()->Need) {
- emitAnalysis(
- VectorizationReport()
- << "runtime pointer checks needed. Enable vectorization of this "
- "loop with '#pragma clang loop vectorize(enable)' when "
- "compiling with -Os/-Oz");
+ ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
+ << "runtime pointer checks needed. Enable vectorization of this "
+ "loop with '#pragma clang loop vectorize(enable)' when "
+ "compiling with -Os/-Oz");
DEBUG(dbgs()
<< "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
return Factor;
}
if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
- emitAnalysis(
- VectorizationReport()
- << "store that is conditionally executed prevents vectorization");
+ ORE->emit(createMissedAnalysis("ConditionalStore")
+ << "store that is conditionally executed prevents vectorization");
DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
return Factor;
}
- // Find the trip count.
- unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
- DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
-
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
unsigned SmallestType, WidestType;
std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
@@ -5334,10 +6160,13 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
// If we optimize the program for size, avoid creating the tail loop.
if (OptForSize) {
- // If we are unable to calculate the trip count then don't try to vectorize.
+ unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
+ DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
+
+ // If we don't know the precise trip count, don't try to vectorize.
if (TC < 2) {
- emitAnalysis(
- VectorizationReport()
+ ORE->emit(
+ createMissedAnalysis("UnknownLoopCountComplexCFG")
<< "unable to calculate the loop count due to complex control flow");
DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
return Factor;
@@ -5351,11 +6180,11 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
else {
// If the trip count that we found modulo the vectorization factor is not
// zero then we require a tail.
- emitAnalysis(VectorizationReport()
- << "cannot optimize for size and vectorize at the "
- "same time. Enable vectorization of this loop "
- "with '#pragma clang loop vectorize(enable)' "
- "when compiling with -Os/-Oz");
+ ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
+ << "cannot optimize for size and vectorize at the "
+ "same time. Enable vectorization of this loop "
+ "with '#pragma clang loop vectorize(enable)' "
+ "when compiling with -Os/-Oz");
DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
return Factor;
}
@@ -5367,6 +6196,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
Factor.Width = UserVF;
+ collectInstsToScalarize(UserVF);
return Factor;
}
@@ -5712,15 +6542,16 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
for (unsigned int i = 0; i < Index; ++i) {
Instruction *I = IdxToInstr[i];
- // Ignore instructions that are never used within the loop.
- if (!Ends.count(I))
- continue;
// Remove all of the instructions that end at this location.
InstrList &List = TransposeEnds[i];
for (Instruction *ToRemove : List)
OpenIntervals.erase(ToRemove);
+ // Ignore instructions that are never used within the loop.
+ if (!Ends.count(I))
+ continue;
+
// Skip ignored values.
if (ValuesToIgnore.count(I))
continue;
@@ -5772,10 +6603,160 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
return RUs;
}
+void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
+
+ // If we aren't vectorizing the loop, or if we've already collected the
+ // instructions to scalarize, there's nothing to do. Collection may already
+ // have occurred if we have a user-selected VF and are now computing the
+ // expected cost for interleaving.
+ if (VF < 2 || InstsToScalarize.count(VF))
+ return;
+
+ // Initialize a mapping for VF in InstsToScalarize. If we find that it's
+ // not profitable to scalarize any instructions, the presence of VF in the
+ // map will indicate that we've analyzed it already.
+ ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
+
+ // Find all the instructions that are scalar with predication in the loop and
+ // determine if it would be better to not if-convert the blocks they are in.
+ // If so, we also record the instructions to scalarize.
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ if (!Legal->blockNeedsPredication(BB))
+ continue;
+ for (Instruction &I : *BB)
+ if (Legal->isScalarWithPredication(&I)) {
+ ScalarCostsTy ScalarCosts;
+ if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
+ ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
+ }
+ }
+}
+
+int LoopVectorizationCostModel::computePredInstDiscount(
+ Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
+ unsigned VF) {
+
+ assert(!Legal->isUniformAfterVectorization(PredInst) &&
+ "Instruction marked uniform-after-vectorization will be predicated");
+
+ // Initialize the discount to zero, meaning that the scalar version and the
+ // vector version cost the same.
+ int Discount = 0;
+
+ // Holds instructions to analyze. The instructions we visit are mapped in
+ // ScalarCosts. Those instructions are the ones that would be scalarized if
+ // we find that the scalar version costs less.
+ SmallVector<Instruction *, 8> Worklist;
+
+ // Returns true if the given instruction can be scalarized.
+ auto canBeScalarized = [&](Instruction *I) -> bool {
+
+ // We only attempt to scalarize instructions forming a single-use chain
+ // from the original predicated block that would otherwise be vectorized.
+ // Although not strictly necessary, we give up on instructions we know will
+ // already be scalar to avoid traversing chains that are unlikely to be
+ // beneficial.
+ if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
+ Legal->isScalarAfterVectorization(I))
+ return false;
+
+ // If the instruction is scalar with predication, it will be analyzed
+ // separately. We ignore it within the context of PredInst.
+ if (Legal->isScalarWithPredication(I))
+ return false;
+
+ // If any of the instruction's operands are uniform after vectorization,
+ // the instruction cannot be scalarized. This prevents, for example, a
+ // masked load from being scalarized.
+ //
+ // We assume we will only emit a value for lane zero of an instruction
+ // marked uniform after vectorization, rather than VF identical values.
+ // Thus, if we scalarize an instruction that uses a uniform, we would
+ // create uses of values corresponding to the lanes we aren't emitting code
+ // for. This behavior can be changed by allowing getScalarValue to clone
+ // the lane zero values for uniforms rather than asserting.
+ for (Use &U : I->operands())
+ if (auto *J = dyn_cast<Instruction>(U.get()))
+ if (Legal->isUniformAfterVectorization(J))
+ return false;
+
+ // Otherwise, we can scalarize the instruction.
+ return true;
+ };
+
+ // Returns true if an operand that cannot be scalarized must be extracted
+ // from a vector. We will account for this scalarization overhead below. Note
+ // that the non-void predicated instructions are placed in their own blocks,
+ // and their return values are inserted into vectors. Thus, an extract would
+ // still be required.
+ auto needsExtract = [&](Instruction *I) -> bool {
+ return TheLoop->contains(I) && !Legal->isScalarAfterVectorization(I);
+ };
+
+ // Compute the expected cost discount from scalarizing the entire expression
+ // feeding the predicated instruction. We currently only consider expressions
+ // that are single-use instruction chains.
+ Worklist.push_back(PredInst);
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.pop_back_val();
+
+ // If we've already analyzed the instruction, there's nothing to do.
+ if (ScalarCosts.count(I))
+ continue;
+
+ // Compute the cost of the vector instruction. Note that this cost already
+ // includes the scalarization overhead of the predicated instruction.
+ unsigned VectorCost = getInstructionCost(I, VF).first;
+
+ // Compute the cost of the scalarized instruction. This cost is the cost of
+ // the instruction as if it wasn't if-converted and instead remained in the
+ // predicated block. We will scale this cost by block probability after
+ // computing the scalarization overhead.
+ unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
+
+ // Compute the scalarization overhead of needed insertelement instructions
+ // and phi nodes.
+ if (Legal->isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
+ ScalarCost += getScalarizationOverhead(ToVectorTy(I->getType(), VF), true,
+ false, TTI);
+ ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
+ }
+
+ // Compute the scalarization overhead of needed extractelement
+ // instructions. For each of the instruction's operands, if the operand can
+ // be scalarized, add it to the worklist; otherwise, account for the
+ // overhead.
+ for (Use &U : I->operands())
+ if (auto *J = dyn_cast<Instruction>(U.get())) {
+ assert(VectorType::isValidElementType(J->getType()) &&
+ "Instruction has non-scalar type");
+ if (canBeScalarized(J))
+ Worklist.push_back(J);
+ else if (needsExtract(J))
+ ScalarCost += getScalarizationOverhead(ToVectorTy(J->getType(), VF),
+ false, true, TTI);
+ }
+
+ // Scale the total scalar cost by block probability.
+ ScalarCost /= getReciprocalPredBlockProb();
+
+ // Compute the discount. A non-negative discount means the vector version
+ // of the instruction costs more, and scalarizing would be beneficial.
+ Discount += VectorCost - ScalarCost;
+ ScalarCosts[I] = ScalarCost;
+ }
+
+ return Discount;
+}
+
LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::expectedCost(unsigned VF) {
VectorizationCostTy Cost;
+ // Collect the instructions (and their associated costs) that will be more
+ // profitable to scalarize.
+ collectInstsToScalarize(VF);
+
// For each block.
for (BasicBlock *BB : TheLoop->blocks()) {
VectorizationCostTy BlockCost;
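A worked instance of the discount bookkeeping in computePredInstDiscount, under assumed costs (the numbers below are illustrative, not real TTI data): with VF = 4 and a predicated block taken with probability 1/2 (so getReciprocalPredBlockProb() would return 2), a chain member with vector cost 12 and per-lane scalar cost 2 gives ScalarCost = 4 * 2 / 2 = 4 and contributes 12 - 4 = 8 to the discount, so keeping the chain scalar is profitable.

    #include <cstdio>

    // Minimal sketch of the per-instruction discount accumulation, with the
    // cost-model queries replaced by fixed numbers (assumptions, not TTI data).
    int main() {
      const unsigned VF = 4;
      const unsigned ReciprocalPredBlockProb = 2; // block taken ~50% of the time

      unsigned VectorCost = 12;              // assumed cost of the vector form
      unsigned ScalarCost = VF * 2;          // assumed per-lane scalar cost of 2
      ScalarCost /= ReciprocalPredBlockProb; // scale by block probability

      int Discount = int(VectorCost) - int(ScalarCost);
      std::printf("discount = %d (non-negative => prefer scalarizing)\n", Discount);
      return 0;
    }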
@@ -5802,11 +6783,14 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) {
<< VF << " For instruction: " << I << '\n');
}
- // We assume that if-converted blocks have a 50% chance of being executed.
- // When the code is scalar then some of the blocks are avoided due to CF.
- // When the code is vectorized we execute all code paths.
+ // If we are vectorizing a predicated block, it will have been
+ // if-converted. This means that the block's instructions (aside from
+ // stores and instructions that may divide by zero) will now be
+ // unconditionally executed. For the scalar case, we may not always execute
+ // the predicated block. Thus, scale the block's cost by the probability of
+ // executing it.
if (VF == 1 && Legal->blockNeedsPredication(BB))
- BlockCost.first /= 2;
+ BlockCost.first /= getReciprocalPredBlockProb();
Cost.first += BlockCost.first;
Cost.second |= BlockCost.second;
@@ -5815,35 +6799,19 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) {
return Cost;
}
-/// \brief Check if the load/store instruction \p I may be translated into
-/// gather/scatter during vectorization.
-///
-/// Pointer \p Ptr specifies address in memory for the given scalar memory
-/// instruction. We need it to retrieve data type.
-/// Using gather/scatter is possible when it is supported by target.
-static bool isGatherOrScatterLegal(Instruction *I, Value *Ptr,
- LoopVectorizationLegality *Legal) {
- auto *DataTy = cast<PointerType>(Ptr->getType())->getElementType();
- return (isa<LoadInst>(I) && Legal->isLegalMaskedGather(DataTy)) ||
- (isa<StoreInst>(I) && Legal->isLegalMaskedScatter(DataTy));
-}
-
-/// \brief Check whether the address computation for a non-consecutive memory
-/// access looks like an unlikely candidate for being merged into the indexing
-/// mode.
+/// \brief Gets Address Access SCEV after verifying that the access pattern
+/// is loop invariant except the induction variable dependence.
///
-/// We look for a GEP which has one index that is an induction variable and all
-/// other indices are loop invariant. If the stride of this access is also
-/// within a small bound we decide that this address computation can likely be
-/// merged into the addressing mode.
-/// In all other cases, we identify the address computation as complex.
-static bool isLikelyComplexAddressComputation(Value *Ptr,
- LoopVectorizationLegality *Legal,
- ScalarEvolution *SE,
- const Loop *TheLoop) {
+/// This SCEV can be sent to the Target in order to estimate the address
+/// calculation cost.
+static const SCEV *getAddressAccessSCEV(
+ Value *Ptr,
+ LoopVectorizationLegality *Legal,
+ ScalarEvolution *SE,
+ const Loop *TheLoop) {
auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
if (!Gep)
- return true;
+ return nullptr;
// We are looking for a gep with all loop invariant indices except for one
// which should be an induction variable.
@@ -5852,33 +6820,11 @@ static bool isLikelyComplexAddressComputation(Value *Ptr,
Value *Opd = Gep->getOperand(i);
if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
!Legal->isInductionVariable(Opd))
- return true;
+ return nullptr;
}
- // Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step
- // can likely be merged into the address computation.
- unsigned MaxMergeDistance = 64;
-
- const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr));
- if (!AddRec)
- return true;
-
- // Check the step is constant.
- const SCEV *Step = AddRec->getStepRecurrence(*SE);
- // Calculate the pointer stride and check if it is consecutive.
- const auto *C = dyn_cast<SCEVConstant>(Step);
- if (!C)
- return true;
-
- const APInt &APStepVal = C->getAPInt();
-
- // Huge step value - give up.
- if (APStepVal.getBitWidth() > 64)
- return true;
-
- int64_t StepVal = APStepVal.getSExtValue();
-
- return StepVal > MaxMergeDistance;
+ // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
+ return SE->getSCEV(Ptr);
}
static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
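The pattern getAddressAccessSCEV accepts corresponds to an address whose GEP indices are loop invariant except for the induction variable, as in the assumed source loop below; anything else returns nullptr and the target gets no SCEV for the address computation.

    // Column walk of a row selected outside the loop: the GEP for M[row][i]
    // has one induction index (i) and otherwise loop-invariant operands, so
    // its SCEV can be handed to the target for address-cost estimation.
    long sumRow(long (*M)[64], int row, int n) {
      long s = 0;
      for (int i = 0; i < n; ++i)
        s += M[row][i];
      return s;
    }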
@@ -5893,6 +6839,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
if (Legal->isUniformAfterVectorization(I))
VF = 1;
+ if (VF > 1 && isProfitableToScalarize(I, VF))
+ return VectorizationCostTy(InstsToScalarize[VF][I], false);
+
Type *VectorTy;
unsigned C = getInstructionCost(I, VF, VectorTy);
@@ -5905,7 +6854,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
unsigned VF,
Type *&VectorTy) {
Type *RetTy = I->getType();
- if (VF > 1 && MinBWs.count(I))
+ if (canTruncateToMinimalBitwidth(I, VF))
RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
VectorTy = ToVectorTy(RetTy, VF);
auto SE = PSE.getSE();
@@ -5932,17 +6881,42 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// TODO: IF-converted IFs become selects.
return 0;
}
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ // If we have a predicated instruction, it may not be executed for each
+ // vector lane. Get the scalarization cost and scale this amount by the
+ // probability of executing the predicated block. If the instruction is not
+ // predicated, we fall through to the next case.
+ if (VF > 1 && Legal->isScalarWithPredication(I)) {
+ unsigned Cost = 0;
+
+ // These instructions have a non-void type, so account for the phi nodes
+ // that we will create. This cost is likely to be zero. The phi node
+ // cost, if any, should be scaled by the block probability because it
+ // models a copy at the end of each predicated block.
+ Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
+
+ // The cost of the non-predicated instruction.
+ Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
+
+ // The cost of insertelement and extractelement instructions needed for
+ // scalarization.
+ Cost += getScalarizationOverhead(I, VF, TTI);
+
+ // Scale the cost by the probability of executing the predicated blocks.
+ // This assumes the predicated block for each vector lane is equally
+ // likely.
+ return Cost / getReciprocalPredBlockProb();
+ }
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
@@ -5965,7 +6939,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
TargetTransformInfo::OP_None;
Value *Op2 = I->getOperand(1);
- // Check for a splat of a constant or for a non uniform vector of constants.
+ // Check for a splat or for a non-uniform vector of constants.
if (isa<ConstantInt>(Op2)) {
ConstantInt *CInt = cast<ConstantInt>(Op2);
if (CInt && CInt->getValue().isPowerOf2())
@@ -5980,10 +6954,12 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
Op2VP = TargetTransformInfo::OP_PowerOf2;
Op2VK = TargetTransformInfo::OK_UniformConstantValue;
}
+ } else if (Legal->isUniform(Op2)) {
+ Op2VK = TargetTransformInfo::OK_UniformValue;
}
-
- return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK,
- Op1VP, Op2VP);
+ SmallVector<const Value *, 4> Operands(I->operand_values());
+ return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK,
+ Op2VK, Op1VP, Op2VP, Operands);
}
case Instruction::Select: {
SelectInst *SI = cast<SelectInst>(I);
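As a numeric illustration of the predicated div/rem costing added above (assumed TTI numbers, not real target data): with VF = 4, a PHI cost of 0, an arithmetic cost of 20 per scalar udiv, a scalarization overhead of 6, and a block probability of 1/2, the result is (4*0 + 4*20 + 6) / 2 = 43, versus 4*20 = 80 for the unpredicated scalarized form.

    // Sketch of the predicated-division cost formula with assumed constants.
    unsigned predicatedDivCost(unsigned VF) {
      const unsigned PhiCost = 0, DivCost = 20, ScalarizationOverhead = 6;
      const unsigned ReciprocalPredBlockProb = 2;
      unsigned Cost = VF * PhiCost + VF * DivCost + ScalarizationOverhead;
      return Cost / ReciprocalPredBlockProb; // e.g. VF = 4 -> 43
    }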
@@ -5999,9 +6975,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
case Instruction::FCmp: {
Type *ValTy = I->getOperand(0)->getType();
Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
- auto It = MinBWs.find(Op0AsInstruction);
- if (VF > 1 && It != MinBWs.end())
- ValTy = IntegerType::get(ValTy->getContext(), It->second);
+ if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
+ ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
VectorTy = ToVectorTy(ValTy, VF);
return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
}
@@ -6015,7 +6990,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
unsigned AS =
SI ? SI->getPointerAddressSpace() : LI->getPointerAddressSpace();
- Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand();
+ Value *Ptr = getPointerOperand(I);
// We add the cost of address computation here instead of with the gep
// instruction because only here we know whether the operation is
// scalarized.
@@ -6072,41 +7047,43 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
return Cost;
}
- // Scalarized loads/stores.
- int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
- bool UseGatherOrScatter =
- (ConsecutiveStride == 0) && isGatherOrScatterLegal(I, Ptr, Legal);
-
- bool Reverse = ConsecutiveStride < 0;
- const DataLayout &DL = I->getModule()->getDataLayout();
- uint64_t ScalarAllocatedSize = DL.getTypeAllocSize(ValTy);
- uint64_t VectorElementSize = DL.getTypeStoreSize(VectorTy) / VF;
- if ((!ConsecutiveStride && !UseGatherOrScatter) ||
- ScalarAllocatedSize != VectorElementSize) {
- bool IsComplexComputation =
- isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
+ // Check if the memory instruction will be scalarized.
+ if (Legal->memoryInstructionMustBeScalarized(I, VF)) {
unsigned Cost = 0;
- // The cost of extracting from the value vector and pointer vector.
Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
- for (unsigned i = 0; i < VF; ++i) {
- // The cost of extracting the pointer operand.
- Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i);
- // In case of STORE, the cost of ExtractElement from the vector.
- // In case of LOAD, the cost of InsertElement into the returned
- // vector.
- Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement
- : Instruction::InsertElement,
- VectorTy, i);
- }
- // The cost of the scalar loads/stores.
- Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);
+ // Figure out whether the access is strided and get the stride value
+ // if it's known at compile time.
+ const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop);
+
+ // Get the cost of the scalar memory instruction and address computation.
+ Cost += VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
Cost += VF *
TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
Alignment, AS);
+
+ // Get the overhead of the extractelement and insertelement instructions
+ // we might create due to scalarization.
+ Cost += getScalarizationOverhead(I, VF, TTI);
+
+ // If we have a predicated store, it may not be executed for each vector
+ // lane. Scale the cost by the probability of executing the predicated
+ // block.
+ if (Legal->isScalarWithPredication(I))
+ Cost /= getReciprocalPredBlockProb();
+
return Cost;
}
+ // Determine if the pointer operand of the access is either consecutive or
+ // reverse consecutive.
+ int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
+ bool Reverse = ConsecutiveStride < 0;
+
+ // Determine if either a gather or scatter operation is legal.
+ bool UseGatherOrScatter =
+ !ConsecutiveStride && Legal->isLegalGatherOrScatter(I);
+
unsigned Cost = TTI.getAddressComputationCost(VectorTy);
if (UseGatherOrScatter) {
assert(ConsecutiveStride == 0 &&
@@ -6147,7 +7124,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
Type *SrcScalarTy = I->getOperand(0)->getType();
Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF);
- if (VF > 1 && MinBWs.count(I)) {
+ if (canTruncateToMinimalBitwidth(I, VF)) {
// This cast is going to be shrunk. This may remove the cast or it might
// turn it into slightly different cast. For example, if MinBW == 16,
// "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
@@ -6176,28 +7153,11 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
return CallCost;
}
- default: {
- // We are scalarizing the instruction. Return the cost of the scalar
- // instruction, plus the cost of insert and extract into vector
- // elements, times the vector width.
- unsigned Cost = 0;
-
- if (!RetTy->isVoidTy() && VF != 1) {
- unsigned InsCost =
- TTI.getVectorInstrCost(Instruction::InsertElement, VectorTy);
- unsigned ExtCost =
- TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy);
-
- // The cost of inserting the results plus extracting each one of the
- // operands.
- Cost += VF * (InsCost + ExtCost * I->getNumOperands());
- }
-
+ default:
// The cost of executing VF copies of the scalar instruction. This opcode
// is unknown. Assume that it is the same as 'mul'.
- Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
- return Cost;
- }
+ return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
+ getScalarizationOverhead(I, VF, TTI);
} // end of switch.
}
@@ -6217,6 +7177,7 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
namespace llvm {
@@ -6226,14 +7187,11 @@ Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
}
bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
- // Check for a store.
- if (auto *ST = dyn_cast<StoreInst>(Inst))
- return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0;
-
- // Check for a load.
- if (auto *LI = dyn_cast<LoadInst>(Inst))
- return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0;
+ // Check if the pointer operand of a load or store instruction is
+ // consecutive.
+ if (auto *Ptr = getPointerOperand(Inst))
+ return Legal->isConsecutivePtr(Ptr);
return false;
}
@@ -6249,123 +7207,46 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}
- // Ignore induction phis that are only used in either GetElementPtr or ICmp
- // instruction to exit loop. Induction variables usually have large types and
- // can have big impact when estimating register usage.
- // This is for when VF > 1.
- for (auto &Induction : *Legal->getInductionVars()) {
- auto *PN = Induction.first;
- auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch());
-
- // Check that the PHI is only used by the induction increment (UpdateV) or
- // by GEPs. Then check that UpdateV is only used by a compare instruction,
- // the loop header PHI, or by GEPs.
- // FIXME: Need precise def-use analysis to determine if this instruction
- // variable will be vectorized.
- if (all_of(PN->users(),
- [&](const User *U) -> bool {
- return U == UpdateV || isa<GetElementPtrInst>(U);
- }) &&
- all_of(UpdateV->users(), [&](const User *U) -> bool {
- return U == PN || isa<ICmpInst>(U) || isa<GetElementPtrInst>(U);
- })) {
- VecValuesToIgnore.insert(PN);
- VecValuesToIgnore.insert(UpdateV);
- }
- }
-
- // Ignore instructions that will not be vectorized.
- // This is for when VF > 1.
- for (BasicBlock *BB : TheLoop->blocks()) {
- for (auto &Inst : *BB) {
- switch (Inst.getOpcode())
- case Instruction::GetElementPtr: {
- // Ignore GEP if its last operand is an induction variable so that it is
- // a consecutive load/store and won't be vectorized as scatter/gather
- // pattern.
-
- GetElementPtrInst *Gep = cast<GetElementPtrInst>(&Inst);
- unsigned NumOperands = Gep->getNumOperands();
- unsigned InductionOperand = getGEPInductionOperand(Gep);
- bool GepToIgnore = true;
-
- // Check that all of the gep indices are uniform except for the
- // induction operand.
- for (unsigned i = 0; i != NumOperands; ++i) {
- if (i != InductionOperand &&
- !PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)),
- TheLoop)) {
- GepToIgnore = false;
- break;
- }
- }
-
- if (GepToIgnore)
- VecValuesToIgnore.insert(&Inst);
- break;
- }
- }
- }
+ // Insert values known to be scalar into VecValuesToIgnore. This is a
+ // conservative estimation of the values that will later be scalarized.
+ //
+ // FIXME: Even though an instruction is not scalar-after-vectorization, it may
+ // still be scalarized. For example, we may find an instruction to be
+ // more profitable for a given vectorization factor if it were to be
+ // scalarized. But at this point, we haven't yet computed the
+ // vectorization factor.
+ for (auto *BB : TheLoop->getBlocks())
+ for (auto &I : *BB)
+ if (Legal->isScalarAfterVectorization(&I))
+ VecValuesToIgnore.insert(&I);
}
void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
- bool IfPredicateStore) {
+ bool IfPredicateInstr) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
// Holds vector parameters or scalars, in case of uniform vals.
SmallVector<VectorParts, 4> Params;
setDebugLocFromInst(Builder, Instr);
- // Find all of the vectorized parameters.
- for (Value *SrcOp : Instr->operands()) {
- // If we are accessing the old induction variable, use the new one.
- if (SrcOp == OldInduction) {
- Params.push_back(getVectorValue(SrcOp));
- continue;
- }
-
- // Try using previously calculated values.
- Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
-
- // If the src is an instruction that appeared earlier in the basic block
- // then it should already be vectorized.
- if (SrcInst && OrigLoop->contains(SrcInst)) {
- assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
- // The parameter is a vector value from earlier.
- Params.push_back(WidenMap.get(SrcInst));
- } else {
- // The parameter is a scalar from outside the loop. Maybe even a constant.
- VectorParts Scalars;
- Scalars.append(UF, SrcOp);
- Params.push_back(Scalars);
- }
- }
-
- assert(Params.size() == Instr->getNumOperands() &&
- "Invalid number of operands");
-
// Does this instruction return a value ?
bool IsVoidRetTy = Instr->getType()->isVoidTy();
- Value *UndefVec = IsVoidRetTy ? nullptr : UndefValue::get(Instr->getType());
- // Create a new entry in the WidenMap and initialize it to Undef or Null.
- VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
+ // Initialize a new scalar map entry.
+ ScalarParts Entry(UF);
VectorParts Cond;
- if (IfPredicateStore) {
- assert(Instr->getParent()->getSinglePredecessor() &&
- "Only support single predecessor blocks");
- Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
- Instr->getParent());
- }
+ if (IfPredicateInstr)
+ Cond = createBlockInMask(Instr->getParent());
// For each vector unroll 'part':
for (unsigned Part = 0; Part < UF; ++Part) {
+ Entry[Part].resize(1);
// For each scalar that we create:
// Start an "if (pred) a[i] = ..." block.
Value *Cmp = nullptr;
- if (IfPredicateStore) {
+ if (IfPredicateInstr) {
if (Cond[Part]->getType()->isVectorTy())
Cond[Part] =
Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
@@ -6376,47 +7257,57 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
Instruction *Cloned = Instr->clone();
if (!IsVoidRetTy)
Cloned->setName(Instr->getName() + ".cloned");
- // Replace the operands of the cloned instructions with extracted scalars.
+
+ // Replace the operands of the cloned instructions with their scalar
+ // equivalents in the new loop.
for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
- Value *Op = Params[op][Part];
- Cloned->setOperand(op, Op);
+ auto *NewOp = getScalarValue(Instr->getOperand(op), Part, 0);
+ Cloned->setOperand(op, NewOp);
}
// Place the cloned scalar in the new loop.
Builder.Insert(Cloned);
+ // Add the cloned scalar to the scalar map entry.
+ Entry[Part][0] = Cloned;
+
// If we just cloned a new assumption, add it the assumption cache.
if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
if (II->getIntrinsicID() == Intrinsic::assume)
AC->registerAssumption(II);
- // If the original scalar returns a value we need to place it in a vector
- // so that future users will be able to use it.
- if (!IsVoidRetTy)
- VecResults[Part] = Cloned;
-
// End if-block.
- if (IfPredicateStore)
- PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned), Cmp));
+ if (IfPredicateInstr)
+ PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp));
}
+ VectorLoopValueMap.initScalar(Instr, Entry);
}
void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
auto *SI = dyn_cast<StoreInst>(Instr);
- bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent()));
+ bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent()));
- return scalarizeInstruction(Instr, IfPredicateStore);
+ return scalarizeInstruction(Instr, IfPredicateInstr);
}
Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
-Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step) {
+Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
+ Instruction::BinaryOps BinOp) {
// When unrolling and the VF is 1, we only need to add a simple scalar.
- Type *ITy = Val->getType();
- assert(!ITy->isVectorTy() && "Val must be a scalar");
- Constant *C = ConstantInt::get(ITy, StartIdx);
+ Type *Ty = Val->getType();
+ assert(!Ty->isVectorTy() && "Val must be a scalar");
+
+ if (Ty->isFloatingPointTy()) {
+ Constant *C = ConstantFP::get(Ty, (double)StartIdx);
+
+ // Floating point operations had to be 'fast' to enable the unrolling.
+ Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
+ return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
+ }
+ Constant *C = ConstantInt::get(Ty, StartIdx);
return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
}
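For the unroller (VF = 1), the step vector degenerates to one scalar per unroll part; the sketch below shows the values it produces, covering both the integer path and the new floating-point path (the induction binop is assumed here to be an fadd).

    #include <cstdio>

    // Part 'StartIdx' of an unrolled induction: Val + StartIdx * Step for
    // integers, Val fop (StartIdx * Step) for fast-math FP inductions.
    long stepInt(long Val, int StartIdx, long Step) {
      return Val + StartIdx * Step;
    }

    double stepFP(double Val, int StartIdx, double Step) {
      return Val + double(StartIdx) * Step; // BinOp assumed to be fadd
    }

    int main() {
      // UF = 2: part 0 keeps the original value, part 1 advances by one step.
      std::printf("%ld %ld\n", stepInt(10, 0, 3), stepInt(10, 1, 3));   // 10 13
      std::printf("%g %g\n", stepFP(1.5, 0, 0.5), stepFP(1.5, 1, 0.5)); // 1.5 2
      return 0;
    }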
@@ -6465,7 +7356,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
<< L->getHeader()->getParent()->getName() << "\" from "
<< DebugLocStr << "\n");
- LoopVectorizeHints Hints(L, DisableUnrolling);
+ LoopVectorizeHints Hints(L, DisableUnrolling, *ORE);
DEBUG(dbgs() << "LV: Loop hints:"
<< " force="
@@ -6483,8 +7374,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Looking at the diagnostic output is the only way to determine if a loop
// was vectorized (other than looking at the IR or machine code), so it
// is important to generate an optimization remark for each loop. Most of
- // these messages are generated by emitOptimizationRemarkAnalysis. Remarks
- // generated by emitOptimizationRemark and emitOptimizationRemarkMissed are
+ // these messages are generated as OptimizationRemarkAnalysis. Remarks
+ // generated as OptimizationRemark and OptimizationRemarkMissed are
// less verbose reporting vectorized loops and unvectorized loops that may
// benefit from vectorization, respectively.
@@ -6495,17 +7386,18 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Check the loop for a trip count threshold:
// do not vectorize loops with a tiny trip count.
- const unsigned TC = SE->getSmallConstantTripCount(L);
- if (TC > 0u && TC < TinyTripCountVectorThreshold) {
+ const unsigned MaxTC = SE->getSmallConstantMaxTripCount(L);
+ if (MaxTC > 0u && MaxTC < TinyTripCountVectorThreshold) {
DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
<< "This loop is not worth vectorizing.");
if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
else {
DEBUG(dbgs() << "\n");
- emitAnalysisDiag(F, L, Hints, VectorizationReport()
- << "vectorization is not beneficial "
- "and is not explicitly forced");
+ ORE->emit(createMissedAnalysis(Hints.vectorizeAnalysisPassName(),
+ "NotBeneficial", L)
+ << "vectorization is not beneficial "
+ "and is not explicitly forced");
return false;
}
}
@@ -6513,17 +7405,17 @@ bool LoopVectorizePass::processLoop(Loop *L) {
PredicatedScalarEvolution PSE(*SE, *L);
// Check if it is legal to vectorize the loop.
- LoopVectorizationRequirements Requirements;
- LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI,
+ LoopVectorizationRequirements Requirements(*ORE);
+ LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI, ORE,
&Requirements, &Hints);
if (!LVL.canVectorize()) {
DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
- emitMissedWarning(F, L, Hints);
+ emitMissedWarning(F, L, Hints, ORE);
return false;
}
// Use the cost model.
- LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, F,
+ LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
&Hints);
CM.collectValuesToIgnore();
@@ -6551,11 +7443,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
"attribute is used.\n");
- emitAnalysisDiag(
- F, L, Hints,
- VectorizationReport()
- << "loop not vectorized due to NoImplicitFloat attribute");
- emitMissedWarning(F, L, Hints);
+ ORE->emit(createMissedAnalysis(Hints.vectorizeAnalysisPassName(),
+ "NoImplicitFloat", L)
+ << "loop not vectorized due to NoImplicitFloat attribute");
+ emitMissedWarning(F, L, Hints, ORE);
return false;
}
@@ -6566,10 +7457,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (Hints.isPotentiallyUnsafe() &&
TTI->isFPVectorizationPotentiallyUnsafe()) {
DEBUG(dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n");
- emitAnalysisDiag(F, L, Hints,
- VectorizationReport()
- << "loop not vectorized due to unsafe FP support.");
- emitMissedWarning(F, L, Hints);
+ ORE->emit(
+ createMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L)
+ << "loop not vectorized due to unsafe FP support.");
+ emitMissedWarning(F, L, Hints, ORE);
return false;
}
@@ -6584,38 +7475,43 @@ bool LoopVectorizePass::processLoop(Loop *L) {
unsigned UserIC = Hints.getInterleave();
// Identify the diagnostic messages that should be produced.
- std::string VecDiagMsg, IntDiagMsg;
+ std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
bool VectorizeLoop = true, InterleaveLoop = true;
-
if (Requirements.doesNotMeet(F, L, Hints)) {
DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
"requirements.\n");
- emitMissedWarning(F, L, Hints);
+ emitMissedWarning(F, L, Hints, ORE);
return false;
}
if (VF.Width == 1) {
DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
- VecDiagMsg =
- "the cost-model indicates that vectorization is not beneficial";
+ VecDiagMsg = std::make_pair(
+ "VectorizationNotBeneficial",
+ "the cost-model indicates that vectorization is not beneficial");
VectorizeLoop = false;
}
if (IC == 1 && UserIC <= 1) {
// Tell the user interleaving is not beneficial.
DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
- IntDiagMsg =
- "the cost-model indicates that interleaving is not beneficial";
+ IntDiagMsg = std::make_pair(
+ "InterleavingNotBeneficial",
+ "the cost-model indicates that interleaving is not beneficial");
InterleaveLoop = false;
- if (UserIC == 1)
- IntDiagMsg +=
+ if (UserIC == 1) {
+ IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
+ IntDiagMsg.second +=
" and is explicitly disabled or interleave count is set to 1";
+ }
} else if (IC > 1 && UserIC == 1) {
// Tell the user interleaving is beneficial, but it is explicitly disabled.
DEBUG(dbgs()
<< "LV: Interleaving is beneficial but is explicitly disabled.");
- IntDiagMsg = "the cost-model indicates that interleaving is beneficial "
- "but is explicitly disabled or interleave count is set to 1";
+ IntDiagMsg = std::make_pair(
+ "InterleavingBeneficialButDisabled",
+ "the cost-model indicates that interleaving is beneficial "
+ "but is explicitly disabled or interleave count is set to 1");
InterleaveLoop = false;
}
@@ -6626,40 +7522,48 @@ bool LoopVectorizePass::processLoop(Loop *L) {
const char *VAPassName = Hints.vectorizeAnalysisPassName();
if (!VectorizeLoop && !InterleaveLoop) {
// Do not vectorize or interleaving the loop.
- emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F,
- L->getStartLoc(), VecDiagMsg);
- emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F,
- L->getStartLoc(), IntDiagMsg);
+ ORE->emit(OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
+ L->getStartLoc(), L->getHeader())
+ << VecDiagMsg.second);
+ ORE->emit(OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
+ L->getStartLoc(), L->getHeader())
+ << IntDiagMsg.second);
return false;
} else if (!VectorizeLoop && InterleaveLoop) {
DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
- emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F,
- L->getStartLoc(), VecDiagMsg);
+ ORE->emit(OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
+ L->getStartLoc(), L->getHeader())
+ << VecDiagMsg.second);
} else if (VectorizeLoop && !InterleaveLoop) {
DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
<< DebugLocStr << '\n');
- emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F,
- L->getStartLoc(), IntDiagMsg);
+ ORE->emit(OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
+ L->getStartLoc(), L->getHeader())
+ << IntDiagMsg.second);
} else if (VectorizeLoop && InterleaveLoop) {
DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
<< DebugLocStr << '\n');
DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
}
+ using namespace ore;
if (!VectorizeLoop) {
assert(IC > 1 && "interleave count should not be 1 or 0");
// If we decided that it is not legal to vectorize the loop, then
// interleave it.
- InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, IC);
- Unroller.vectorize(&LVL, CM.MinBWs, CM.VecValuesToIgnore);
-
- emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(),
- Twine("interleaved loop (interleaved count: ") +
- Twine(IC) + ")");
+ InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
+ &CM);
+ Unroller.vectorize();
+
+ ORE->emit(OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
+ L->getHeader())
+ << "interleaved loop (interleaved count: "
+ << NV("InterleaveCount", IC) << ")");
} else {
// If we decided that it is *legal* to vectorize the loop, then do it.
- InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, VF.Width, IC);
- LB.vectorize(&LVL, CM.MinBWs, CM.VecValuesToIgnore);
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
+ &LVL, &CM);
+ LB.vectorize();
++LoopsVectorized;
// Add metadata to disable runtime unrolling a scalar loop when there are
@@ -6669,10 +7573,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
AddRuntimeUnrollDisableMetaData(L);
// Report the vectorization decision.
- emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(),
- Twine("vectorized loop (vectorization width: ") +
- Twine(VF.Width) + ", interleaved count: " +
- Twine(IC) + ")");
+ ORE->emit(OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
+ L->getHeader())
+ << "vectorized loop (vectorization width: "
+ << NV("VectorizationFactor", VF.Width)
+ << ", interleaved count: " << NV("InterleaveCount", IC) << ")");
}
// Mark the loop as already vectorized to avoid vectorizing again.
@@ -6686,7 +7591,8 @@ bool LoopVectorizePass::runImpl(
Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
- std::function<const LoopAccessInfo &(Loop &)> &GetLAA_) {
+ std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
+ OptimizationRemarkEmitter &ORE_) {
SE = &SE_;
LI = &LI_;
@@ -6698,6 +7604,7 @@ bool LoopVectorizePass::runImpl(
AC = &AC_;
GetLAA = &GetLAA_;
DB = &DB_;
+ ORE = &ORE_;
// Compute some weights outside of the loop over the loops. Compute this
// using a BranchProbability to re-use its scaling math.
@@ -6742,17 +7649,20 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
auto &TTI = AM.getResult<TargetIRAnalysis>(F);
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
- auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &AA = AM.getResult<AAManager>(F);
auto &AC = AM.getResult<AssumptionAnalysis>(F);
auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
std::function<const LoopAccessInfo &(Loop &)> GetLAA =
[&](Loop &L) -> const LoopAccessInfo & {
- return LAM.getResult<LoopAccessAnalysis>(L);
+ LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI};
+ return LAM.getResult<LoopAccessAnalysis>(L, AR);
};
- bool Changed = runImpl(F, SE, LI, TTI, DT, BFI, TLI, DB, AA, AC, GetLAA);
+ bool Changed =
+ runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE);
if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA;
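
Under the new pass manager, the run() method above pulls its function-level analyses from the analysis manager and hands per-loop results to runImpl() through a std::function callback, so the implementation does not need to know which manager owns or caches them. A standalone sketch of that callback shape, with hypothetical Loop and LoopAccessInfo stand-ins:

// Hypothetical sketch: the consumer only sees a callback that returns a
// per-loop analysis result; the caching strategy stays on the caller's side.
#include <functional>
#include <iostream>
#include <map>
#include <string>
#include <vector>

struct LoopAccessInfo { int NumLoads; };
struct Loop { std::string Name; };

static int countLoads(const std::vector<Loop> &Loops,
                      std::function<const LoopAccessInfo &(const Loop &)> GetLAA) {
  int Total = 0;
  for (const Loop &L : Loops)
    Total += GetLAA(L).NumLoads;
  return Total;
}

int main() {
  std::map<std::string, LoopAccessInfo> Cache{{"outer", {3}}, {"inner", {5}}};
  auto GetLAA = [&](const Loop &L) -> const LoopAccessInfo & {
    return Cache[L.Name];  // compute-or-fetch, as an analysis manager would
  };
  std::vector<Loop> Loops = {{"outer"}, {"inner"}};
  std::cout << countLoads(Loops, GetLAA) << '\n';  // 8
  return 0;
}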
diff --git a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index fb94f79..328f270 100644
--- a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -115,22 +115,22 @@ static bool isValidElementType(Type *Ty) {
!Ty->isPPC_FP128Ty();
}
-/// \returns the parent basic block if all of the instructions in \p VL
-/// are in the same block or null otherwise.
-static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
+/// \returns true if all of the instructions in \p VL are in the same block or
+/// false otherwise.
+static bool allSameBlock(ArrayRef<Value *> VL) {
Instruction *I0 = dyn_cast<Instruction>(VL[0]);
if (!I0)
- return nullptr;
+ return false;
BasicBlock *BB = I0->getParent();
for (int i = 1, e = VL.size(); i < e; i++) {
Instruction *I = dyn_cast<Instruction>(VL[i]);
if (!I)
- return nullptr;
+ return false;
if (BB != I->getParent())
- return nullptr;
+ return false;
}
- return BB;
+ return true;
}
/// \returns True if all of the values in \p VL are constants.
@@ -211,12 +211,12 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) {
/// of each scalar operation (VL) that will be converted into a vector (I).
/// Flag set: NSW, NUW, exact, and all of fast-math.
static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
- if (auto *VecOp = dyn_cast<BinaryOperator>(I)) {
- if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) {
+ if (auto *VecOp = dyn_cast<Instruction>(I)) {
+ if (auto *Intersection = dyn_cast<Instruction>(VL[0])) {
// Intersection is initialized to the 0th scalar,
// so start counting from index '1'.
for (int i = 1, e = VL.size(); i < e; ++i) {
- if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i]))
+ if (auto *Scalar = dyn_cast<Instruction>(VL[i]))
Intersection->andIRFlags(Scalar);
}
VecOp->copyIRFlags(Intersection);
@@ -224,15 +224,15 @@ static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
}
}
-/// \returns The type that all of the values in \p VL have or null if there
-/// are different types.
-static Type* getSameType(ArrayRef<Value *> VL) {
+/// \returns true if all of the values in \p VL have the same type or false
+/// otherwise.
+static bool allSameType(ArrayRef<Value *> VL) {
Type *Ty = VL[0]->getType();
for (int i = 1, e = VL.size(); i < e; i++)
if (VL[i]->getType() != Ty)
- return nullptr;
+ return false;
- return Ty;
+ return true;
}
/// \returns True if Extract{Value,Element} instruction extracts element Idx.
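
The two hunks above (together with the later is_contained() changes in this file) replace helpers that returned the shared block or type, or null, with plain boolean predicates. A small standard-library-only sketch of the same shape, using a hypothetical Item type:

// Hypothetical sketch of the bool-predicate shape used by allSameBlock and
// allSameType, plus a rough stand-in for llvm::is_contained.
#include <algorithm>
#include <iostream>
#include <iterator>
#include <vector>

struct Item { int Parent; };

// True if every item shares the first item's parent, false otherwise.
static bool allSameParent(const std::vector<Item> &VL) {
  if (VL.empty())
    return false;
  return std::all_of(VL.begin(), VL.end(),
                     [&](const Item &I) { return I.Parent == VL[0].Parent; });
}

// Membership test over any range, similar in spirit to llvm::is_contained.
template <typename R, typename T>
static bool isContained(const R &Range, const T &Value) {
  return std::find(std::begin(Range), std::end(Range), Value) != std::end(Range);
}

int main() {
  std::vector<Item> VL = {{1}, {1}, {1}};
  std::vector<int> Ignore = {4, 8, 15};
  std::cout << allSameParent(VL) << ' ' << isContained(Ignore, 8) << '\n';  // 1 1
  return 0;
}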
@@ -393,6 +393,10 @@ public:
/// \returns number of elements in vector if isomorphism exists, 0 otherwise.
unsigned canMapToVector(Type *T, const DataLayout &DL) const;
+ /// \returns True if the VectorizableTree is both tiny and not fully
+ /// vectorizable. We do not vectorize such trees.
+ bool isTreeTinyAndNotFullyVectorizable();
+
private:
struct TreeEntry;
@@ -883,10 +887,10 @@ private:
/// List of users to ignore during scheduling and that don't need extracting.
ArrayRef<Value *> UserIgnoreList;
- // Number of load-bundles, which contain consecutive loads.
+ // Number of load bundles that contain consecutive loads.
int NumLoadsWantToKeepOrder;
- // Number of load-bundles of size 2, which are consecutive loads if reversed.
+ // Number of load bundles that contain consecutive loads in reversed order.
int NumLoadsWantToChangeOrder;
// Analysis and block reference.
@@ -906,8 +910,11 @@ private:
IRBuilder<> Builder;
/// A map of scalar integer values to the smallest bit width with which they
- /// can legally be represented.
- MapVector<Value *, uint64_t> MinBWs;
+ /// can legally be represented. The values map to (width, signed) pairs,
+ /// where "width" indicates the minimum bit width and "signed" is True if the
+ /// value must be sign-extended, rather than zero-extended, back to its
+ /// original width.
+ MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
};
} // end namespace llvm
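
MinBWs now records a (width, signed) pair per scalar instead of a bare width, so later code (the extend lambda further down in this diff) can choose between sign- and zero-extension when widening a narrowed value back to its original type. A standalone sketch of that lookup under hypothetical names:

// Hypothetical stand-in for MinBWs: value name -> (min width, needs sign-extend).
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <utility>

using MinWidthMap = std::map<std::string, std::pair<uint64_t, bool>>;

// Decide how a narrowed value is widened back to its original width.
static std::string extendKind(const MinWidthMap &MinBWs, const std::string &V) {
  auto It = MinBWs.find(V);
  if (It == MinBWs.end())
    return "none";                     // value was never narrowed
  return It->second.second ? "sext" : "zext";
}

int main() {
  MinWidthMap MinBWs;
  MinBWs["%index"] = {8, /*IsSigned=*/false};  // known non-negative -> zext
  MinBWs["%delta"] = {9, /*IsSigned=*/true};   // sign bit unknown   -> sext
  std::cout << extendKind(MinBWs, "%index") << '\n'
            << extendKind(MinBWs, "%delta") << '\n'
            << extendKind(MinBWs, "%other") << '\n';
  return 0;
}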
@@ -917,7 +924,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
ArrayRef<Value *> UserIgnoreLst) {
deleteTree();
UserIgnoreList = UserIgnoreLst;
- if (!getSameType(Roots))
+ if (!allSameType(Roots))
return;
buildTree_rec(Roots, 0);
@@ -958,8 +965,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
}
// Ignore users in the user ignore list.
- if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) !=
- UserIgnoreList.end())
+ if (is_contained(UserIgnoreList, UserInst))
continue;
DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
@@ -972,9 +978,8 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
- bool SameTy = allConstant(VL) || getSameType(VL); (void)SameTy;
bool isAltShuffle = false;
- assert(SameTy && "Invalid types!");
+ assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
if (Depth == RecursionMaxDepth) {
DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
@@ -1007,7 +1012,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
}
// If all of the operands are identical or constant we have a simple solution.
- if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
+ if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) {
DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
newTreeEntry(VL, false);
return;
@@ -1159,7 +1164,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
return;
}
- // Check if the loads are consecutive or of we need to swizzle them.
+
+ // Make sure all loads in the bundle are simple - we can't vectorize
+ // atomic or volatile loads.
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
LoadInst *L = cast<LoadInst>(VL[i]);
if (!L->isSimple()) {
@@ -1168,20 +1175,47 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
return;
}
+ }
+ // Check if the loads are consecutive, reversed, or neither.
+ // TODO: What we really want is to sort the loads, but for now, check
+ // the two likely directions.
+ bool Consecutive = true;
+ bool ReverseConsecutive = true;
+ for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
- if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], *DL, *SE)) {
- ++NumLoadsWantToChangeOrder;
- }
- BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
- DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
- return;
+ Consecutive = false;
+ break;
+ } else {
+ ReverseConsecutive = false;
}
}
- ++NumLoadsWantToKeepOrder;
- newTreeEntry(VL, true);
- DEBUG(dbgs() << "SLP: added a vector of loads.\n");
+
+ if (Consecutive) {
+ ++NumLoadsWantToKeepOrder;
+ newTreeEntry(VL, true);
+ DEBUG(dbgs() << "SLP: added a vector of loads.\n");
+ return;
+ }
+
+ // If none of the load pairs were consecutive when checked in order,
+ // check the reverse order.
+ if (ReverseConsecutive)
+ for (unsigned i = VL.size() - 1; i > 0; --i)
+ if (!isConsecutiveAccess(VL[i], VL[i - 1], *DL, *SE)) {
+ ReverseConsecutive = false;
+ break;
+ }
+
+ BS.cancelScheduling(VL);
+ newTreeEntry(VL, false);
+
+ if (ReverseConsecutive) {
+ ++NumLoadsWantToChangeOrder;
+ DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
+ } else {
+ DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
+ }
return;
}
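
The rewritten load case above first checks whether every adjacent pair of loads is consecutive in program order and, only if no forward pair matched, whether the whole bundle is consecutive when walked in reverse. A standalone sketch of the same two-direction classification over plain byte addresses (hypothetical types, not the LLVM helpers):

#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical stand-in: each "load" is a byte address; consecutive means the
// next address is exactly one element size away.
static bool isConsecutive(uint64_t A, uint64_t B, uint64_t ElemSize) {
  return B == A + ElemSize;
}

enum class LoadOrder { Consecutive, ReverseConsecutive, Neither };

static LoadOrder classify(const std::vector<uint64_t> &Addrs, uint64_t ElemSize) {
  if (Addrs.size() < 2)
    return LoadOrder::Consecutive;
  bool Consecutive = true, ReverseConsecutive = true;
  for (size_t I = 0, E = Addrs.size() - 1; I < E; ++I) {
    if (!isConsecutive(Addrs[I], Addrs[I + 1], ElemSize)) {
      Consecutive = false;
      break;
    }
    ReverseConsecutive = false;  // a forward-consecutive pair rules out reverse
  }
  if (Consecutive)
    return LoadOrder::Consecutive;
  // No forward pair matched; check the reverse direction pair by pair.
  if (ReverseConsecutive)
    for (size_t I = Addrs.size() - 1; I > 0; --I)
      if (!isConsecutive(Addrs[I], Addrs[I - 1], ElemSize)) {
        ReverseConsecutive = false;
        break;
      }
  return ReverseConsecutive ? LoadOrder::ReverseConsecutive : LoadOrder::Neither;
}

int main() {
  std::vector<uint64_t> Rev = {24, 16, 8, 0};
  std::cout << (classify(Rev, 8) == LoadOrder::ReverseConsecutive) << '\n';  // 1
  return 0;
}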
case Instruction::ZExt:
@@ -1541,8 +1575,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
// If we have computed a smaller type for the expression, update VecTy so
// that the costs will be accurate.
if (MinBWs.count(VL[0]))
- VecTy = VectorType::get(IntegerType::get(F->getContext(), MinBWs[VL[0]]),
- VL.size());
+ VecTy = VectorType::get(
+ IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
if (E->NeedToGather) {
if (allConstant(VL))
@@ -1553,7 +1587,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
return getGatherCost(E->Scalars);
}
unsigned Opcode = getSameOpcode(VL);
- assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
+ assert(Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
Instruction *VL0 = cast<Instruction>(VL[0]);
switch (Opcode) {
case Instruction::PHI: {
@@ -1762,7 +1796,10 @@ bool BoUpSLP::isFullyVectorizableTinyTree() {
DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
VectorizableTree.size() << " is fully vectorizable .\n");
- // We only handle trees of height 2.
+ // We only handle trees of heights 1 and 2.
+ if (VectorizableTree.size() == 1 && !VectorizableTree[0].NeedToGather)
+ return true;
+
if (VectorizableTree.size() != 2)
return false;
@@ -1779,6 +1816,27 @@ bool BoUpSLP::isFullyVectorizableTinyTree() {
return true;
}
+bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() {
+
+ // We can vectorize the tree if its size is greater than or equal to the
+ // minimum size specified by the MinTreeSize command line option.
+ if (VectorizableTree.size() >= MinTreeSize)
+ return false;
+
+ // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
+ // can vectorize it if we can prove it fully vectorizable.
+ if (isFullyVectorizableTinyTree())
+ return false;
+
+ assert(VectorizableTree.empty()
+ ? ExternalUses.empty()
+ : true && "We shouldn't have any external users");
+
+ // Otherwise, we can't vectorize the tree. It is both tiny and not fully
+ // vectorizable.
+ return true;
+}
+
int BoUpSLP::getSpillCost() {
// Walk from the bottom of the tree to the top, tracking which values are
// live. When we see a call instruction that is not part of our tree,
@@ -1816,9 +1874,9 @@ int BoUpSLP::getSpillCost() {
);
// Now find the sequence of instructions between PrevInst and Inst.
- BasicBlock::reverse_iterator InstIt(Inst->getIterator()),
- PrevInstIt(PrevInst->getIterator());
- --PrevInstIt;
+ BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
+ PrevInstIt =
+ PrevInst->getIterator().getReverse();
while (InstIt != PrevInstIt) {
if (PrevInstIt == PrevInst->getParent()->rend()) {
PrevInstIt = Inst->getParent()->rbegin();
@@ -1846,14 +1904,6 @@ int BoUpSLP::getTreeCost() {
DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
VectorizableTree.size() << ".\n");
- // We only vectorize tiny trees if it is fully vectorizable.
- if (VectorizableTree.size() < MinTreeSize && !isFullyVectorizableTinyTree()) {
- if (VectorizableTree.empty()) {
- assert(!ExternalUses.size() && "We should not have any external users");
- }
- return INT_MAX;
- }
-
unsigned BundleWidth = VectorizableTree[0].Scalars.size();
for (TreeEntry &TE : VectorizableTree) {
@@ -1882,10 +1932,12 @@ int BoUpSLP::getTreeCost() {
auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
auto *ScalarRoot = VectorizableTree[0].Scalars[0];
if (MinBWs.count(ScalarRoot)) {
- auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot]);
+ auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
+ auto Extend =
+ MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
VecTy = VectorType::get(MinTy, BundleWidth);
- ExtractCost += TTI->getExtractWithExtendCost(
- Instruction::SExt, EU.Scalar->getType(), VecTy, EU.Lane);
+ ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
+ VecTy, EU.Lane);
} else {
ExtractCost +=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
@@ -2182,7 +2234,7 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
// Set the insertion point after the last instruction in the bundle. Set the
// debug location to Front.
- Builder.SetInsertPoint(BB, next(BasicBlock::iterator(LastInst)));
+ Builder.SetInsertPoint(BB, ++LastInst->getIterator());
Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
@@ -2383,6 +2435,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
V = Builder.CreateICmp(P0, L, R);
E->VectorizedValue = V;
+ propagateIRFlags(E->VectorizedValue, E->Scalars);
++NumVectorInstructions;
return V;
}
@@ -2440,10 +2493,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *LHS = vectorizeTree(LHSVL);
Value *RHS = vectorizeTree(RHSVL);
- if (LHS == RHS && isa<Instruction>(LHS)) {
- assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order");
- }
-
if (Value *V = alreadyVectorized(E->Scalars))
return V;
@@ -2593,6 +2642,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
E->VectorizedValue = V;
+ propagateIRFlags(E->VectorizedValue, E->Scalars);
++NumVectorInstructions;
return V;
}
@@ -2669,7 +2719,7 @@ Value *BoUpSLP::vectorizeTree() {
if (auto *I = dyn_cast<Instruction>(VectorRoot))
Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
auto BundleWidth = VectorizableTree[0].Scalars.size();
- auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot]);
+ auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
auto *VecTy = VectorType::get(MinTy, BundleWidth);
auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
VectorizableTree[0].VectorizedValue = Trunc;
@@ -2677,6 +2727,16 @@ Value *BoUpSLP::vectorizeTree() {
DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
+ // If necessary, sign-extend or zero-extend ScalarRoot to the larger type
+ // specified by ScalarType.
+ auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) {
+ if (!MinBWs.count(ScalarRoot))
+ return Ex;
+ if (MinBWs[ScalarRoot].second)
+ return Builder.CreateSExt(Ex, ScalarType);
+ return Builder.CreateZExt(Ex, ScalarType);
+ };
+
// Extract all of the elements with the external uses.
for (const auto &ExternalUse : ExternalUses) {
Value *Scalar = ExternalUse.Scalar;
@@ -2684,8 +2744,7 @@ Value *BoUpSLP::vectorizeTree() {
// Skip users that we already RAUW. This happens when one instruction
// has multiple uses of the same value.
- if (std::find(Scalar->user_begin(), Scalar->user_end(), User) ==
- Scalar->user_end())
+ if (!is_contained(Scalar->users(), User))
continue;
assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
@@ -2712,8 +2771,7 @@ Value *BoUpSLP::vectorizeTree() {
Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
}
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
- if (MinBWs.count(ScalarRoot))
- Ex = Builder.CreateSExt(Ex, Scalar->getType());
+ Ex = extend(ScalarRoot, Ex, Scalar->getType());
CSEBlocks.insert(PH->getIncomingBlock(i));
PH->setOperand(i, Ex);
}
@@ -2721,16 +2779,14 @@ Value *BoUpSLP::vectorizeTree() {
} else {
Builder.SetInsertPoint(cast<Instruction>(User));
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
- if (MinBWs.count(ScalarRoot))
- Ex = Builder.CreateSExt(Ex, Scalar->getType());
+ Ex = extend(ScalarRoot, Ex, Scalar->getType());
CSEBlocks.insert(cast<Instruction>(User)->getParent());
User->replaceUsesOfWith(Scalar, Ex);
}
} else {
Builder.SetInsertPoint(&F->getEntryBlock().front());
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
- if (MinBWs.count(ScalarRoot))
- Ex = Builder.CreateSExt(Ex, Scalar->getType());
+ Ex = extend(ScalarRoot, Ex, Scalar->getType());
CSEBlocks.insert(&F->getEntryBlock());
User->replaceUsesOfWith(Scalar, Ex);
}
@@ -2759,8 +2815,7 @@ Value *BoUpSLP::vectorizeTree() {
assert((ScalarToTreeEntry.count(U) ||
// It is legal to replace users in the ignorelist by undef.
- (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) !=
- UserIgnoreList.end())) &&
+ is_contained(UserIgnoreList, U)) &&
"Replacing out-of-tree value with undef");
}
#endif
@@ -2853,7 +2908,7 @@ void BoUpSLP::optimizeGatherSequence() {
}
}
if (In) {
- assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end());
+ assert(!is_contained(Visited, In));
Visited.push_back(In);
}
}
@@ -2994,9 +3049,10 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
}
// Search up and down at the same time, because we don't know if the new
// instruction is above or below the existing scheduling region.
- BasicBlock::reverse_iterator UpIter(ScheduleStart->getIterator());
+ BasicBlock::reverse_iterator UpIter =
+ ++ScheduleStart->getIterator().getReverse();
BasicBlock::reverse_iterator UpperEnd = BB->rend();
- BasicBlock::iterator DownIter(ScheduleEnd);
+ BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
BasicBlock::iterator LowerEnd = BB->end();
for (;;) {
if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
@@ -3451,6 +3507,11 @@ void BoUpSLP::computeMinimumValueSizes() {
Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
}
+ // True if the roots can be zero-extended back to their original type, rather
+ // than sign-extended. We know that if the leading bits are not demanded, we
+ // can safely zero-extend. So we initialize IsKnownPositive to True.
+ bool IsKnownPositive = true;
+
// If all the bits of the roots are demanded, we can try a little harder to
// compute a narrower type. This can happen, for example, if the roots are
// getelementptr indices. InstCombine promotes these indices to the pointer
@@ -3462,11 +3523,41 @@ void BoUpSLP::computeMinimumValueSizes() {
// compute the number of high-order bits we can truncate.
if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType())) {
MaxBitWidth = 8u;
+
+ // Determine if the sign bit of all the roots is known to be zero. If not,
+ // IsKnownPositive is set to False.
+ IsKnownPositive = all_of(TreeRoot, [&](Value *R) {
+ bool KnownZero = false;
+ bool KnownOne = false;
+ ComputeSignBit(R, KnownZero, KnownOne, *DL);
+ return KnownZero;
+ });
+
+ // Determine the maximum number of bits required to store the scalar
+ // values.
for (auto *Scalar : ToDemote) {
auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, 0, DT);
auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
}
+
+ // If we can't prove that the sign bit is zero, we must add one to the
+ // maximum bit width to account for the unknown sign bit. This preserves
+ // the existing sign bit so we can safely sign-extend the root back to the
+ // original type. Otherwise, if we know the sign bit is zero, we will
+ // zero-extend the root instead.
+ //
+ // FIXME: This is somewhat suboptimal, as there will be cases where adding
+ // one to the maximum bit width will yield a larger-than-necessary
+ // type. In general, we need to add an extra bit only if we can't
+ // prove that the upper bit of the original type is equal to the
+ // upper bit of the proposed smaller type. If these two bits are the
+ // same (either zero or one) we know that sign-extending from the
+ // smaller type will result in the same value. Here, since we can't
+ // yet prove this, we are just making the proposed smaller type
+ // larger to ensure correctness.
+ if (!IsKnownPositive)
+ ++MaxBitWidth;
}
// Round MaxBitWidth up to the next power-of-two.
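
The logic above starts the candidate width at 8 bits, grows it to cover the demanded bits of each scalar (type width minus provable sign bits), adds one bit when the sign bit is not known to be zero, and finally rounds up to a power of two. A simplified standalone sketch, with the sign-bit counts supplied directly instead of via ComputeNumSignBits():

#include <algorithm>
#include <iostream>
#include <vector>

// One scalar in the tree: its original type width and how many leading sign
// bits analysis could prove (hypothetical stand-in for ComputeNumSignBits).
struct ScalarInfo {
  unsigned TypeBits;
  unsigned NumSignBits;
};

static unsigned roundUpToPowerOfTwo(unsigned V) {
  unsigned P = 1;
  while (P < V)
    P *= 2;
  return P;
}

static unsigned minimumBitWidth(const std::vector<ScalarInfo> &Scalars,
                                bool IsKnownPositive) {
  unsigned MaxBitWidth = 8;
  for (const ScalarInfo &S : Scalars)
    MaxBitWidth = std::max(S.TypeBits - S.NumSignBits, MaxBitWidth);
  // If the sign bit is not known to be zero, reserve one extra bit so the
  // value can be sign-extended back to its original type.
  if (!IsKnownPositive)
    ++MaxBitWidth;
  return roundUpToPowerOfTwo(MaxBitWidth);
}

int main() {
  // e.g. 64-bit scalars with 52 and 55 provable sign bits -> 12 demanded bits.
  std::vector<ScalarInfo> Scalars = {{64, 52}, {64, 55}};
  std::cout << minimumBitWidth(Scalars, /*IsKnownPositive=*/false) << '\n';  // 16
  return 0;
}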
@@ -3486,7 +3577,7 @@ void BoUpSLP::computeMinimumValueSizes() {
// Finally, map the values we can demote to the maximum bit with we computed.
for (auto *Scalar : ToDemote)
- MinBWs[Scalar] = MaxBitWidth;
+ MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
}
namespace {
@@ -3642,8 +3733,7 @@ static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
return !std::equal(VL.begin(), VL.end(), VH.begin());
}
-bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain,
- int CostThreshold, BoUpSLP &R,
+bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
unsigned VecRegSize) {
unsigned ChainLen = Chain.size();
DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
@@ -3672,12 +3762,15 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain,
ArrayRef<Value *> Operands = Chain.slice(i, VF);
R.buildTree(Operands);
+ if (R.isTreeTinyAndNotFullyVectorizable())
+ continue;
+
R.computeMinimumValueSizes();
int Cost = R.getTreeCost();
DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
- if (Cost < CostThreshold) {
+ if (Cost < -SLPCostThreshold) {
DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
R.vectorizeTree();
@@ -3691,7 +3784,7 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain,
}
bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
- int costThreshold, BoUpSLP &R) {
+ BoUpSLP &R) {
SetVector<StoreInst *> Heads, Tails;
SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
@@ -3746,8 +3839,9 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
// FIXME: Is division-by-2 the correct step? Should we assert that the
// register size is a power-of-2?
- for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize(); Size /= 2) {
- if (vectorizeStoreChain(Operands, costThreshold, R, Size)) {
+ for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize();
+ Size /= 2) {
+ if (vectorizeStoreChain(Operands, R, Size)) {
// Mark the vectorized stores so that we don't vectorize them again.
VectorizedStores.insert(Operands.begin(), Operands.end());
Changed = true;
@@ -3805,11 +3899,12 @@ bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
ArrayRef<Value *> BuildVector,
- bool allowReorder) {
+ bool AllowReorder) {
if (VL.size() < 2)
return false;
- DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n");
+ DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " << VL.size()
+ << ".\n");
// Check that all of the parts are scalar instructions of the same type.
Instruction *I0 = dyn_cast<Instruction>(VL[0]);
@@ -3818,10 +3913,11 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
unsigned Opcode0 = I0->getOpcode();
- // FIXME: Register size should be a parameter to this function, so we can
- // try different vectorization factors.
unsigned Sz = R.getVectorElementSize(I0);
- unsigned VF = R.getMinVecRegSize() / Sz;
+ unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
+ unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
+ if (MaxVF < 2)
+ return false;
for (Value *V : VL) {
Type *Ty = V->getType();
@@ -3837,70 +3933,89 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
// Keep track of values that were deleted by vectorizing in the loop below.
SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
- for (unsigned i = 0, e = VL.size(); i < e; ++i) {
- unsigned OpsWidth = 0;
+ unsigned NextInst = 0, MaxInst = VL.size();
+ for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
+ VF /= 2) {
+ // No actual vectorization should happen, if number of parts is the same as
+ // provided vectorization factor (i.e. the scalar type is used for vector
+ // code during codegen).
+ auto *VecTy = VectorType::get(VL[0]->getType(), VF);
+ if (TTI->getNumberOfParts(VecTy) == VF)
+ continue;
+ for (unsigned I = NextInst; I < MaxInst; ++I) {
+ unsigned OpsWidth = 0;
- if (i + VF > e)
- OpsWidth = e - i;
- else
- OpsWidth = VF;
+ if (I + VF > MaxInst)
+ OpsWidth = MaxInst - I;
+ else
+ OpsWidth = VF;
- if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
- break;
+ if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
+ break;
- // Check that a previous iteration of this loop did not delete the Value.
- if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
- continue;
+ // Check that a previous iteration of this loop did not delete the Value.
+ if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth))
+ continue;
- DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
- << "\n");
- ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
-
- ArrayRef<Value *> BuildVectorSlice;
- if (!BuildVector.empty())
- BuildVectorSlice = BuildVector.slice(i, OpsWidth);
-
- R.buildTree(Ops, BuildVectorSlice);
- // TODO: check if we can allow reordering also for other cases than
- // tryToVectorizePair()
- if (allowReorder && R.shouldReorder()) {
- assert(Ops.size() == 2);
- assert(BuildVectorSlice.empty());
- Value *ReorderedOps[] = { Ops[1], Ops[0] };
- R.buildTree(ReorderedOps, None);
- }
- R.computeMinimumValueSizes();
- int Cost = R.getTreeCost();
+ DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
+ << "\n");
+ ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
+
+ ArrayRef<Value *> BuildVectorSlice;
+ if (!BuildVector.empty())
+ BuildVectorSlice = BuildVector.slice(I, OpsWidth);
+
+ R.buildTree(Ops, BuildVectorSlice);
+ // TODO: check if we can allow reordering for more cases.
+ if (AllowReorder && R.shouldReorder()) {
+ // Conceptually, there is nothing actually preventing us from trying to
+ // reorder a larger list. In fact, we do exactly this when vectorizing
+ // reductions. However, at this point, we only expect to get here from
+ // tryToVectorizePair().
+ assert(Ops.size() == 2);
+ assert(BuildVectorSlice.empty());
+ Value *ReorderedOps[] = {Ops[1], Ops[0]};
+ R.buildTree(ReorderedOps, None);
+ }
+ if (R.isTreeTinyAndNotFullyVectorizable())
+ continue;
- if (Cost < -SLPCostThreshold) {
- DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
- Value *VectorizedRoot = R.vectorizeTree();
-
- // Reconstruct the build vector by extracting the vectorized root. This
- // way we handle the case where some elements of the vector are undefined.
- // (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2))
- if (!BuildVectorSlice.empty()) {
- // The insert point is the last build vector instruction. The vectorized
- // root will precede it. This guarantees that we get an instruction. The
- // vectorized tree could have been constant folded.
- Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
- unsigned VecIdx = 0;
- for (auto &V : BuildVectorSlice) {
- IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
- ++BasicBlock::iterator(InsertAfter));
- Instruction *I = cast<Instruction>(V);
- assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
- Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
- VectorizedRoot, Builder.getInt32(VecIdx++)));
- I->setOperand(1, Extract);
- I->removeFromParent();
- I->insertAfter(Extract);
- InsertAfter = I;
+ R.computeMinimumValueSizes();
+ int Cost = R.getTreeCost();
+
+ if (Cost < -SLPCostThreshold) {
+ DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
+ Value *VectorizedRoot = R.vectorizeTree();
+
+ // Reconstruct the build vector by extracting the vectorized root. This
+ // way we handle the case where some elements of the vector are
+ // undefined.
+ // (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2))
+ if (!BuildVectorSlice.empty()) {
+ // The insert point is the last build vector instruction. The
+ // vectorized root will precede it. This guarantees that we get an
+ // instruction. The vectorized tree could have been constant folded.
+ Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
+ unsigned VecIdx = 0;
+ for (auto &V : BuildVectorSlice) {
+ IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
+ ++BasicBlock::iterator(InsertAfter));
+ Instruction *I = cast<Instruction>(V);
+ assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
+ Instruction *Extract =
+ cast<Instruction>(Builder.CreateExtractElement(
+ VectorizedRoot, Builder.getInt32(VecIdx++)));
+ I->setOperand(1, Extract);
+ I->removeFromParent();
+ I->insertAfter(Extract);
+ InsertAfter = I;
+ }
}
+ // Move to the next bundle.
+ I += VF - 1;
+ NextInst = I + 1;
+ Changed = true;
}
- // Move to the next bundle.
- i += VF - 1;
- Changed = true;
}
}
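
tryToVectorizeList() now walks the vectorization factor from PowerOf2Floor(VL.size()) down to max(2, MinVecRegSize / element size), halving each time and slicing the list into power-of-two bundles of that width. A standalone sketch of just that iteration (hypothetical names; it pretends every bundle is profitable):

#include <algorithm>
#include <iostream>

static unsigned powerOf2Floor(unsigned V) {
  unsigned P = 1;
  while (P * 2 <= V)
    P *= 2;
  return P;
}

// Enumerate the bundles tried for a list of N values, walking the
// vectorization factor from MaxVF down to MinVF and halving each step.
static void enumerateBundles(unsigned N, unsigned MinVF) {
  unsigned MaxVF = std::max(powerOf2Floor(N), MinVF);
  unsigned NextInst = 0, MaxInst = N;
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned OpsWidth = (I + VF > MaxInst) ? MaxInst - I : VF;
      if (OpsWidth < 2 || (OpsWidth & (OpsWidth - 1)) != 0)
        break;  // not a usable power-of-two bundle at this VF
      std::cout << "try VF=" << VF << " slice [" << I << ", " << I + OpsWidth
                << ")\n";
      // Pretend the bundle was vectorized: skip past it.
      I += VF - 1;
      NextInst = I + 1;
    }
  }
}

int main() {
  enumerateBundles(/*N=*/7, /*MinVF=*/2);
  return 0;
}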
@@ -3973,7 +4088,7 @@ static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
return ConstantVector::get(ShuffleMask);
}
-
+namespace {
/// Model horizontal reductions.
///
/// A horizontal reduction is a tree of reduction operations (currently add and
@@ -4006,7 +4121,14 @@ class HorizontalReduction {
SmallVector<Value *, 32> ReducedVals;
BinaryOperator *ReductionRoot;
- PHINode *ReductionPHI;
+ // After a successful horizontal reduction vectorization attempt for a PHI
+ // node, the vectorizer tries to update the root binary op by combining the
+ // vectorized tree and the ReductionPHI node. But during vectorization this
+ // ReductionPHI can itself be vectorized and replaced by an undef value,
+ // while the instruction itself is marked for deletion. This marked-for-deletion
+ // PHI node can then be used in the new binary operation, causing a "Use still
+ // stuck around after Def is destroyed" crash upon PHI node deletion.
+ WeakVH ReductionPHI;
/// The opcode of the reduction.
unsigned ReductionOpcode;
@@ -4025,14 +4147,13 @@ public:
unsigned MinVecRegSize;
HorizontalReduction(unsigned MinVecRegSize)
- : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
- ReducedValueOpcode(0), IsPairwiseReduction(false), ReduxWidth(0),
+ : ReductionRoot(nullptr), ReductionOpcode(0), ReducedValueOpcode(0),
+ IsPairwiseReduction(false), ReduxWidth(0),
MinVecRegSize(MinVecRegSize) {}
/// \brief Try to find a reduction tree.
bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
- assert((!Phi ||
- std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
+ assert((!Phi || is_contained(Phi->operands(), B)) &&
"Thi phi needs to use the binary operator");
    // We could have an initial reduction that is not an add.
@@ -4113,12 +4234,21 @@ public:
// Visit left or right.
Value *NextV = TreeN->getOperand(EdgeToVist);
- // We currently only allow BinaryOperator's and SelectInst's as reduction
- // values in our tree.
- if (isa<BinaryOperator>(NextV) || isa<SelectInst>(NextV))
- Stack.push_back(std::make_pair(cast<Instruction>(NextV), 0));
- else if (NextV != Phi)
+ if (NextV != Phi) {
+ auto *I = dyn_cast<Instruction>(NextV);
+ // Continue analysis if the next operand is a reduction operation or
+ // (possibly) a reduced value. If the reduced value opcode is not set,
+ // the first met operation != reduction operation is considered as the
+ // reduced value class.
+ if (I && (!ReducedValueOpcode || I->getOpcode() == ReducedValueOpcode ||
+ I->getOpcode() == ReductionOpcode)) {
+ if (!ReducedValueOpcode && I->getOpcode() != ReductionOpcode)
+ ReducedValueOpcode = I->getOpcode();
+ Stack.push_back(std::make_pair(I, 0));
+ continue;
+ }
return false;
+ }
}
return true;
}
@@ -4141,7 +4271,15 @@ public:
unsigned i = 0;
for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
- V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
+ auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
+ V.buildTree(VL, ReductionOps);
+ if (V.shouldReorder()) {
+ SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
+ V.buildTree(Reversed, ReductionOps);
+ }
+ if (V.isTreeTinyAndNotFullyVectorizable())
+ continue;
+
V.computeMinimumValueSizes();
// Estimate cost.
@@ -4175,7 +4313,7 @@ public:
ReducedVals[i]);
}
// Update users.
- if (ReductionPHI) {
+ if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) {
assert(ReductionRoot && "Need a reduction operation");
ReductionRoot->setOperand(0, VectorizedTree);
ReductionRoot->setOperand(1, ReductionPHI);
@@ -4202,7 +4340,8 @@ private:
int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
int ScalarReduxCost =
- ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);
+ (ReduxWidth - 1) *
+ TTI->getArithmeticInstrCost(ReductionOpcode, ScalarTy);
DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
<< " for reduction that starts with " << *FirstReducedVal
@@ -4254,6 +4393,7 @@ private:
return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
}
};
+} // end anonymous namespace
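
For reference, a splitting horizontal reduction halves the vector at each step and adds the upper half onto the lower half, which is also why the scalar baseline above is costed as (ReduxWidth - 1) operations: reducing N values needs N - 1 adds. A standalone sketch of the pairwise scheme on plain integers:

#include <cassert>
#include <iostream>
#include <vector>

// Standalone sketch of a splitting (log2-depth) horizontal add: each step adds
// the upper half onto the lower half, mirroring the shuffle-based reduction
// emitted by the vectorizer.
static int horizontalAdd(std::vector<int> V) {
  assert(!V.empty() && (V.size() & (V.size() - 1)) == 0 &&
         "expects a power-of-two number of elements");
  for (size_t Width = V.size() / 2; Width >= 1; Width /= 2)
    for (size_t I = 0; I < Width; ++I)
      V[I] += V[I + Width];  // fold the upper half onto the lower half
  return V[0];
}

int main() {
  std::cout << horizontalAdd({1, 2, 3, 4, 5, 6, 7, 8}) << '\n';  // 36
  return 0;
}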
/// \brief Recognize construction of vectors like
/// %ra = insertelement <4 x float> undef, float %s0, i32 0
@@ -4354,7 +4494,7 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
return nullptr;
// There is a loop latch, return the incoming value if it comes from
- // that. This reduction pattern occassionaly turns up.
+ // that. This reduction pattern occasionally turns up.
if (P->getIncomingBlock(0) == BBLatch) {
Rdx = P->getIncomingValue(0);
} else if (P->getIncomingBlock(1) == BBLatch) {
@@ -4510,8 +4650,10 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
if (BinaryOperator *BinOp =
dyn_cast<BinaryOperator>(RI->getOperand(0))) {
DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
- if (tryToVectorizePair(BinOp->getOperand(0),
- BinOp->getOperand(1), R)) {
+ if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
+ R.getMinVecRegSize()) ||
+ tryToVectorizePair(BinOp->getOperand(0), BinOp->getOperand(1),
+ R)) {
Changed = true;
it = BB->begin();
e = BB->end();
@@ -4690,8 +4832,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
// may cause a significant compile-time increase.
for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
unsigned Len = std::min<unsigned>(CE - CI, 16);
- Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
- -SLPCostThreshold, R);
+ Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R);
}
}
return Changed;